dataanalysis-python/fetch_ind_eng_tests_bbb_2025.py at main · MNKCrypto/dataanalysis-python · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
# fetch_ind_eng_tests_bbb_2025.py
# Download ball-by-ball for India vs England Tests (2025) from Cricsheet JSON and save as a CSV.

import io
import os
import re
import zipfile
import json
from datetime import date, datetime
import requests
import pandas as pd
from tqdm import tqdm

CRICSHEET_TESTS_ZIP = "https://cricsheet.org/downloads/tests_json.zip"  # public dataset
OUT_CSV = "ind_eng_2025_tests_balls.csv"

# Date window for the 2025 England vs India Test series
START = date(2025, 6, 1)
END   = date(2025, 8, 31)

def within_series_window(info):
    """Cricsheet 'info.dates' is a list of ISO dates (strings)."""
    ds = info.get("dates") or []
    for d in ds:
        try:
            dt = datetime.fromisoformat(str(d)).date()
            if START <= dt <= END:
                return True
        except Exception:
            continue
    return False

def is_ind_eng_test(info):
    if (info.get("match_type") or info.get("match_type_number")) and (info.get("match_type") or "").lower() != "test":
        return False
    teams = set((info.get("teams") or []))
    return {"India", "England"}.issubset(teams)

def stream_zip_bytes(url):
    r = requests.get(url, stream=True, timeout=120)
    r.raise_for_status()
    return io.BytesIO(r.content)

def flatten_match(json_obj):
    """
    Cricsheet JSON (new format):
      {
        "meta": {...},
        "info": {...},
        "innings": [
          {"team": "England", "overs": [
               {"over": 0, "deliveries":[
                   {"ball": 0.1, "batter": "...", "bowler": "...", "non_striker": "...",
                    "runs": {"batter":0,"extras":0,"total":0}, "extras":{"legbyes":1}, "wickets":[...]},
                   ...
               ]},
               ...
          ]},
          ...
        ]
      }
    """
    info = json_obj.get("info", {})
    match_id = info.get("match_id") or info.get("event", {}).get("match_number")  # match_id is present in new JSON
    venue = info.get("venue")
    city = info.get("city")
    teams = info.get("teams")
    dates = info.get("dates")
    season = info.get("season")
    balls = []

    for inn_no, inn in enumerate(json_obj.get("innings", []), start=1):
        team = inn.get("team")
        for over_block in inn.get("overs", []):
            over_no = over_block.get("over")
            for d in over_block.get("deliveries", []):
                runs = d.get("runs", {})
                wicket_list = d.get("wickets", []) or []
                extras = d.get("extras", {}) or {}
                balls.append({
                    "match_id": match_id,
                    "season": season,
                    "date_list": ",".join(map(str, dates or [])),
                    "venue": venue,
                    "city": city,
                    "teams": ",".join(teams or []),

                    "innings": inn_no,
                    "batting_team": team,
                    "over": over_no,
                    "ball_label": d.get("ball"),  # string like "12.3" or float; keep as-is
                    "batter": d.get("batter"),
                    "bowler": d.get("bowler"),
                    "non_striker": d.get("non_striker"),

                    "runs_batter": runs.get("batter", 0),
                    "runs_extras": runs.get("extras", 0),
                    "runs_total": runs.get("total", 0),

                    # extras breakdown (may be absent)
                    "extra_byes": extras.get("byes", 0),
                    "extra_legbyes": extras.get("legbyes", 0),
                    "extra_wides": extras.get("wides", 0),
                    "extra_noballs": extras.get("noballs", 0),
                    "extra_penalty": extras.get("penalty", 0),

                    # wicket info (flatten first if multiple)
                    "wicket_kind": wicket_list[0].get("kind") if wicket_list else None,
                    "wicket_player_out": wicket_list[0].get("player_out") if wicket_list else None,
                    "wicket_fielders": ",".join([w.get("fielders", [{}])[0].get("name","")
                                                 for w in wicket_list if w.get("fielders")]) if wicket_list else None,
                })
    return balls

def main():
    print("Downloading Cricsheet Tests JSON zip ...")
    buf = stream_zip_bytes(CRICSHEET_TESTS_ZIP)

    rows = []
    with zipfile.ZipFile(buf) as zf:
        # Iterate all JSON files, filter to Ind v Eng within window
        json_files = [n for n in zf.namelist() if n.endswith(".json")]
        for name in tqdm(json_files, desc="Scanning matches"):
            with zf.open(name) as f:
                try:
                    obj = json.load(f)
                except Exception:
                    continue
            info = obj.get("info", {})
            if not is_ind_eng_test(info):
                continue
            if not within_series_window(info):
                continue
            rows.extend(flatten_match(obj))

    if not rows:
        raise SystemExit("No India vs England Test matches found in the specified window. "
                         "Try widening START/END or verify series dates.")

    df = pd.DataFrame(rows)
    # Optional tidy-ups
    # Ensure numeric over/ball columns
    if "over" in df.columns:
        df["over"] = pd.to_numeric(df["over"], errors="coerce").astype("Int64")
    if "runs_total" in df.columns:
        df["runs_total"] = pd.to_numeric(df["runs_total"], errors="coerce").fillna(0).astype(int)

    df.to_csv(OUT_CSV, index=False, encoding="utf-8")
    print(f"Saved → {OUT_CSV} with {len(df):,} balls")

if __name__ == "__main__":
    main()