Tighten faceoff decay notebook shot filtering (#56)
Conversation
There was a problem hiding this comment.
💡 Codex Review
Here are some automated review suggestions for this pull request.
Reviewed commit: 22e7ffab78
ℹ️ About Codex in GitHub
Your team has set up Codex to review pull requests in this repo. Reviews are triggered when you
- Open a pull request for review
- Mark a draft as ready
- Comment "@codex review".
If Codex has suggestions, it will comment; otherwise it will react with 👍.
Codex can also answer questions or update the PR. Try commenting "@codex address that feedback".
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": "cur = conn.cursor()\n\ncur.execute(\"SELECT COUNT(*) FROM shot_events\")\ntotal_shots = cur.fetchone()[0]\n\ncur.execute(\"SELECT COUNT(*) FROM shot_events WHERE seconds_since_faceoff IS NOT NULL\")\nwith_faceoff = cur.fetchone()[0]\n\ncur.execute(\"SELECT COUNT(*) FROM shot_events WHERE faceoff_zone_code IS NOT NULL\")\nwith_zone = cur.fetchone()[0]\n\ncoverage_pct = (with_faceoff / total_shots * 100) if total_shots > 0 else 0\nzone_pct = (with_zone / total_shots * 100) if total_shots > 0 else 0\n\nprint(f\"Total shot events: {total_shots:,}\")\nprint(f\"With seconds_since_faceoff: {with_faceoff:,} ({coverage_pct:.1f}%)\")\nprint(f\"With faceoff_zone_code: {with_zone:,} ({zone_pct:.1f}%)\")\nprint()\n\nZONE_CODES = [\"O\", \"D\", \"N\"]\nZONE_LABELS = {\"O\": \"Offensive zone\", \"D\": \"Defensive zone\", \"N\": \"Neutral zone\"}\n\ncur.execute(\"\"\"\n SELECT faceoff_zone_code,\n COUNT(*) AS shots,\n SUM(is_goal) AS goals,\n ROUND(CAST(SUM(is_goal) AS REAL) / COUNT(*), 4) AS goal_rate\n FROM shot_events\n WHERE seconds_since_faceoff IS NOT NULL\n AND faceoff_zone_code IS NOT NULL\n GROUP BY faceoff_zone_code\n ORDER BY faceoff_zone_code\n\"\"\")\nprint(f\"{'Zone':<20} {'Shots':>10} {'Goals':>8} {'Goal Rate':>10}\")\nprint(\"-\" * 52)\nfor row in cur.fetchall():\n label = ZONE_LABELS.get(row[0], row[0])\n print(f\"{label:<20} {row[1]:>10,} {row[2]:>8,} {row[3]:>10.4f}\")\n\nif total_shots == 0:\n print(\"\\n*** No shot events found. Run the scraper first. ***\")" | ||
| "source": "cur = conn.cursor()\n\nMIN_FACEOFF_SHOTS_PER_SEASON = 1000\nMIN_FACEOFF_COVERAGE_PCT = 50.0\nANALYSIS_SHOT_WHERE = \"(s.is_goal = 1 OR s.goalie_id IS NOT NULL)\"\n\n\ndef format_season_label(season_code):\n season_text = str(season_code)\n if len(season_text) == 8:\n return f\"{season_text[:4]}-{season_text[-2:]}\"\n return season_text\n\n\n\ndef season_params(*leading_params):\n return [*leading_params, *INCLUDED_SEASONS]\n\n\ncur.execute(\"SELECT COUNT(*) FROM shot_events\")\ntotal_shots = cur.fetchone()[0]\n\ncur.execute(f\"SELECT COUNT(*) FROM shot_events AS s WHERE {ANALYSIS_SHOT_WHERE}\")\nanalysis_shots = cur.fetchone()[0]\n\ncur.execute(f\"SELECT COUNT(*) FROM shot_events AS s WHERE {ANALYSIS_SHOT_WHERE} AND s.seconds_since_faceoff IS NOT NULL\")\nwith_faceoff = cur.fetchone()[0]\n\ncur.execute(f\"SELECT COUNT(*) FROM shot_events AS s WHERE {ANALYSIS_SHOT_WHERE} AND s.faceoff_zone_code IS NOT NULL\")\nwith_zone = cur.fetchone()[0]\n\ncoverage_pct = (with_faceoff / analysis_shots * 100) if analysis_shots > 0 else 0\nzone_pct = (with_zone / analysis_shots * 100) if analysis_shots > 0 else 0\nexcluded_shots = total_shots - analysis_shots\n\nprint(f\"Total shot events in table: {total_shots:,}\")\nprint(f\"Analysis-eligible shots: {analysis_shots:,}\")\nprint(f\"Excluded rows (blocked-shot proxy): {excluded_shots:,}\")\nprint(f\"With seconds_since_faceoff: {with_faceoff:,} ({coverage_pct:.1f}%)\")\nprint(f\"With faceoff_zone_code: {with_zone:,} ({zone_pct:.1f}%)\")\nprint()\n\nZONE_CODES = [\"O\", \"D\", \"N\"]\nZONE_LABELS = {\"O\": \"Offensive zone\", \"D\": \"Defensive zone\", \"N\": \"Neutral zone\"}\n\ncur.execute(f\"\"\"\n SELECT g.season,\n COUNT(*) AS total_shots,\n SUM(CASE WHEN s.seconds_since_faceoff IS NOT NULL THEN 1 ELSE 0 END) AS faceoff_shots,\n SUM(CASE WHEN s.faceoff_zone_code IS NOT NULL THEN 1 ELSE 0 END) AS zone_shots\n FROM shot_events AS s\n JOIN games AS g\n ON g.game_id = s.game_id\n WHERE {ANALYSIS_SHOT_WHERE}\n GROUP BY 
g.season\n ORDER BY g.season\n\"\"\")\nseason_coverage_rows = cur.fetchall()\n\nINCLUDED_SEASONS = []\nEXCLUDED_SEASONS = []\n\nprint(\"Season-level faceoff coverage:\")\nprint(f\"{'Season':<10} {'Total Shots':>12} {'With Faceoff':>13} {'Coverage %':>11} {'Status':>10}\")\nprint(\"-\" * 62)\nfor row in season_coverage_rows:\n faceoff_count = row[\"faceoff_shots\"] or 0\n season_total = row[\"total_shots\"] or 0\n season_coverage_pct = (faceoff_count / season_total * 100) if season_total > 0 else 0.0\n include_season = (\n faceoff_count >= MIN_FACEOFF_SHOTS_PER_SEASON\n and season_coverage_pct >= MIN_FACEOFF_COVERAGE_PCT\n )\n status = \"included\" if include_season else \"excluded\"\n target_list = INCLUDED_SEASONS if include_season else EXCLUDED_SEASONS\n target_list.append(row[\"season\"])\n print(f\"{format_season_label(row['season']):<10} {season_total:>12,} {faceoff_count:>13,} {season_coverage_pct:>10.1f}% {status:>10}\")\n\nif not INCLUDED_SEASONS:\n raise RuntimeError(\"No seasons meet the faceoff coverage threshold for analysis.\")\n\nFACEOFF_SEASON_PLACEHOLDERS = \", \".join([\"?\" for _ in INCLUDED_SEASONS])\nFACEOFF_SEASON_WHERE = f\"g.season IN ({FACEOFF_SEASON_PLACEHOLDERS})\"\n\nincluded_labels = \", \".join(format_season_label(season) for season in INCLUDED_SEASONS)\nprint(f\"\\nFaceoff analysis will use: {included_labels}\")\nprint(\"Blocked shots are excluded via the current derived-table proxy: keep goals plus shots with a recorded goalie.\")\n\nif EXCLUDED_SEASONS:\n excluded_labels = \", \".join(format_season_label(season) for season in EXCLUDED_SEASONS)\n print(f\"Excluded due to incomplete faceoff event coverage: {excluded_labels}\")\n print(\"These seasons still have shot rows, but their raw play-by-play feed is missing most faceoff events.\")\n\nprint()\n\ncur.execute(f\"\"\"\n SELECT s.faceoff_zone_code,\n COUNT(*) AS shots,\n SUM(s.is_goal) AS goals,\n ROUND(CAST(SUM(s.is_goal) AS REAL) / COUNT(*), 4) AS goal_rate\n FROM shot_events AS 
s\n JOIN games AS g\n ON g.game_id = s.game_id\n WHERE {ANALYSIS_SHOT_WHERE}\n AND s.seconds_since_faceoff IS NOT NULL\n AND s.faceoff_zone_code IS NOT NULL\n AND {FACEOFF_SEASON_WHERE}\n GROUP BY s.faceoff_zone_code\n ORDER BY s.faceoff_zone_code\n\"\"\", season_params())\nprint(f\"{'Zone':<20} {'Shots':>10} {'Goals':>8} {'Goal Rate':>10}\")\nprint(\"-\" * 52)\nfor row in cur.fetchall():\n label = ZONE_LABELS.get(row[0], row[0])\n print(f\"{label:<20} {row[1]:>10,} {row[2]:>8,} {row[3]:>10.4f}\")\n\nif total_shots == 0:\n print(\"\\n*** No shot events found. Run the scraper first. ***\")" |
There was a problem hiding this comment.
Replace goalie-null proxy with true blocked-shot filter
The new `ANALYSIS_SHOT_WHERE` clause, `(s.is_goal = 1 OR s.goalie_id IS NOT NULL)`, does not isolate blocked shots; it also drops any non-goal rows where `goalie_id` is missing for other reasons (for example, empty-net attempts or incomplete event details). Because this predicate is now reused across all notebook queries (including season inclusion/exclusion), it can systematically bias coverage checks and recency/zone/manpower rate estimates instead of only removing blocked shots as intended.
Useful? React with 👍 / 👎.
| "id": "season-period-heatmaps-code", | ||
| "metadata": {}, | ||
| "outputs": [], | ||
| "source": "SEASON_PANEL_COLUMNS = 4\nHEXBIN_GRIDSIZE = 24\nHEXBIN_MIN_COUNT = 1\nMIN_PERIOD_SHOTS_FOR_HEATMAP = 5_000\nHEATMAP_CMAP = \"magma\"\nHEATMAP_BINS = \"log\"\nHALF_RINK_EXTENT = (0.0, float(RINK_HALF_LENGTH), -float(RINK_HALF_WIDTH), float(RINK_HALF_WIDTH))\n\n\ndef format_period_label(period_number):\n if period_number <= 3:\n return f\"Period {period_number}\"\n return f\"Overtime {period_number - 3}\"\n\n\ncur.execute(f\"\"\"\n SELECT g.season,\n s.period,\n s.x_coord,\n s.y_coord\n FROM shot_events AS s\n JOIN games AS g\n ON g.game_id = s.game_id\n WHERE {ANALYSIS_SHOT_WHERE}\n AND s.seconds_since_faceoff IS NOT NULL\n AND s.x_coord IS NOT NULL\n AND s.y_coord IS NOT NULL\n AND {FACEOFF_SEASON_WHERE}\n ORDER BY g.season, s.period\n\"\"\", season_params())\n\nseason_period_shots = {}\nperiod_totals = {}\n\nfor row in cur.fetchall():\n key = (row[\"season\"], row[\"period\"])\n if key not in season_period_shots:\n season_period_shots[key] = {\"x\": [], \"y\": []}\n season_period_shots[key][\"x\"].append(row[\"x_coord\"])\n season_period_shots[key][\"y\"].append(row[\"y_coord\"])\n period_totals[row[\"period\"]] = period_totals.get(row[\"period\"], 0) + 1\n\nseasons = INCLUDED_SEASONS\nperiods_for_figures = [\n period for period, shot_total in sorted(period_totals.items())\n if shot_total >= MIN_PERIOD_SHOTS_FOR_HEATMAP\n]\nskipped_periods = [\n (period, shot_total) for period, shot_total in sorted(period_totals.items())\n if shot_total < MIN_PERIOD_SHOTS_FOR_HEATMAP\n]\n\nprint(\"Rendered period figures:\")\nfor period in periods_for_figures:\n print(f\" {format_period_label(period)}: {period_totals[period]:,} shots\")\n\nif skipped_periods:\n print(\"\\nSparse periods omitted from the seasonal grids:\")\n for period, shot_total in skipped_periods:\n print(f\" {format_period_label(period)}: {shot_total:,} shots\")\n\nfor period in periods_for_figures:\n n_rows = int(np.ceil(len(seasons) / SEASON_PANEL_COLUMNS))\n fig, axes = plt.subplots(\n n_rows,\n 
SEASON_PANEL_COLUMNS,\n figsize=(SEASON_PANEL_COLUMNS * 4.2, n_rows * 3.8),\n )\n axes = np.atleast_1d(axes).ravel()\n used_axes = []\n last_hexbin = None\n\n for ax, season in zip(axes, seasons):\n draw_half_rink(ax)\n ax.set_xlabel(\"\")\n ax.set_ylabel(\"\")\n used_axes.append(ax)\n\n season_key = (season, period)\n coords = season_period_shots.get(season_key)\n shot_count = len(coords[\"x\"]) if coords is not None else 0\n\n if shot_count > 0:\n last_hexbin = ax.hexbin(\n coords[\"x\"],\n coords[\"y\"],\n gridsize=HEXBIN_GRIDSIZE,\n extent=HALF_RINK_EXTENT,\n cmap=HEATMAP_CMAP,\n mincnt=HEXBIN_MIN_COUNT,\n bins=HEATMAP_BINS,\n )\n else:\n ax.text(\n 0.5,\n 0.5,\n \"No shots\",\n transform=ax.transAxes,\n ha=\"center\",\n va=\"center\",\n fontsize=10,\n color=\"#7f8c8d\",\n )\n\n ax.set_title(f\"{format_season_label(season)} (n={shot_count:,})\", fontsize=10)\n\n for ax in axes[len(seasons):]:\n ax.axis(\"off\")\n\n if last_hexbin is not None and used_axes:\n colorbar = fig.colorbar(last_hexbin, ax=used_axes, shrink=0.92, pad=0.01)\n colorbar.set_label(\"Shot count per hex (log scale)\")\n\n fig.suptitle(\n f\"Post-faceoff shot locations by season — {format_period_label(period)}\",\n fontsize=16,\n y=0.995,\n )\n fig.tight_layout(rect=(0.0, 0.0, 1.0, 0.985))\n plt.show()" |
There was a problem hiding this comment.
Do not clip heatmap points to half-rink extent
Constraining `hexbin` to `HALF_RINK_EXTENT` (x from 0 to `RINK_HALF_LENGTH`) silently drops any out-of-range coordinates before plotting. This section is explicitly meant as a coordinate sanity check, so clipping removes exactly the anomalous points (e.g., negative-x rows) that the diagnostic should surface, leading to misleading season-by-period heatmaps and counts.
Useful? React with 👍 / 👎.
Summary
Notes