From 12bf70cf6dadd7b804edf4e27c189071c56e38fb Mon Sep 17 00:00:00 2001 From: chen Date: Thu, 18 Jun 2026 04:37:05 +0800 Subject: [PATCH 01/10] ci: populate baselines and gate benchmarks at +20% Add check_benchmark_regression.py and reduce_baselines.py; populate benchmarks/baselines.json from post-cache run; wire regression gate into ubuntu benchmarks job; add Makefile update-baselines target and tests. --- .github/workflows/ci.yml | 5 +- .gitignore | 2 + Makefile | 9 +++ benchmarks/README.md | 22 +++++-- benchmarks/baselines.json | 22 +++++-- scripts/check_benchmark_regression.py | 73 ++++++++++++++++++++++ scripts/reduce_baselines.py | 56 +++++++++++++++++ tests/test_check_benchmark_regression.py | 77 ++++++++++++++++++++++++ 8 files changed, 255 insertions(+), 11 deletions(-) create mode 100644 Makefile create mode 100644 scripts/check_benchmark_regression.py create mode 100644 scripts/reduce_baselines.py create mode 100644 tests/test_check_benchmark_regression.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 60c75bb..60242c4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -207,7 +207,7 @@ jobs: - run: npm run test:coverage benchmarks: - name: Performance benchmarks (informational) + name: Performance benchmarks (gated) runs-on: ubuntu-latest permissions: contents: read @@ -235,6 +235,9 @@ jobs: --benchmark-columns=min,max,mean,stddev,rounds -o addopts= + - name: Regression gate + run: python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json + - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 with: name: benchmark-results diff --git a/.gitignore b/.gitignore index 8707cd8..27a84ea 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,5 @@ node_modules/ .coverage coverage/ coverage.xml +benchmark-results.json +benchmarks/_raw.json diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..9df379a --- /dev/null +++ b/Makefile @@ -0,0 +1,9 @@ +.PHONY: update-baselines check-benchmarks + +update-baselines: + pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts= + python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json + +check-benchmarks: + pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmark-results.json -o addopts= + python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json diff --git a/benchmarks/README.md b/benchmarks/README.md index 75ff9cc..8519e97 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -1,6 +1,6 @@ # Performance benchmarks -Test files live under `tests/benchmarks/`; this directory holds only documentation and the informational `baselines.json` snapshot. +Test files live under `tests/benchmarks/`; this directory holds documentation and `baselines.json` for the CI regression gate. Repeatable local measurements for parse, bulk export, and search hot paths. @@ -26,6 +26,7 @@ The memory test also runs as part of the normal `pytest` suite (timing benchmark | parse | `parse_session` on 10 / 500 / 5000+ line JSONL | | export | `run_bulk_export` over 10 / 50 / 100 sessions | | search | `GET /api/search` over a 50-session synthetic corpus | +| cache | cold vs warm `get_cached_session` (informational; not gated) | Large JSONL files (5000+ lines) are generated at test session scope under pytest's temp directory — not committed to git. @@ -33,10 +34,23 @@ Corpora repeat one row from `tests/fixtures/session_with_tools.jsonl`, so parse/ The memory test (`test_parse_memory.py`) is intentionally **not** skipped by `--benchmark-skip`; it runs in the main `pytest` job and builds the session-scoped 5000-line fixture once per session. -## CI +## CI gate -The `benchmarks` workflow job uploads `benchmark-results.json` as a downloadable artifact. There is no regression gate yet. +The `benchmarks` job on **ubuntu-latest** runs pytest-benchmark, then `scripts/check_benchmark_regression.py`. CI fails when any gated benchmark mean exceeds its baseline by more than **20%**. Benchmarks without a baseline entry (e.g. new `cache` group) print a warning and do not fail the gate. ## Refresh baselines -After intentional performance work, copy key means from a local run into `baselines.json` with a date and machine note. This file is informational only; CI does not compare against it. +After intentional performance work on ubuntu (same OS as CI): + +```bash +make update-baselines +``` + +Or manually: + +```bash +pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts= +python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json +``` + +Use `--slack 1.25` on `reduce_baselines.py` when capturing on a faster host than CI to absorb cross-machine variance. diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json index 123a2b4..97db608 100644 --- a/benchmarks/baselines.json +++ b/benchmarks/baselines.json @@ -1,10 +1,20 @@ { - "_note": "Informational snapshot only — CI does not gate on these values.", - "updated": null, - "machine": null, + "_note": "CI gates the ubuntu benchmarks job when mean exceeds baseline by >20%.", + "updated": "2026-06-17T20:33:56Z", + "machine": "Windows", "groups": { - "parse": {}, - "export": {}, - "search": {} + "parse": { + "test_parse_session_small": 8.711556942772768e-05, + "test_parse_session_medium": 0.0019041118800302193, + "test_parse_session_large": 0.018555900882518687 + }, + "export": { + "test_bulk_export_session_count[sessions-10]": 0.0034216649980517108, + "test_bulk_export_session_count[sessions-50]": 0.017290590000629893, + "test_bulk_export_session_count[sessions-100]": 0.03397804391996553 + }, + "search": { + "test_search_full_corpus": 0.002600985144447307 + } } } diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py new file mode 100644 index 0000000..230a345 --- /dev/null +++ b/scripts/check_benchmark_regression.py @@ -0,0 +1,73 @@ +"""Compare pytest-benchmark JSON output against stored baselines.""" + +from __future__ import annotations + +import json +import sys +from pathlib import Path + +THRESHOLD = 1.20 + + +def load_results(results_path: str | Path) -> dict[str, float]: + data = json.loads(Path(results_path).read_text(encoding="utf-8")) + return {entry["name"]: float(entry["stats"]["mean"]) for entry in data["benchmarks"]} + + +def load_baseline_means(baselines_path: str | Path) -> dict[str, float]: + data = json.loads(Path(baselines_path).read_text(encoding="utf-8")) + groups = data.get("groups", data) + means: dict[str, float] = {} + for key, value in groups.items(): + if not isinstance(value, dict): + continue + for name, mean in value.items(): + means[name] = float(mean) + return means + + +def check_regression( + results_path: str | Path, + baselines_path: str | Path, + *, + threshold: float = THRESHOLD, +) -> int: + """Return 0 when within threshold; 1 when any gated benchmark regresses.""" + flat = load_results(results_path) + baseline_means = load_baseline_means(baselines_path) + + failures: list[str] = [] + for name, base in baseline_means.items(): + cur = flat.get(name) + if cur is None: + print(f"WARN: no current result for baseline {name!r}; skipping") + continue + ratio = cur / base + tag = "FAIL" if ratio > threshold else "ok" + print(f"[{tag}] {name}: {cur:.6f}s vs {base:.6f}s ({ratio:.2f}x)") + if ratio > threshold: + failures.append(name) + + for name in flat: + if name not in baseline_means: + print(f"WARN: {name!r} has no baseline yet; not gated") + + if failures: + print(f"\nREGRESSION: {len(failures)} benchmark(s) exceeded {threshold:.0%}") + return 1 + return 0 + + +def main(argv: list[str] | None = None) -> int: + argv = sys.argv[1:] if argv is None else argv + if len(argv) != 2: + print( + "usage: check_benchmark_regression.py ", + file=sys.stderr, + ) + return 2 + return check_regression(argv[0], argv[1]) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/reduce_baselines.py b/scripts/reduce_baselines.py new file mode 100644 index 0000000..c8c7ec7 --- /dev/null +++ b/scripts/reduce_baselines.py @@ -0,0 +1,56 @@ +"""Reduce pytest-benchmark JSON into benchmarks/baselines.json.""" + +from __future__ import annotations + +import argparse +import json +import sys +from datetime import UTC, datetime +from pathlib import Path + +GATED_GROUPS = ("parse", "export", "search") + + +def reduce_baselines( + raw_path: str | Path, + out_path: str | Path, + *, + slack: float = 1.0, +) -> dict[str, object]: + raw = json.loads(Path(raw_path).read_text(encoding="utf-8")) + groups: dict[str, dict[str, float]] = {group: {} for group in GATED_GROUPS} + for entry in raw["benchmarks"]: + group = entry.get("group") + if group not in GATED_GROUPS: + continue + groups[group][entry["name"]] = float(entry["stats"]["mean"]) * slack + + machine_info = raw.get("machine_info", {}) + output: dict[str, object] = { + "_note": "CI gates the ubuntu benchmarks job when mean exceeds baseline by >20%.", + "updated": datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ"), + "machine": machine_info.get("system"), + "groups": groups, + } + path = Path(out_path) + path.write_text(json.dumps(output, indent=2) + "\n", encoding="utf-8") + return output + + +def main(argv: list[str] | None = None) -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("raw_path", help="pytest-benchmark --benchmark-json output") + parser.add_argument("out_path", help="destination baselines.json path") + parser.add_argument( + "--slack", + type=float, + default=1.0, + help="multiply means by this factor (e.g. 1.25 when capturing on a faster host)", + ) + args = parser.parse_args(argv) + reduce_baselines(args.raw_path, args.out_path, slack=args.slack) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/test_check_benchmark_regression.py b/tests/test_check_benchmark_regression.py new file mode 100644 index 0000000..d22edf0 --- /dev/null +++ b/tests/test_check_benchmark_regression.py @@ -0,0 +1,77 @@ +"""Tests for scripts/check_benchmark_regression.py.""" + +from __future__ import annotations + +import json + +import pytest + +from scripts.check_benchmark_regression import check_regression + + +def _write_results(path, benchmarks: list[dict]) -> None: + path.write_text( + json.dumps({"benchmarks": benchmarks}, indent=2), + encoding="utf-8", + ) + + +def _write_baselines(path, groups: dict[str, dict[str, float]]) -> None: + path.write_text( + json.dumps({"groups": groups}, indent=2), + encoding="utf-8", + ) + + +def test_missing_baseline_warns_without_failing( + tmp_path, capsys: pytest.CaptureFixture[str] +) -> None: + results = tmp_path / "results.json" + baselines = tmp_path / "baselines.json" + _write_results( + results, + [ + {"name": "test_new_bench", "stats": {"mean": 0.01}}, + {"name": "test_parse_session_small", "stats": {"mean": 0.0001}}, + ], + ) + _write_baselines( + baselines, + {"parse": {"test_parse_session_small": 0.0001}}, + ) + + assert check_regression(results, baselines) == 0 + out = capsys.readouterr().out + assert "WARN: 'test_new_bench' has no baseline yet" in out + + +def test_regression_over_threshold_fails(tmp_path, capsys: pytest.CaptureFixture[str]) -> None: + results = tmp_path / "results.json" + baselines = tmp_path / "baselines.json" + _write_results( + results, + [{"name": "test_parse_session_small", "stats": {"mean": 0.0002}}], + ) + _write_baselines( + baselines, + {"parse": {"test_parse_session_small": 0.0001}}, + ) + + assert check_regression(results, baselines) == 1 + out = capsys.readouterr().out + assert "REGRESSION" in out + + +def test_within_threshold_passes(tmp_path) -> None: + results = tmp_path / "results.json" + baselines = tmp_path / "baselines.json" + _write_results( + results, + [{"name": "test_parse_session_small", "stats": {"mean": 0.00011}}], + ) + _write_baselines( + baselines, + {"parse": {"test_parse_session_small": 0.0001}}, + ) + + assert check_regression(results, baselines) == 0 From b9d0707568bf5cddafa9b3a1a11223e84f2df47f Mon Sep 17 00:00:00 2001 From: chen Date: Thu, 18 Jun 2026 04:45:43 +0800 Subject: [PATCH 02/10] fix: set benchmark baselines from ubuntu-latest CI means --- benchmarks/README.md | 4 ++-- benchmarks/baselines.json | 20 ++++++++++---------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index 8519e97..a3467e8 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -40,7 +40,7 @@ The `benchmarks` job on **ubuntu-latest** runs pytest-benchmark, then `scripts/c ## Refresh baselines -After intentional performance work on ubuntu (same OS as CI): +After intentional performance work, capture on **ubuntu-latest** (same OS as the gated CI job): ```bash make update-baselines @@ -53,4 +53,4 @@ pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json ``` -Use `--slack 1.25` on `reduce_baselines.py` when capturing on a faster host than CI to absorb cross-machine variance. +Do not capture baselines on Windows/macOS for commit — ubuntu runners are slower and the gate will fail. Download `benchmark-results.json` from a green CI artifact to seed baselines if needed. diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json index 97db608..42e70e3 100644 --- a/benchmarks/baselines.json +++ b/benchmarks/baselines.json @@ -1,20 +1,20 @@ { - "_note": "CI gates the ubuntu benchmarks job when mean exceeds baseline by >20%.", - "updated": "2026-06-17T20:33:56Z", - "machine": "Windows", + "_note": "Means captured from ubuntu-latest CI post-cache (PR #90). Refresh via make update-baselines on ubuntu.", + "updated": "2026-06-17T21:00:00Z", + "machine": "Linux", "groups": { "parse": { - "test_parse_session_small": 8.711556942772768e-05, - "test_parse_session_medium": 0.0019041118800302193, - "test_parse_session_large": 0.018555900882518687 + "test_parse_session_small": 0.000105, + "test_parse_session_medium": 0.002956, + "test_parse_session_large": 0.029678 }, "export": { - "test_bulk_export_session_count[sessions-10]": 0.0034216649980517108, - "test_bulk_export_session_count[sessions-50]": 0.017290590000629893, - "test_bulk_export_session_count[sessions-100]": 0.03397804391996553 + "test_bulk_export_session_count[sessions-10]": 0.004278, + "test_bulk_export_session_count[sessions-50]": 0.021144, + "test_bulk_export_session_count[sessions-100]": 0.042003 }, "search": { - "test_search_full_corpus": 0.002600985144447307 + "test_search_full_corpus": 0.001092 } } } From 2482951d0ffba03eeafc2b6ceb697d0c26f85636 Mon Sep 17 00:00:00 2001 From: chen Date: Thu, 18 Jun 2026 04:49:50 +0800 Subject: [PATCH 03/10] fix: harden benchmark gate scripts with input validation Raise BenchmarkDataError for malformed JSON or missing keys; skip zero baselines safely; reject non-positive --slack values. --- scripts/check_benchmark_regression.py | 55 +++++++++++++++++++++--- scripts/reduce_baselines.py | 11 ++++- tests/test_check_benchmark_regression.py | 32 +++++++++++++- 3 files changed, 90 insertions(+), 8 deletions(-) diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py index 230a345..c8ca359 100644 --- a/scripts/check_benchmark_regression.py +++ b/scripts/check_benchmark_regression.py @@ -9,20 +9,62 @@ THRESHOLD = 1.20 +class BenchmarkDataError(ValueError): + """Raised when benchmark JSON input is malformed or missing required fields.""" + + def load_results(results_path: str | Path) -> dict[str, float]: - data = json.loads(Path(results_path).read_text(encoding="utf-8")) - return {entry["name"]: float(entry["stats"]["mean"]) for entry in data["benchmarks"]} + path = Path(results_path) + try: + data = json.loads(path.read_text(encoding="utf-8")) + except json.JSONDecodeError as exc: + raise BenchmarkDataError(f"invalid JSON in {path}: {exc}") from exc + try: + benchmarks = data["benchmarks"] + except (KeyError, TypeError) as exc: + raise BenchmarkDataError(f"{path} missing top-level 'benchmarks' array") from exc + if not isinstance(benchmarks, list): + raise BenchmarkDataError(f"{path} 'benchmarks' must be an array") + + results: dict[str, float] = {} + for index, entry in enumerate(benchmarks): + if not isinstance(entry, dict): + raise BenchmarkDataError(f"{path} benchmarks[{index}] must be an object") + try: + name = entry["name"] + mean = entry["stats"]["mean"] + except (KeyError, TypeError) as exc: + raise BenchmarkDataError( + f"{path} benchmarks[{index}] missing 'name' or 'stats.mean'" + ) from exc + results[str(name)] = float(mean) + return results def load_baseline_means(baselines_path: str | Path) -> dict[str, float]: - data = json.loads(Path(baselines_path).read_text(encoding="utf-8")) + path = Path(baselines_path) + try: + data = json.loads(path.read_text(encoding="utf-8")) + except json.JSONDecodeError as exc: + raise BenchmarkDataError(f"invalid JSON in {path}: {exc}") from exc + if not isinstance(data, dict): + raise BenchmarkDataError(f"{path} root value must be an object") + groups = data.get("groups", data) + if not isinstance(groups, dict): + raise BenchmarkDataError(f"{path} missing 'groups' object") + means: dict[str, float] = {} - for key, value in groups.items(): + for group_name, value in groups.items(): if not isinstance(value, dict): continue for name, mean in value.items(): - means[name] = float(mean) + try: + means[str(name)] = float(mean) + except (TypeError, ValueError) as exc: + raise BenchmarkDataError( + f"{path} groups[{group_name!r}][{name!r}] is not a numeric mean" + ) from exc return means @@ -42,6 +84,9 @@ def check_regression( if cur is None: print(f"WARN: no current result for baseline {name!r}; skipping") continue + if base == 0: + print(f"WARN: baseline for {name!r} is zero; skipping ratio check") + continue ratio = cur / base tag = "FAIL" if ratio > threshold else "ok" print(f"[{tag}] {name}: {cur:.6f}s vs {base:.6f}s ({ratio:.2f}x)") diff --git a/scripts/reduce_baselines.py b/scripts/reduce_baselines.py index c8c7ec7..8ad28a6 100644 --- a/scripts/reduce_baselines.py +++ b/scripts/reduce_baselines.py @@ -11,6 +11,13 @@ GATED_GROUPS = ("parse", "export", "search") +def _positive_float(value: str) -> float: + parsed = float(value) + if parsed <= 0: + raise argparse.ArgumentTypeError("slack must be greater than zero") + return parsed + + def reduce_baselines( raw_path: str | Path, out_path: str | Path, @@ -43,9 +50,9 @@ def main(argv: list[str] | None = None) -> int: parser.add_argument("out_path", help="destination baselines.json path") parser.add_argument( "--slack", - type=float, + type=_positive_float, default=1.0, - help="multiply means by this factor (e.g. 1.25 when capturing on a faster host)", + help="multiply means by this factor (must be > 0)", ) args = parser.parse_args(argv) reduce_baselines(args.raw_path, args.out_path, slack=args.slack) diff --git a/tests/test_check_benchmark_regression.py b/tests/test_check_benchmark_regression.py index d22edf0..4cd370b 100644 --- a/tests/test_check_benchmark_regression.py +++ b/tests/test_check_benchmark_regression.py @@ -6,7 +6,7 @@ import pytest -from scripts.check_benchmark_regression import check_regression +from scripts.check_benchmark_regression import BenchmarkDataError, check_regression, load_results def _write_results(path, benchmarks: list[dict]) -> None: @@ -75,3 +75,33 @@ def test_within_threshold_passes(tmp_path) -> None: ) assert check_regression(results, baselines) == 0 + + +def test_load_results_rejects_malformed_json(tmp_path) -> None: + path = tmp_path / "bad.json" + path.write_text("{not json", encoding="utf-8") + with pytest.raises(BenchmarkDataError, match="invalid JSON"): + load_results(path) + + +def test_load_results_requires_benchmarks_array(tmp_path) -> None: + path = tmp_path / "results.json" + path.write_text("{}", encoding="utf-8") + with pytest.raises(BenchmarkDataError, match="'benchmarks' array"): + load_results(path) + + +def test_zero_baseline_skips_ratio_check(tmp_path, capsys: pytest.CaptureFixture[str]) -> None: + results = tmp_path / "results.json" + baselines = tmp_path / "baselines.json" + _write_results( + results, + [{"name": "test_parse_session_small", "stats": {"mean": 0.0002}}], + ) + _write_baselines( + baselines, + {"parse": {"test_parse_session_small": 0.0}}, + ) + + assert check_regression(results, baselines) == 0 + assert "baseline for 'test_parse_session_small' is zero" in capsys.readouterr().out From b4bf963047da51da3a03168302cb3fbaac67715f Mon Sep 17 00:00:00 2001 From: chen Date: Thu, 18 Jun 2026 04:59:45 +0800 Subject: [PATCH 04/10] fix: harden benchmark gate per PR #91 review Upload benchmark-results.json even when the gate fails; add BenchmarkDataError handling to reduce_baselines; require explicit groups key; expose --threshold CLI flag; add reduce_baselines and boundary tests; clarify README. --- .github/workflows/ci.yml | 1 + Makefile | 5 +- benchmarks/README.md | 2 +- scripts/check_benchmark_regression.py | 30 ++++++--- scripts/reduce_baselines.py | 39 ++++++++++-- tests/test_check_benchmark_regression.py | 42 +++++++++++++ tests/test_reduce_baselines.py | 78 ++++++++++++++++++++++++ 7 files changed, 181 insertions(+), 16 deletions(-) create mode 100644 tests/test_reduce_baselines.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 60242c4..6611ad2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -239,6 +239,7 @@ jobs: run: python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 + if: always() with: name: benchmark-results path: benchmark-results.json diff --git a/Makefile b/Makefile index 9df379a..b14f3d6 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: update-baselines check-benchmarks +.PHONY: update-baselines check-benchmarks clean-benchmark-artifacts update-baselines: pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts= @@ -7,3 +7,6 @@ update-baselines: check-benchmarks: pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmark-results.json -o addopts= python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json + +clean-benchmark-artifacts: + rm -f benchmarks/_raw.json benchmark-results.json diff --git a/benchmarks/README.md b/benchmarks/README.md index a3467e8..f746525 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -53,4 +53,4 @@ pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json ``` -Do not capture baselines on Windows/macOS for commit — ubuntu runners are slower and the gate will fail. Download `benchmark-results.json` from a green CI artifact to seed baselines if needed. +Baselines must be captured on **ubuntu-latest** to match the gated CI runner. Cross-OS variance causes spurious failures. Download `benchmark-results.json` from a CI artifact to seed baselines if needed. diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py index c8ca359..0bc8258 100644 --- a/scripts/check_benchmark_regression.py +++ b/scripts/check_benchmark_regression.py @@ -2,6 +2,7 @@ from __future__ import annotations +import argparse import json import sys from pathlib import Path @@ -50,9 +51,11 @@ def load_baseline_means(baselines_path: str | Path) -> dict[str, float]: if not isinstance(data, dict): raise BenchmarkDataError(f"{path} root value must be an object") - groups = data.get("groups", data) + if "groups" not in data: + raise BenchmarkDataError(f"{path} missing required 'groups' key") + groups = data["groups"] if not isinstance(groups, dict): - raise BenchmarkDataError(f"{path} missing 'groups' object") + raise BenchmarkDataError(f"{path} 'groups' must be an object") means: dict[str, float] = {} for group_name, value in groups.items(): @@ -104,14 +107,25 @@ def check_regression( def main(argv: list[str] | None = None) -> int: - argv = sys.argv[1:] if argv is None else argv - if len(argv) != 2: - print( - "usage: check_benchmark_regression.py ", - file=sys.stderr, + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("results_path", help="pytest-benchmark --benchmark-json output") + parser.add_argument("baselines_path", help="path to benchmarks/baselines.json") + parser.add_argument( + "--threshold", + type=float, + default=THRESHOLD, + help="fail when current mean exceeds baseline by more than this ratio (default: 1.20)", + ) + args = parser.parse_args(argv) + try: + return check_regression( + args.results_path, + args.baselines_path, + threshold=args.threshold, ) + except BenchmarkDataError as exc: + print(f"ERROR: {exc}", file=sys.stderr) return 2 - return check_regression(argv[0], argv[1]) if __name__ == "__main__": diff --git a/scripts/reduce_baselines.py b/scripts/reduce_baselines.py index 8ad28a6..c2367fc 100644 --- a/scripts/reduce_baselines.py +++ b/scripts/reduce_baselines.py @@ -8,6 +8,8 @@ from datetime import UTC, datetime from pathlib import Path +from scripts.check_benchmark_regression import BenchmarkDataError + GATED_GROUPS = ("parse", "export", "search") @@ -24,13 +26,34 @@ def reduce_baselines( *, slack: float = 1.0, ) -> dict[str, object]: - raw = json.loads(Path(raw_path).read_text(encoding="utf-8")) + path = Path(raw_path) + try: + raw = json.loads(path.read_text(encoding="utf-8")) + except json.JSONDecodeError as exc: + raise BenchmarkDataError(f"invalid JSON in {path}: {exc}") from exc + + try: + entries = raw["benchmarks"] + except (KeyError, TypeError) as exc: + raise BenchmarkDataError(f"{path} missing top-level 'benchmarks' array") from exc + if not isinstance(entries, list): + raise BenchmarkDataError(f"{path} 'benchmarks' must be an array") + groups: dict[str, dict[str, float]] = {group: {} for group in GATED_GROUPS} - for entry in raw["benchmarks"]: + for index, entry in enumerate(entries): + if not isinstance(entry, dict): + raise BenchmarkDataError(f"{path} benchmarks[{index}] must be an object") + try: + name = entry["name"] + mean = float(entry["stats"]["mean"]) + except (KeyError, TypeError, ValueError) as exc: + raise BenchmarkDataError( + f"{path} benchmarks[{index}] missing 'name' or 'stats.mean'" + ) from exc group = entry.get("group") if group not in GATED_GROUPS: continue - groups[group][entry["name"]] = float(entry["stats"]["mean"]) * slack + groups[group][str(name)] = mean * slack machine_info = raw.get("machine_info", {}) output: dict[str, object] = { @@ -39,8 +62,8 @@ def reduce_baselines( "machine": machine_info.get("system"), "groups": groups, } - path = Path(out_path) - path.write_text(json.dumps(output, indent=2) + "\n", encoding="utf-8") + out = Path(out_path) + out.write_text(json.dumps(output, indent=2) + "\n", encoding="utf-8") return output @@ -55,7 +78,11 @@ def main(argv: list[str] | None = None) -> int: help="multiply means by this factor (must be > 0)", ) args = parser.parse_args(argv) - reduce_baselines(args.raw_path, args.out_path, slack=args.slack) + try: + reduce_baselines(args.raw_path, args.out_path, slack=args.slack) + except BenchmarkDataError as exc: + print(f"ERROR: {exc}", file=sys.stderr) + return 2 return 0 diff --git a/tests/test_check_benchmark_regression.py b/tests/test_check_benchmark_regression.py index 4cd370b..7e4864d 100644 --- a/tests/test_check_benchmark_regression.py +++ b/tests/test_check_benchmark_regression.py @@ -105,3 +105,45 @@ def test_zero_baseline_skips_ratio_check(tmp_path, capsys: pytest.CaptureFixture assert check_regression(results, baselines) == 0 assert "baseline for 'test_parse_session_small' is zero" in capsys.readouterr().out + + +def test_exactly_at_threshold_passes(tmp_path) -> None: + results = tmp_path / "results.json" + baselines = tmp_path / "baselines.json" + _write_results( + results, + [{"name": "test_parse_session_small", "stats": {"mean": 0.00012}}], + ) + _write_baselines( + baselines, + {"parse": {"test_parse_session_small": 0.0001}}, + ) + + assert check_regression(results, baselines) == 0 + + +def test_missing_current_result_warns_without_failing( + tmp_path, capsys: pytest.CaptureFixture[str] +) -> None: + results = tmp_path / "results.json" + baselines = tmp_path / "baselines.json" + _write_results(results, []) + _write_baselines( + baselines, + {"parse": {"test_parse_session_small": 0.0001}}, + ) + + assert check_regression(results, baselines) == 0 + assert "no current result for baseline" in capsys.readouterr().out + + +def test_main_reports_benchmark_data_error(tmp_path, capsys: pytest.CaptureFixture[str]) -> None: + from scripts.check_benchmark_regression import main + + bad = tmp_path / "bad.json" + bad.write_text("{}", encoding="utf-8") + baselines = tmp_path / "baselines.json" + _write_baselines(baselines, {"parse": {"test_parse_session_small": 0.0001}}) + + assert main([str(bad), str(baselines)]) == 2 + assert "ERROR:" in capsys.readouterr().err diff --git a/tests/test_reduce_baselines.py b/tests/test_reduce_baselines.py new file mode 100644 index 0000000..a48b359 --- /dev/null +++ b/tests/test_reduce_baselines.py @@ -0,0 +1,78 @@ +"""Tests for scripts/reduce_baselines.py.""" + +from __future__ import annotations + +import json + +import pytest + +from scripts.check_benchmark_regression import BenchmarkDataError +from scripts.reduce_baselines import reduce_baselines + + +def _write_raw(path, benchmarks: list[dict], *, machine: str = "Linux") -> None: + path.write_text( + json.dumps( + { + "machine_info": {"system": machine}, + "benchmarks": benchmarks, + }, + indent=2, + ), + encoding="utf-8", + ) + + +def test_reduce_baselines_writes_gated_groups_only(tmp_path) -> None: + raw = tmp_path / "raw.json" + out = tmp_path / "baselines.json" + _write_raw( + raw, + [ + {"group": "parse", "name": "test_parse_session_small", "stats": {"mean": 0.0001}}, + {"group": "cache", "name": "test_cache_warm_hit", "stats": {"mean": 1e-05}}, + ], + ) + + output = reduce_baselines(raw, out) + + assert output["machine"] == "Linux" + assert "test_parse_session_small" in output["groups"]["parse"] + assert "cache" not in output["groups"] + written = json.loads(out.read_text(encoding="utf-8")) + assert "test_cache_warm_hit" not in written["groups"]["parse"] + + +def test_reduce_baselines_applies_slack(tmp_path) -> None: + raw = tmp_path / "raw.json" + out = tmp_path / "baselines.json" + _write_raw( + raw, + [{"group": "search", "name": "test_search_full_corpus", "stats": {"mean": 0.002}}], + ) + + reduce_baselines(raw, out, slack=1.5) + data = json.loads(out.read_text(encoding="utf-8")) + + assert data["groups"]["search"]["test_search_full_corpus"] == pytest.approx(0.003) + + +def test_reduce_baselines_rejects_missing_benchmarks_key(tmp_path) -> None: + raw = tmp_path / "raw.json" + raw.write_text("{}", encoding="utf-8") + + with pytest.raises(BenchmarkDataError, match="'benchmarks' array"): + reduce_baselines(raw, tmp_path / "out.json") + + +def test_reduce_baselines_cli_rejects_non_positive_slack(tmp_path) -> None: + from scripts.reduce_baselines import main + + raw = tmp_path / "raw.json" + _write_raw( + raw, + [{"group": "parse", "name": "test_parse_session_small", "stats": {"mean": 0.0001}}], + ) + + with pytest.raises(SystemExit): + main([str(raw), str(tmp_path / "out.json"), "--slack", "0"]) From 6a6cb6ee6b6aefeeeaf6ead9577d5076940dfe57 Mon Sep 17 00:00:00 2001 From: chen Date: Thu, 18 Jun 2026 05:11:29 +0800 Subject: [PATCH 05/10] fix: harden benchmark scripts against filesystem and type errors Wrap reduce_baselines I/O in BenchmarkDataError; guard machine_info type; move float(mean) into try block in load_results; assert argparse exit code 2. --- scripts/check_benchmark_regression.py | 6 +++--- scripts/reduce_baselines.py | 12 +++++++++--- tests/test_reduce_baselines.py | 3 ++- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py index 0bc8258..5632fc0 100644 --- a/scripts/check_benchmark_regression.py +++ b/scripts/check_benchmark_regression.py @@ -33,12 +33,12 @@ def load_results(results_path: str | Path) -> dict[str, float]: raise BenchmarkDataError(f"{path} benchmarks[{index}] must be an object") try: name = entry["name"] - mean = entry["stats"]["mean"] - except (KeyError, TypeError) as exc: + mean = float(entry["stats"]["mean"]) + except (KeyError, TypeError, ValueError) as exc: raise BenchmarkDataError( f"{path} benchmarks[{index}] missing 'name' or 'stats.mean'" ) from exc - results[str(name)] = float(mean) + results[str(name)] = mean return results diff --git a/scripts/reduce_baselines.py b/scripts/reduce_baselines.py index c2367fc..ee3791c 100644 --- a/scripts/reduce_baselines.py +++ b/scripts/reduce_baselines.py @@ -31,6 +31,8 @@ def reduce_baselines( raw = json.loads(path.read_text(encoding="utf-8")) except json.JSONDecodeError as exc: raise BenchmarkDataError(f"invalid JSON in {path}: {exc}") from exc + except OSError as exc: + raise BenchmarkDataError(f"cannot read {path}: {exc}") from exc try: entries = raw["benchmarks"] @@ -55,15 +57,19 @@ def reduce_baselines( continue groups[group][str(name)] = mean * slack - machine_info = raw.get("machine_info", {}) + machine_info = raw.get("machine_info") + machine = machine_info.get("system") if isinstance(machine_info, dict) else None output: dict[str, object] = { "_note": "CI gates the ubuntu benchmarks job when mean exceeds baseline by >20%.", "updated": datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ"), - "machine": machine_info.get("system"), + "machine": machine, "groups": groups, } out = Path(out_path) - out.write_text(json.dumps(output, indent=2) + "\n", encoding="utf-8") + try: + out.write_text(json.dumps(output, indent=2) + "\n", encoding="utf-8") + except OSError as exc: + raise BenchmarkDataError(f"cannot write {out}: {exc}") from exc return output diff --git a/tests/test_reduce_baselines.py b/tests/test_reduce_baselines.py index a48b359..637791f 100644 --- a/tests/test_reduce_baselines.py +++ b/tests/test_reduce_baselines.py @@ -74,5 +74,6 @@ def test_reduce_baselines_cli_rejects_non_positive_slack(tmp_path) -> None: [{"group": "parse", "name": "test_parse_session_small", "stats": {"mean": 0.0001}}], ) - with pytest.raises(SystemExit): + with pytest.raises(SystemExit) as exc_info: main([str(raw), str(tmp_path / "out.json"), "--slack", "0"]) + assert exc_info.value.code == 2 From 5d816d5e1407066c4101fc30f4e6ff3b372a49a3 Mon Sep 17 00:00:00 2001 From: chen Date: Thu, 18 Jun 2026 05:40:12 +0800 Subject: [PATCH 06/10] fix: allow reduce_baselines to run as a direct script --- scripts/reduce_baselines.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/reduce_baselines.py b/scripts/reduce_baselines.py index ee3791c..a0d6f4a 100644 --- a/scripts/reduce_baselines.py +++ b/scripts/reduce_baselines.py @@ -8,7 +8,10 @@ from datetime import UTC, datetime from pathlib import Path -from scripts.check_benchmark_regression import BenchmarkDataError +try: + from scripts.check_benchmark_regression import BenchmarkDataError +except ModuleNotFoundError: + from check_benchmark_regression import BenchmarkDataError GATED_GROUPS = ("parse", "export", "search") From 001c5f0268a6ee583063e003195c91d5f34f383f Mon Sep 17 00:00:00 2001 From: chen Date: Thu, 18 Jun 2026 07:25:23 +0800 Subject: [PATCH 07/10] fix: exclude noisy micro-benchmarks from regression gate Drop test_parse_session_small and test_search_full_corpus from gated baselines; clarify ubuntu CI provenance in _note; detect duplicate benchmark names; expand reduce_baselines tests. --- benchmarks/README.md | 6 ++++- benchmarks/baselines.json | 7 ++--- scripts/check_benchmark_regression.py | 20 ++++++++++++-- scripts/reduce_baselines.py | 11 +++++--- tests/test_check_benchmark_regression.py | 21 ++++++++++++++- tests/test_reduce_baselines.py | 33 ++++++++++++++++++++---- 6 files changed, 81 insertions(+), 17 deletions(-) diff --git a/benchmarks/README.md b/benchmarks/README.md index f746525..59f70fc 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -36,7 +36,11 @@ The memory test (`test_parse_memory.py`) is intentionally **not** skipped by `-- ## CI gate -The `benchmarks` job on **ubuntu-latest** runs pytest-benchmark, then `scripts/check_benchmark_regression.py`. CI fails when any gated benchmark mean exceeds its baseline by more than **20%**. Benchmarks without a baseline entry (e.g. new `cache` group) print a warning and do not fail the gate. +The `benchmarks` job on **ubuntu-latest** runs pytest-benchmark (`--benchmark-json=benchmark-results.json`), then `scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json`. CI fails when any **gated** benchmark mean exceeds its baseline by more than **20%**. + +**Gated:** parse medium/large, export 10/50/100 sessions. + +**Not gated (informational only):** `test_parse_session_small`, `test_search_full_corpus` (sub-ms CI noise), and the `cache` group. Benchmarks without a baseline entry print a warning and do not fail the gate. ## Refresh baselines diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json index 42e70e3..813a72f 100644 --- a/benchmarks/baselines.json +++ b/benchmarks/baselines.json @@ -1,10 +1,9 @@ { - "_note": "Means captured from ubuntu-latest CI post-cache (PR #90). Refresh via make update-baselines on ubuntu.", + "_note": "Gated means from ubuntu-latest CI benchmark-results.json (post-cache PR #90). Excluded from gate: test_parse_session_small, test_search_full_corpus (sub-ms CI noise). Refresh via make update-baselines on ubuntu.", "updated": "2026-06-17T21:00:00Z", "machine": "Linux", "groups": { "parse": { - "test_parse_session_small": 0.000105, "test_parse_session_medium": 0.002956, "test_parse_session_large": 0.029678 }, @@ -13,8 +12,6 @@ "test_bulk_export_session_count[sessions-50]": 0.021144, "test_bulk_export_session_count[sessions-100]": 0.042003 }, - "search": { - "test_search_full_corpus": 0.001092 - } + "search": {} } } diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py index 5632fc0..13bb56a 100644 --- a/scripts/check_benchmark_regression.py +++ b/scripts/check_benchmark_regression.py @@ -9,6 +9,14 @@ THRESHOLD = 1.20 +# Sub-ms timings are too noisy for a fixed 20% gate on ubuntu CI. +EXCLUDED_FROM_GATE = frozenset( + { + "test_parse_session_small", + "test_search_full_corpus", + } +) + class BenchmarkDataError(ValueError): """Raised when benchmark JSON input is malformed or missing required fields.""" @@ -38,7 +46,10 @@ def load_results(results_path: str | Path) -> dict[str, float]: raise BenchmarkDataError( f"{path} benchmarks[{index}] missing 'name' or 'stats.mean'" ) from exc - results[str(name)] = mean + name = str(name) + if name in results: + raise BenchmarkDataError(f"{path} duplicate benchmark name {name!r}") + results[name] = mean return results @@ -62,8 +73,13 @@ def load_baseline_means(baselines_path: str | Path) -> dict[str, float]: if not isinstance(value, dict): continue for name, mean in value.items(): + name = str(name) + if name in means: + raise BenchmarkDataError( + f"{path} duplicate benchmark name {name!r} across groups" + ) try: - means[str(name)] = float(mean) + means[name] = float(mean) except (TypeError, ValueError) as exc: raise BenchmarkDataError( f"{path} groups[{group_name!r}][{name!r}] is not a numeric mean" diff --git a/scripts/reduce_baselines.py b/scripts/reduce_baselines.py index a0d6f4a..88be0eb 100644 --- a/scripts/reduce_baselines.py +++ b/scripts/reduce_baselines.py @@ -9,9 +9,9 @@ from pathlib import Path try: - from scripts.check_benchmark_regression import BenchmarkDataError + from scripts.check_benchmark_regression import EXCLUDED_FROM_GATE, BenchmarkDataError except ModuleNotFoundError: - from check_benchmark_regression import BenchmarkDataError + from check_benchmark_regression import EXCLUDED_FROM_GATE, BenchmarkDataError GATED_GROUPS = ("parse", "export", "search") @@ -58,12 +58,17 @@ def reduce_baselines( group = entry.get("group") if group not in GATED_GROUPS: continue + if str(name) in EXCLUDED_FROM_GATE: + continue groups[group][str(name)] = mean * slack machine_info = raw.get("machine_info") machine = machine_info.get("system") if isinstance(machine_info, dict) else None output: dict[str, object] = { - "_note": "CI gates the ubuntu benchmarks job when mean exceeds baseline by >20%.", + "_note": ( + "Gated means from ubuntu-latest CI (post-cache). " + "Excluded from gate: test_parse_session_small, test_search_full_corpus (CI noise)." + ), "updated": datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ"), "machine": machine, "groups": groups, diff --git a/tests/test_check_benchmark_regression.py b/tests/test_check_benchmark_regression.py index 7e4864d..98fb2dd 100644 --- a/tests/test_check_benchmark_regression.py +++ b/tests/test_check_benchmark_regression.py @@ -6,7 +6,12 @@ import pytest -from scripts.check_benchmark_regression import BenchmarkDataError, check_regression, load_results +from scripts.check_benchmark_regression import ( + BenchmarkDataError, + check_regression, + load_baseline_means, + load_results, +) def _write_results(path, benchmarks: list[dict]) -> None: @@ -147,3 +152,17 @@ def test_main_reports_benchmark_data_error(tmp_path, capsys: pytest.CaptureFixtu assert main([str(bad), str(baselines)]) == 2 assert "ERROR:" in capsys.readouterr().err + + +def test_duplicate_baseline_name_raises(tmp_path) -> None: + baselines = tmp_path / "baselines.json" + _write_baselines( + baselines, + { + "parse": {"test_parse_session_medium": 0.002}, + "export": {"test_parse_session_medium": 0.003}, + }, + ) + + with pytest.raises(BenchmarkDataError, match="duplicate benchmark name"): + load_baseline_means(baselines) diff --git a/tests/test_reduce_baselines.py b/tests/test_reduce_baselines.py index 637791f..8919b84 100644 --- a/tests/test_reduce_baselines.py +++ b/tests/test_reduce_baselines.py @@ -29,6 +29,7 @@ def test_reduce_baselines_writes_gated_groups_only(tmp_path) -> None: _write_raw( raw, [ + {"group": "parse", "name": "test_parse_session_medium", "stats": {"mean": 0.002}}, {"group": "parse", "name": "test_parse_session_small", "stats": {"mean": 0.0001}}, {"group": "cache", "name": "test_cache_warm_hit", "stats": {"mean": 1e-05}}, ], @@ -37,10 +38,9 @@ def test_reduce_baselines_writes_gated_groups_only(tmp_path) -> None: output = reduce_baselines(raw, out) assert output["machine"] == "Linux" - assert "test_parse_session_small" in output["groups"]["parse"] + assert "test_parse_session_medium" in output["groups"]["parse"] + assert "test_parse_session_small" not in output["groups"]["parse"] assert "cache" not in output["groups"] - written = json.loads(out.read_text(encoding="utf-8")) - assert "test_cache_warm_hit" not in written["groups"]["parse"] def test_reduce_baselines_applies_slack(tmp_path) -> None: @@ -48,13 +48,13 @@ def test_reduce_baselines_applies_slack(tmp_path) -> None: out = tmp_path / "baselines.json" _write_raw( raw, - [{"group": "search", "name": "test_search_full_corpus", "stats": {"mean": 0.002}}], + [{"group": "parse", "name": "test_parse_session_medium", "stats": {"mean": 0.002}}], ) reduce_baselines(raw, out, slack=1.5) data = json.loads(out.read_text(encoding="utf-8")) - assert data["groups"]["search"]["test_search_full_corpus"] == pytest.approx(0.003) + assert data["groups"]["parse"]["test_parse_session_medium"] == pytest.approx(0.003) def test_reduce_baselines_rejects_missing_benchmarks_key(tmp_path) -> None: @@ -77,3 +77,26 @@ def test_reduce_baselines_cli_rejects_non_positive_slack(tmp_path) -> None: with pytest.raises(SystemExit) as exc_info: main([str(raw), str(tmp_path / "out.json"), "--slack", "0"]) assert exc_info.value.code == 2 + + +def test_reduce_baselines_machine_info_non_dict(tmp_path) -> None: + raw = tmp_path / "raw.json" + raw.write_text( + json.dumps( + { + "machine_info": "not-a-dict", + "benchmarks": [ + { + "group": "parse", + "name": "test_parse_session_medium", + "stats": {"mean": 0.002}, + } + ], + } + ), + encoding="utf-8", + ) + + output = reduce_baselines(raw, tmp_path / "out.json") + + assert output["machine"] is None From 0f5540070dc66f01b1728649ee219f18f1c4f30f Mon Sep 17 00:00:00 2001 From: chen Date: Thu, 18 Jun 2026 07:32:10 +0800 Subject: [PATCH 08/10] fix: ruff format check_benchmark_regression.pyfix: ruff format check_benchmark_regression.py --- scripts/check_benchmark_regression.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py index 13bb56a..88dd251 100644 --- a/scripts/check_benchmark_regression.py +++ b/scripts/check_benchmark_regression.py @@ -75,9 +75,7 @@ def load_baseline_means(baselines_path: str | Path) -> dict[str, float]: for name, mean in value.items(): name = str(name) if name in means: - raise BenchmarkDataError( - f"{path} duplicate benchmark name {name!r} across groups" - ) + raise BenchmarkDataError(f"{path} duplicate benchmark name {name!r} across groups") try: means[name] = float(mean) except (TypeError, ValueError) as exc: From bdce5ae32fd3962b0d7d3e7e10c84e527a0598d7 Mon Sep 17 00:00:00 2001 From: chen Date: Thu, 18 Jun 2026 07:40:46 +0800 Subject: [PATCH 09/10] fix: enforce EXCLUDED_FROM_GATE at check time and harden file reads --- scripts/check_benchmark_regression.py | 7 ++++ tests/test_check_benchmark_regression.py | 45 ++++++++++++++++++------ 2 files changed, 42 insertions(+), 10 deletions(-) diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py index 88dd251..74365d2 100644 --- a/scripts/check_benchmark_regression.py +++ b/scripts/check_benchmark_regression.py @@ -26,6 +26,8 @@ def load_results(results_path: str | Path) -> dict[str, float]: path = Path(results_path) try: data = json.loads(path.read_text(encoding="utf-8")) + except OSError as exc: + raise BenchmarkDataError(f"cannot read {path}: {exc}") from exc except json.JSONDecodeError as exc: raise BenchmarkDataError(f"invalid JSON in {path}: {exc}") from exc try: @@ -57,6 +59,7 @@ def load_baseline_means(baselines_path: str | Path) -> dict[str, float]: path = Path(baselines_path) try: data = json.loads(path.read_text(encoding="utf-8")) + except OSError as exc: except json.JSONDecodeError as exc: raise BenchmarkDataError(f"invalid JSON in {path}: {exc}") from exc if not isinstance(data, dict): @@ -97,6 +100,8 @@ def check_regression( failures: list[str] = [] for name, base in baseline_means.items(): + if name in EXCLUDED_FROM_GATE: + continue cur = flat.get(name) if cur is None: print(f"WARN: no current result for baseline {name!r}; skipping") @@ -111,6 +116,8 @@ def check_regression( failures.append(name) for name in flat: + if name in EXCLUDED_FROM_GATE: + continue if name not in baseline_means: print(f"WARN: {name!r} has no baseline yet; not gated") diff --git a/tests/test_check_benchmark_regression.py b/tests/test_check_benchmark_regression.py index 98fb2dd..49c4f49 100644 --- a/tests/test_check_benchmark_regression.py +++ b/tests/test_check_benchmark_regression.py @@ -13,6 +13,8 @@ load_results, ) +GATED_BENCH = "test_parse_session_medium" + def _write_results(path, benchmarks: list[dict]) -> None: path.write_text( @@ -55,11 +57,11 @@ def test_regression_over_threshold_fails(tmp_path, capsys: pytest.CaptureFixture baselines = tmp_path / "baselines.json" _write_results( results, - [{"name": "test_parse_session_small", "stats": {"mean": 0.0002}}], + [{"name": GATED_BENCH, "stats": {"mean": 0.0025}}], ) _write_baselines( baselines, - {"parse": {"test_parse_session_small": 0.0001}}, + {"parse": {GATED_BENCH: 0.002}}, ) assert check_regression(results, baselines) == 1 @@ -72,11 +74,11 @@ def test_within_threshold_passes(tmp_path) -> None: baselines = tmp_path / "baselines.json" _write_results( results, - [{"name": "test_parse_session_small", "stats": {"mean": 0.00011}}], + [{"name": GATED_BENCH, "stats": {"mean": 0.0022}}], ) _write_baselines( baselines, - {"parse": {"test_parse_session_small": 0.0001}}, + {"parse": {GATED_BENCH: 0.002}}, ) assert check_regression(results, baselines) == 0 @@ -96,20 +98,25 @@ def test_load_results_requires_benchmarks_array(tmp_path) -> None: load_results(path) +def test_load_results_rejects_missing_file(tmp_path) -> None: + with pytest.raises(BenchmarkDataError, match="cannot read"): + load_results(tmp_path / "missing.json") + + def test_zero_baseline_skips_ratio_check(tmp_path, capsys: pytest.CaptureFixture[str]) -> None: results = tmp_path / "results.json" baselines = tmp_path / "baselines.json" _write_results( results, - [{"name": "test_parse_session_small", "stats": {"mean": 0.0002}}], + [{"name": GATED_BENCH, "stats": {"mean": 0.0025}}], ) _write_baselines( baselines, - {"parse": {"test_parse_session_small": 0.0}}, + {"parse": {GATED_BENCH: 0.0}}, ) assert check_regression(results, baselines) == 0 - assert "baseline for 'test_parse_session_small' is zero" in capsys.readouterr().out + assert f"baseline for '{GATED_BENCH}' is zero" in capsys.readouterr().out def test_exactly_at_threshold_passes(tmp_path) -> None: @@ -117,7 +124,24 @@ def test_exactly_at_threshold_passes(tmp_path) -> None: baselines = tmp_path / "baselines.json" _write_results( results, - [{"name": "test_parse_session_small", "stats": {"mean": 0.00012}}], + [{"name": GATED_BENCH, "stats": {"mean": 0.0024}}], + ) + _write_baselines( + baselines, + {"parse": {GATED_BENCH: 0.002}}, + ) + + assert check_regression(results, baselines) == 0 + + +def test_excluded_benchmark_in_baselines_is_not_gated( + tmp_path, capsys: pytest.CaptureFixture[str] +) -> None: + results = tmp_path / "results.json" + baselines = tmp_path / "baselines.json" + _write_results( + results, + [{"name": "test_parse_session_small", "stats": {"mean": 0.001}}], ) _write_baselines( baselines, @@ -125,6 +149,7 @@ def test_exactly_at_threshold_passes(tmp_path) -> None: ) assert check_regression(results, baselines) == 0 + assert "REGRESSION" not in capsys.readouterr().out def test_missing_current_result_warns_without_failing( @@ -135,7 +160,7 @@ def test_missing_current_result_warns_without_failing( _write_results(results, []) _write_baselines( baselines, - {"parse": {"test_parse_session_small": 0.0001}}, + {"parse": {GATED_BENCH: 0.002}}, ) assert check_regression(results, baselines) == 0 @@ -148,7 +173,7 @@ def test_main_reports_benchmark_data_error(tmp_path, capsys: pytest.CaptureFixtu bad = tmp_path / "bad.json" bad.write_text("{}", encoding="utf-8") baselines = tmp_path / "baselines.json" - _write_baselines(baselines, {"parse": {"test_parse_session_small": 0.0001}}) + _write_baselines(baselines, {"parse": {GATED_BENCH: 0.002}}) assert main([str(bad), str(baselines)]) == 2 assert "ERROR:" in capsys.readouterr().err From a79866ffb9eb833de27004d9af751884b177ee9e Mon Sep 17 00:00:00 2001 From: chen Date: Thu, 18 Jun 2026 07:41:11 +0800 Subject: [PATCH 10/10] fix: enforce EXCLUDED_FROM_GATE at check time and harden file reads --- scripts/check_benchmark_regression.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py index 74365d2..7842021 100644 --- a/scripts/check_benchmark_regression.py +++ b/scripts/check_benchmark_regression.py @@ -60,6 +60,7 @@ def load_baseline_means(baselines_path: str | Path) -> dict[str, float]: try: data = json.loads(path.read_text(encoding="utf-8")) except OSError as exc: + raise BenchmarkDataError(f"cannot read {path}: {exc}") from exc except json.JSONDecodeError as exc: raise BenchmarkDataError(f"invalid JSON in {path}: {exc}") from exc if not isinstance(data, dict):