diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 60c75bb..6611ad2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -207,7 +207,7 @@ jobs: - run: npm run test:coverage benchmarks: - name: Performance benchmarks (informational) + name: Performance benchmarks (gated) runs-on: ubuntu-latest permissions: contents: read @@ -235,7 +235,11 @@ jobs: --benchmark-columns=min,max,mean,stddev,rounds -o addopts= + - name: Regression gate + run: python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json + - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4 + if: always() with: name: benchmark-results path: benchmark-results.json diff --git a/.gitignore b/.gitignore index 8707cd8..27a84ea 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,5 @@ node_modules/ .coverage coverage/ coverage.xml +benchmark-results.json +benchmarks/_raw.json diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..b14f3d6 --- /dev/null +++ b/Makefile @@ -0,0 +1,12 @@ +.PHONY: update-baselines check-benchmarks clean-benchmark-artifacts + +update-baselines: + pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts= + python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json + +check-benchmarks: + pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmark-results.json -o addopts= + python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json + +clean-benchmark-artifacts: + rm -f benchmarks/_raw.json benchmark-results.json diff --git a/benchmarks/README.md b/benchmarks/README.md index 75ff9cc..59f70fc 100644 --- a/benchmarks/README.md +++ b/benchmarks/README.md @@ -1,6 +1,6 @@ # Performance benchmarks -Test files live under `tests/benchmarks/`; this directory holds only documentation and the informational `baselines.json` snapshot. 
+Test files live under `tests/benchmarks/`; this directory holds documentation and `baselines.json` for the CI regression gate. Repeatable local measurements for parse, bulk export, and search hot paths. @@ -26,6 +26,7 @@ The memory test also runs as part of the normal `pytest` suite (timing benchmark | parse | `parse_session` on 10 / 500 / 5000+ line JSONL | | export | `run_bulk_export` over 10 / 50 / 100 sessions | | search | `GET /api/search` over a 50-session synthetic corpus | +| cache | cold vs warm `get_cached_session` (informational; not gated) | Large JSONL files (5000+ lines) are generated at test session scope under pytest's temp directory — not committed to git. @@ -33,10 +34,27 @@ Corpora repeat one row from `tests/fixtures/session_with_tools.jsonl`, so parse/ The memory test (`test_parse_memory.py`) is intentionally **not** skipped by `--benchmark-skip`; it runs in the main `pytest` job and builds the session-scoped 5000-line fixture once per session. -## CI +## CI gate -The `benchmarks` workflow job uploads `benchmark-results.json` as a downloadable artifact. There is no regression gate yet. +The `benchmarks` job on **ubuntu-latest** runs pytest-benchmark (`--benchmark-json=benchmark-results.json`), then `scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json`. CI fails when any **gated** benchmark mean exceeds its baseline by more than **20%**. + +**Gated:** parse medium/large, export 10/50/100 sessions. + +**Not gated (informational only):** `test_parse_session_small`, `test_search_full_corpus` (sub-ms CI noise), and the `cache` group. Benchmarks without a baseline entry print a warning and do not fail the gate. ## Refresh baselines -After intentional performance work, copy key means from a local run into `baselines.json` with a date and machine note. This file is informational only; CI does not compare against it. 
+After intentional performance work, capture on **ubuntu-latest** (same OS as the gated CI job):
+
+```bash
+make update-baselines
+```
+
+Or manually:
+
+```bash
+pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts=
+python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json
+```
+
+Baselines must be captured on **ubuntu-latest** to match the gated CI runner. Cross-OS variance causes spurious failures. Download `benchmark-results.json` from a CI artifact to seed baselines if needed.
diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json
index 123a2b4..813a72f 100644
--- a/benchmarks/baselines.json
+++ b/benchmarks/baselines.json
@@ -1,10 +1,17 @@
 {
-  "_note": "Informational snapshot only — CI does not gate on these values.",
-  "updated": null,
-  "machine": null,
+  "_note": "Gated means from ubuntu-latest CI benchmark-results.json (post-cache PR #90). Excluded from gate: test_parse_session_small, test_search_full_corpus (sub-ms CI noise). Refresh via make update-baselines on ubuntu.",
+  "updated": "2026-06-17T21:00:00Z",
+  "machine": "Linux",
   "groups": {
-    "parse": {},
-    "export": {},
+    "parse": {
+      "test_parse_session_medium": 0.002956,
+      "test_parse_session_large": 0.029678
+    },
+    "export": {
+      "test_bulk_export_session_count[sessions-10]": 0.004278,
+      "test_bulk_export_session_count[sessions-50]": 0.021144,
+      "test_bulk_export_session_count[sessions-100]": 0.042003
+    },
     "search": {}
   }
 }
diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py
new file mode 100644
index 0000000..7842021
--- /dev/null
+++ b/scripts/check_benchmark_regression.py
@@ -0,0 +1,241 @@
+"""Compare pytest-benchmark JSON output against stored baselines."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import sys
+from pathlib import Path
+
+THRESHOLD = 1.20
+
+# Sub-ms timings are too noisy for a fixed 20% gate on ubuntu CI.
+EXCLUDED_FROM_GATE = frozenset(
+    {
+        "test_parse_session_small",
+        "test_search_full_corpus",
+    }
+)
+
+
+class BenchmarkDataError(ValueError):
+    """Raised when benchmark JSON input is malformed or missing required fields."""
+
+
+def _read_json(path: Path) -> object:
+    """Return the decoded JSON document stored at *path*.
+
+    Raises:
+        BenchmarkDataError: if the file cannot be read, is not valid UTF-8,
+            or does not contain valid JSON.
+    """
+    try:
+        text = path.read_text(encoding="utf-8")
+    except OSError as exc:
+        raise BenchmarkDataError(f"cannot read {path}: {exc}") from exc
+    except UnicodeDecodeError as exc:
+        # A ValueError, not an OSError: without this clause a non-UTF-8 file
+        # escapes main() as a traceback instead of the exit-code-2 error path.
+        raise BenchmarkDataError(f"cannot read {path}: not valid UTF-8 ({exc})") from exc
+    try:
+        return json.loads(text)
+    except json.JSONDecodeError as exc:
+        raise BenchmarkDataError(f"invalid JSON in {path}: {exc}") from exc
+
+
+def _parse_mean(value: object) -> float:
+    """Convert *value* to a finite, non-negative float.
+
+    ``json.loads`` accepts the non-standard ``NaN``/``Infinity`` tokens, and a NaN
+    mean makes every ``ratio > threshold`` comparison false, which would pass the
+    gate silently, so such values are rejected here.
+
+    Raises:
+        TypeError: if *value* cannot be converted to ``float``.
+        ValueError: if *value* is not numeric, not finite, or negative.
+    """
+    mean = float(value)  # type: ignore[arg-type]
+    if not math.isfinite(mean) or mean < 0:
+        raise ValueError(f"expected a finite, non-negative number, got {value!r}")
+    return mean
+
+
+def load_results(results_path: str | Path) -> dict[str, float]:
+    """Return ``{benchmark name: mean}`` read from pytest-benchmark JSON output.
+
+    Raises:
+        BenchmarkDataError: if the file is unreadable or not valid JSON, lacks a
+            top-level ``benchmarks`` array, or holds an entry that is not an
+            object, lacks ``name``/``stats.mean``, has an invalid mean, or
+            repeats an earlier name.
+    """
+    path = Path(results_path)
+    data = _read_json(path)
+    try:
+        benchmarks = data["benchmarks"]  # type: ignore[index]
+    except (KeyError, TypeError) as exc:
+        raise BenchmarkDataError(f"{path} missing top-level 'benchmarks' array") from exc
+    if not isinstance(benchmarks, list):
+        raise BenchmarkDataError(f"{path} 'benchmarks' must be an array")
+
+    results: dict[str, float] = {}
+    for index, entry in enumerate(benchmarks):
+        if not isinstance(entry, dict):
+            raise BenchmarkDataError(f"{path} benchmarks[{index}] must be an object")
+        try:
+            name = str(entry["name"])
+            raw_mean = entry["stats"]["mean"]
+        except (KeyError, TypeError) as exc:
+            raise BenchmarkDataError(
+                f"{path} benchmarks[{index}] missing 'name' or 'stats.mean'"
+            ) from exc
+        try:
+            mean = _parse_mean(raw_mean)
+        except (TypeError, ValueError) as exc:
+            raise BenchmarkDataError(
+                f"{path} benchmarks[{index}] ({name!r}) has invalid 'stats.mean': {exc}"
+            ) from exc
+        if name in results:
+            raise BenchmarkDataError(f"{path} duplicate benchmark name {name!r}")
+        results[name] = mean
+    return results
+
+
+def load_baseline_means(baselines_path: str | Path) -> dict[str, float]:
+    """Return ``{benchmark name: baseline mean}`` flattened across all groups.
+
+    Raises:
+        BenchmarkDataError: if the file is unreadable or not valid JSON, its root
+            or ``groups`` value is not an object, a mean is invalid, or a
+            benchmark name appears in more than one group.
+    """
+    path = Path(baselines_path)
+    data = _read_json(path)
+    if not isinstance(data, dict):
+        raise BenchmarkDataError(f"{path} root value must be an object")
+    if "groups" not in data:
+        raise BenchmarkDataError(f"{path} missing required 'groups' key")
+    groups = data["groups"]
+    if not isinstance(groups, dict):
+        raise BenchmarkDataError(f"{path} 'groups' must be an object")
+
+    means: dict[str, float] = {}
+    for group_name, value in groups.items():
+        if not isinstance(value, dict):
+            # Still tolerated, but no longer silent: a malformed group would
+            # otherwise drop all of its benchmarks from the gate unnoticed.
+            print(f"WARN: baseline group {group_name!r} is not an object; skipping")
+            continue
+        for name, mean in value.items():
+            name = str(name)
+            if name in means:
+                raise BenchmarkDataError(f"{path} duplicate benchmark name {name!r} across groups")
+            try:
+                means[name] = _parse_mean(mean)
+            except (TypeError, ValueError) as exc:
+                raise BenchmarkDataError(
+                    f"{path} groups[{group_name!r}][{name!r}] is not a valid mean: {exc}"
+                ) from exc
+    return means
+
+
+def check_regression(
+    results_path: str | Path,
+    baselines_path: str | Path,
+    *,
+    threshold: float = THRESHOLD,
+) -> int:
+    """Compare current means with baselines, printing one line per benchmark.
+
+    Benchmarks named in ``EXCLUDED_FROM_GATE`` are never compared. A baseline with
+    no current result, a zero baseline, and a current result with no baseline
+    each print a ``WARN`` line and do not fail the gate.
+
+    Args:
+        results_path: pytest-benchmark ``--benchmark-json`` output.
+        baselines_path: baselines file with a top-level ``groups`` object.
+        threshold: largest allowed ``current / baseline`` ratio.
+
+    Returns:
+        0 when every compared benchmark is within *threshold*; 1 when any
+        ratio exceeds it.
+
+    Raises:
+        BenchmarkDataError: if either input file is unreadable or malformed.
+    """
+    flat = load_results(results_path)
+    baseline_means = load_baseline_means(baselines_path)
+
+    failures: list[str] = []
+    for name, base in baseline_means.items():
+        if name in EXCLUDED_FROM_GATE:
+            continue
+        cur = flat.get(name)
+        if cur is None:
+            print(f"WARN: no current result for baseline {name!r}; skipping")
+            continue
+        if base == 0:
+            print(f"WARN: baseline for {name!r} is zero; skipping ratio check")
+            continue
+        ratio = cur / base
+        regressed = ratio > threshold
+        tag = "FAIL" if regressed else "ok"
+        print(f"[{tag}] {name}: {cur:.6f}s vs {base:.6f}s ({ratio:.2f}x)")
+        if regressed:
+            failures.append(name)
+
+    for name in flat:
+        if name in EXCLUDED_FROM_GATE:
+            continue
+        if name not in baseline_means:
+            print(f"WARN: {name!r} has no baseline yet; not gated")
+
+    if failures:
+        print(f"\nREGRESSION: {len(failures)} benchmark(s) exceeded {threshold:.0%}")
+        return 1
+    return 0
+
+
+def _threshold_ratio(value: str) -> float:
+    """argparse type for ``--threshold``: a finite ratio greater than zero.
+
+    A NaN threshold makes ``ratio > threshold`` always false, so the gate
+    could never fail; NaN, infinite and non-positive values are rejected.
+    """
+    parsed = float(value)
+    if not math.isfinite(parsed) or parsed <= 0:
+        raise argparse.ArgumentTypeError("threshold must be a finite ratio greater than zero")
+    return parsed
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Run the gate from the command line.
+
+    Returns:
+        0 when the gate passes, 1 on a regression, 2 when an input file is
+        unreadable or malformed.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("results_path", help="pytest-benchmark --benchmark-json output")
+    parser.add_argument("baselines_path", help="path to benchmarks/baselines.json")
+    parser.add_argument(
+        "--threshold",
+        type=_threshold_ratio,
+        default=THRESHOLD,
+        help="fail when current mean exceeds baseline by more than this ratio (default: 1.20)",
+    )
+    args = parser.parse_args(argv)
+    try:
+        return check_regression(
+            args.results_path,
+            args.baselines_path,
+            threshold=args.threshold,
+        )
+    except BenchmarkDataError as exc:
+        print(f"ERROR: {exc}", file=sys.stderr)
+        return 2
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/scripts/reduce_baselines.py b/scripts/reduce_baselines.py
new file mode 100644
index 0000000..88be0eb
--- /dev/null
+++ b/scripts/reduce_baselines.py
@@ -0,0 +1,145 @@
+"""Reduce pytest-benchmark JSON into benchmarks/baselines.json."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import math
+import sys
+from datetime import UTC, datetime
+from pathlib import Path
+
+try:
+    from scripts.check_benchmark_regression import EXCLUDED_FROM_GATE, BenchmarkDataError
+except ModuleNotFoundError:
+    from check_benchmark_regression import EXCLUDED_FROM_GATE, BenchmarkDataError
+
+GATED_GROUPS = ("parse", "export", "search")
+
+
+def _positive_float(value: str) -> float:
+    """argparse type for ``--slack``: a finite float greater than zero."""
+    parsed = float(value)
+    # NaN fails every comparison, so ``parsed <= 0`` alone would let it through.
+    if not math.isfinite(parsed) or parsed <= 0:
+        raise argparse.ArgumentTypeError("slack must be a finite number greater than zero")
+    return parsed
+
+
+def reduce_baselines(
+    raw_path: str | Path,
+    out_path: str | Path,
+    *,
+    slack: float = 1.0,
+) -> dict[str, object]:
+    """Write the gated benchmark means from *raw_path* to *out_path*.
+
+    Only entries whose ``group`` is in ``GATED_GROUPS`` and whose name is not in
+    ``EXCLUDED_FROM_GATE`` are kept; each kept mean is multiplied by *slack*.
+
+    Args:
+        raw_path: pytest-benchmark ``--benchmark-json`` output.
+        out_path: destination baselines file; overwritten if it exists.
+        slack: factor applied to every kept mean.
+
+    Returns:
+        The document written to *out_path*.
+
+    Raises:
+        BenchmarkDataError: if *raw_path* is unreadable or malformed, a kept
+            benchmark has an invalid mean or a repeated name, or *out_path*
+            cannot be written.
+    """
+    path = Path(raw_path)
+    try:
+        raw = json.loads(path.read_text(encoding="utf-8"))
+    except json.JSONDecodeError as exc:
+        raise BenchmarkDataError(f"invalid JSON in {path}: {exc}") from exc
+    except UnicodeDecodeError as exc:
+        # A ValueError, not an OSError, so it needs its own clause.
+        raise BenchmarkDataError(f"cannot read {path}: not valid UTF-8 ({exc})") from exc
+    except OSError as exc:
+        raise BenchmarkDataError(f"cannot read {path}: {exc}") from exc
+
+    try:
+        entries = raw["benchmarks"]
+    except (KeyError, TypeError) as exc:
+        raise BenchmarkDataError(f"{path} missing top-level 'benchmarks' array") from exc
+    if not isinstance(entries, list):
+        raise BenchmarkDataError(f"{path} 'benchmarks' must be an array")
+
+    groups: dict[str, dict[str, float]] = {group: {} for group in GATED_GROUPS}
+    kept: set[str] = set()
+    for index, entry in enumerate(entries):
+        if not isinstance(entry, dict):
+            raise BenchmarkDataError(f"{path} benchmarks[{index}] must be an object")
+        try:
+            name = str(entry["name"])
+            mean = float(entry["stats"]["mean"])
+        except (KeyError, TypeError, ValueError) as exc:
+            raise BenchmarkDataError(
+                f"{path} benchmarks[{index}] missing 'name' or 'stats.mean'"
+            ) from exc
+        group = entry.get("group")
+        if group not in GATED_GROUPS or name in EXCLUDED_FROM_GATE:
+            continue
+        if not math.isfinite(mean) or mean < 0:
+            raise BenchmarkDataError(
+                f"{path} benchmarks[{index}] ({name!r}) has invalid mean {mean!r}"
+            )
+        if name in kept:
+            # The gate flattens all groups into one namespace and rejects
+            # duplicates, so never write a baselines file it would refuse.
+            raise BenchmarkDataError(f"{path} duplicate benchmark name {name!r}")
+        kept.add(name)
+        groups[group][name] = mean * slack
+
+    machine_info = raw.get("machine_info")
+    machine = machine_info.get("system") if isinstance(machine_info, dict) else None
+    if machine != "Linux":
+        # The gated CI job runs on ubuntu-latest; baselines from another OS cause
+        # spurious failures, so flag them at capture time.
+        print(
+            f"WARN: baselines captured on {machine!r}, but the CI gate runs on Linux",
+            file=sys.stderr,
+        )
+    excluded = ", ".join(sorted(EXCLUDED_FROM_GATE))
+    output: dict[str, object] = {
+        "_note": (
+            f"Gated means from a pytest-benchmark run on {machine or 'an unknown system'}. "
+            f"Excluded from gate: {excluded} (CI noise)."
+        ),
+        "updated": datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ"),
+        "machine": machine,
+        "groups": groups,
+    }
+    out = Path(out_path)
+    try:
+        out.write_text(json.dumps(output, indent=2) + "\n", encoding="utf-8")
+    except OSError as exc:
+        raise BenchmarkDataError(f"cannot write {out}: {exc}") from exc
+    return output
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Run the reducer from the command line; return 0 on success, 2 on bad input."""
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("raw_path", help="pytest-benchmark --benchmark-json output")
+    parser.add_argument("out_path", help="destination baselines.json path")
+    parser.add_argument(
+        "--slack",
+        type=_positive_float,
+        default=1.0,
+        help="multiply means by this factor (must be > 0)",
+    )
+    args = parser.parse_args(argv)
+    try:
+        reduce_baselines(args.raw_path, args.out_path, slack=args.slack)
+    except BenchmarkDataError as exc:
+        print(f"ERROR: {exc}", file=sys.stderr)
+        return 2
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/test_check_benchmark_regression.py b/tests/test_check_benchmark_regression.py
new file mode 100644
index 0000000..49c4f49
--- /dev/null
+++ b/tests/test_check_benchmark_regression.py
@@ -0,0 +1,207 @@
+"""Tests for scripts/check_benchmark_regression.py."""
+
+from __future__ import annotations
+
+import json
+
+import pytest
+
+from scripts.check_benchmark_regression import (
+    BenchmarkDataError,
+    check_regression,
+    load_baseline_means,
+    load_results,
+)
+
+GATED_BENCH = "test_parse_session_medium"
+
+
+def _write_results(path, benchmarks: list[dict]) -> None:
+    """Write *benchmarks* to *path* under a top-level ``benchmarks`` key."""
+    path.write_text(
+        json.dumps({"benchmarks": benchmarks}, indent=2),
+        encoding="utf-8",
+    )
+
+
+def _write_baselines(path, groups: dict[str, dict[str, float]]) -> None:
+    """Write *groups* to *path* under a top-level ``groups`` key."""
+    path.write_text(
+        json.dumps({"groups": groups}, indent=2),
+        encoding="utf-8",
+    )
+
+
+def test_missing_baseline_warns_without_failing(
+    tmp_path, capsys: pytest.CaptureFixture[str]
+) -> None:
+    """A result with no baseline entry prints a WARN line and the gate returns 0."""
+    results = tmp_path / "results.json"
+    baselines = tmp_path / "baselines.json"
+    _write_results(
+        results,
+        [
+            {"name": "test_new_bench", "stats": {"mean": 0.01}},
+            {"name": "test_parse_session_small", "stats": {"mean": 0.0001}},
+        ],
+    )
+    _write_baselines(
+        baselines,
+        {"parse": {"test_parse_session_small": 0.0001}},
+    )
+
+    assert check_regression(results, baselines) == 0
+    out = capsys.readouterr().out
+    assert "WARN: 'test_new_bench' has no baseline yet" in out
+
+
+def test_regression_over_threshold_fails(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
+    """A 0.0025 mean against a 0.002 baseline returns 1 and prints REGRESSION."""
+    results = tmp_path / "results.json"
+    baselines = tmp_path / "baselines.json"
+    _write_results(
+        results,
+        [{"name": GATED_BENCH, "stats": {"mean": 0.0025}}],
+    )
+    _write_baselines(
+        baselines,
+        {"parse": {GATED_BENCH: 0.002}},
+    )
+
+    assert check_regression(results, baselines) == 1
+    out = capsys.readouterr().out
+    assert "REGRESSION" in out
+
+
+def test_within_threshold_passes(tmp_path) -> None:
+    """A 0.0022 mean against a 0.002 baseline returns 0."""
+    results = tmp_path / "results.json"
+    baselines = tmp_path / "baselines.json"
+    _write_results(
+        results,
+        [{"name": GATED_BENCH, "stats": {"mean": 0.0022}}],
+    )
+    _write_baselines(
+        baselines,
+        {"parse": {GATED_BENCH: 0.002}},
+    )
+
+    assert check_regression(results, baselines) == 0
+
+
+def test_load_results_rejects_malformed_json(tmp_path) -> None:
+    """Unparseable JSON raises BenchmarkDataError mentioning 'invalid JSON'."""
+    path = tmp_path / "bad.json"
+    path.write_text("{not json", encoding="utf-8")
+    with pytest.raises(BenchmarkDataError, match="invalid JSON"):
+        load_results(path)
+
+
+def test_load_results_requires_benchmarks_array(tmp_path) -> None:
+    """A document without a ``benchmarks`` key raises BenchmarkDataError."""
+    path = tmp_path / "results.json"
+    path.write_text("{}", encoding="utf-8")
+    with pytest.raises(BenchmarkDataError, match="'benchmarks' array"):
+        load_results(path)
+
+
+def test_load_results_rejects_missing_file(tmp_path) -> None:
+    """A nonexistent results path raises BenchmarkDataError mentioning 'cannot read'."""
+    with pytest.raises(BenchmarkDataError, match="cannot read"):
+        load_results(tmp_path / "missing.json")
+
+
+def test_zero_baseline_skips_ratio_check(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
+    """A zero baseline prints a warning and the gate returns 0."""
+    results = tmp_path / "results.json"
+    baselines = tmp_path / "baselines.json"
+    _write_results(
+        results,
+        [{"name": GATED_BENCH, "stats": {"mean": 0.0025}}],
+    )
+    _write_baselines(
+        baselines,
+        {"parse": {GATED_BENCH: 0.0}},
+    )
+
+    assert check_regression(results, baselines) == 0
+    assert f"baseline for '{GATED_BENCH}' is zero" in capsys.readouterr().out
+
+
+def test_exactly_at_threshold_passes(tmp_path) -> None:
+    """A 0.0024 mean against a 0.002 baseline returns 0 under the default threshold."""
+    results = tmp_path / "results.json"
+    baselines = tmp_path / "baselines.json"
+    _write_results(
+        results,
+        [{"name": GATED_BENCH, "stats": {"mean": 0.0024}}],
+    )
+    _write_baselines(
+        baselines,
+        {"parse": {GATED_BENCH: 0.002}},
+    )
+
+    assert check_regression(results, baselines) == 0
+
+
+def test_excluded_benchmark_in_baselines_is_not_gated(
+    tmp_path, capsys: pytest.CaptureFixture[str]
+) -> None:
+    """``test_parse_session_small`` at 10x its baseline does not fail the gate."""
+    results = tmp_path / "results.json"
+    baselines = tmp_path / "baselines.json"
+    _write_results(
+        results,
+        [{"name": "test_parse_session_small", "stats": {"mean": 0.001}}],
+    )
+    _write_baselines(
+        baselines,
+        {"parse": {"test_parse_session_small": 0.0001}},
+    )
+
+    assert check_regression(results, baselines) == 0
+    assert "REGRESSION" not in capsys.readouterr().out
+
+
+def test_missing_current_result_warns_without_failing(
+    tmp_path, capsys: pytest.CaptureFixture[str]
+) -> None:
+    """A baseline with no current result prints a warning and the gate returns 0."""
+    results = tmp_path / "results.json"
+    baselines = tmp_path / "baselines.json"
+    _write_results(results, [])
+    _write_baselines(
+        baselines,
+        {"parse": {GATED_BENCH: 0.002}},
+    )
+
+    assert check_regression(results, baselines) == 0
+    assert "no current result for baseline" in capsys.readouterr().out
+
+
+def test_main_reports_benchmark_data_error(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
+    """``main`` returns 2 and writes ``ERROR:`` to stderr for a malformed results file."""
+    from scripts.check_benchmark_regression import main
+
+    bad = tmp_path / "bad.json"
+    bad.write_text("{}", encoding="utf-8")
+    baselines = tmp_path / "baselines.json"
+    _write_baselines(baselines, {"parse": {GATED_BENCH: 0.002}})
+
+    assert main([str(bad), str(baselines)]) == 2
+    assert "ERROR:" in capsys.readouterr().err
+
+
+def test_duplicate_baseline_name_raises(tmp_path) -> None:
+    """The same benchmark name in two groups raises BenchmarkDataError."""
+    baselines = tmp_path / "baselines.json"
+    _write_baselines(
+        baselines,
+        {
+            "parse": {"test_parse_session_medium": 0.002},
+            "export": {"test_parse_session_medium": 0.003},
+        },
+    )
+
+    with pytest.raises(BenchmarkDataError, match="duplicate benchmark name"):
+        load_baseline_means(baselines)
diff --git a/tests/test_reduce_baselines.py b/tests/test_reduce_baselines.py
new file mode 100644
index 0000000..8919b84
--- /dev/null
+++ b/tests/test_reduce_baselines.py
@@ -0,0 +1,108 @@
+"""Tests for scripts/reduce_baselines.py."""
+
+from __future__ import annotations
+
+import json
+
+import pytest
+
+from scripts.check_benchmark_regression import BenchmarkDataError
+from scripts.reduce_baselines import reduce_baselines
+
+
+def _write_raw(path, benchmarks: list[dict], *, machine: str = "Linux") -> None:
+    """Write *benchmarks* to *path* with ``machine_info.system`` set to *machine*."""
+    path.write_text(
+        json.dumps(
+            {
+                "machine_info": {"system": machine},
+                "benchmarks": benchmarks,
+            },
+            indent=2,
+        ),
+        encoding="utf-8",
+    )
+
+
+def test_reduce_baselines_writes_gated_groups_only(tmp_path) -> None:
+    """An excluded benchmark and the ``cache`` group are left out of the output."""
+    raw = tmp_path / "raw.json"
+    out = tmp_path / "baselines.json"
+    _write_raw(
+        raw,
+        [
+            {"group": "parse", "name": "test_parse_session_medium", "stats": {"mean": 0.002}},
+            {"group": "parse", "name": "test_parse_session_small", "stats": {"mean": 0.0001}},
+            {"group": "cache", "name": "test_cache_warm_hit", "stats": {"mean": 1e-05}},
+        ],
+    )
+
+    output = reduce_baselines(raw, out)
+
+    assert output["machine"] == "Linux"
+    assert "test_parse_session_medium" in output["groups"]["parse"]
+    assert "test_parse_session_small" not in output["groups"]["parse"]
+    assert "cache" not in output["groups"]
+
+
+def test_reduce_baselines_applies_slack(tmp_path) -> None:
+    """The written mean is the raw mean multiplied by ``slack``."""
+    raw = tmp_path / "raw.json"
+    out = tmp_path / "baselines.json"
+    _write_raw(
+        raw,
+        [{"group": "parse", "name": "test_parse_session_medium", "stats": {"mean": 0.002}}],
+    )
+
+    reduce_baselines(raw, out, slack=1.5)
+    data = json.loads(out.read_text(encoding="utf-8"))
+
+    assert data["groups"]["parse"]["test_parse_session_medium"] == pytest.approx(0.003)
+
+
+def test_reduce_baselines_rejects_missing_benchmarks_key(tmp_path) -> None:
+    """A document without a ``benchmarks`` key raises BenchmarkDataError."""
+    raw = tmp_path / "raw.json"
+    raw.write_text("{}", encoding="utf-8")
+
+    with pytest.raises(BenchmarkDataError, match="'benchmarks' array"):
+        reduce_baselines(raw, tmp_path / "out.json")
+
+
+def test_reduce_baselines_cli_rejects_non_positive_slack(tmp_path) -> None:
+    """``--slack 0`` makes ``main`` exit with status 2."""
+    from scripts.reduce_baselines import main
+
+    raw = tmp_path / "raw.json"
+    _write_raw(
+        raw,
+        [{"group": "parse", "name": "test_parse_session_small", "stats": {"mean": 0.0001}}],
+    )
+
+    with pytest.raises(SystemExit) as exc_info:
+        main([str(raw), str(tmp_path / "out.json"), "--slack", "0"])
+    assert exc_info.value.code == 2
+
+
+def test_reduce_baselines_machine_info_non_dict(tmp_path) -> None:
+    """A non-object ``machine_info`` yields ``machine`` of None rather than an error."""
+    raw = tmp_path / "raw.json"
+    raw.write_text(
+        json.dumps(
+            {
+                "machine_info": "not-a-dict",
+                "benchmarks": [
+                    {
+                        "group": "parse",
+                        "name": "test_parse_session_medium",
+                        "stats": {"mean": 0.002},
+                    }
+                ],
+            }
+        ),
+        encoding="utf-8",
+    )
+
+    output = reduce_baselines(raw, tmp_path / "out.json")
+
+    assert output["machine"] is None