From 12bf70cf6dadd7b804edf4e27c189071c56e38fb Mon Sep 17 00:00:00 2001
From: chen <clean6378@gmail.com>
Date: Thu, 18 Jun 2026 04:37:05 +0800
Subject: [PATCH 01/10] ci: populate baselines and gate benchmarks at +20%

Add check_benchmark_regression.py and reduce_baselines.py; populate
benchmarks/baselines.json from post-cache run; wire regression gate into
ubuntu benchmarks job; add Makefile update-baselines target and tests.
---
 .github/workflows/ci.yml                 |  5 +-
 .gitignore                               |  2 +
 Makefile                                 |  9 +++
 benchmarks/README.md                     | 22 +++++--
 benchmarks/baselines.json                | 22 +++++--
 scripts/check_benchmark_regression.py    | 73 ++++++++++++++++++++++
 scripts/reduce_baselines.py              | 56 +++++++++++++++++
 tests/test_check_benchmark_regression.py | 77 ++++++++++++++++++++++++
 8 files changed, 255 insertions(+), 11 deletions(-)
 create mode 100644 Makefile
 create mode 100644 scripts/check_benchmark_regression.py
 create mode 100644 scripts/reduce_baselines.py
 create mode 100644 tests/test_check_benchmark_regression.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 60c75bb..60242c4 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -207,7 +207,7 @@ jobs:
       - run: npm run test:coverage
 
   benchmarks:
-    name: Performance benchmarks (informational)
+    name: Performance benchmarks (gated)
     runs-on: ubuntu-latest
     permissions:
       contents: read
@@ -235,6 +235,9 @@ jobs:
           --benchmark-columns=min,max,mean,stddev,rounds
           -o addopts=
 
+      - name: Regression gate
+        run: python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json
+
       - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
         with:
           name: benchmark-results
diff --git a/.gitignore b/.gitignore
index 8707cd8..27a84ea 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,5 @@ node_modules/
 .coverage
 coverage/
 coverage.xml
+benchmark-results.json
+benchmarks/_raw.json
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..9df379a
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,9 @@
+.PHONY: update-baselines check-benchmarks
+
+update-baselines:
+	pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts=
+	python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json
+
+check-benchmarks:
+	pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmark-results.json -o addopts=
+	python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 75ff9cc..8519e97 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -1,6 +1,6 @@
 # Performance benchmarks
 
-Test files live under `tests/benchmarks/`; this directory holds only documentation and the informational `baselines.json` snapshot.
+Test files live under `tests/benchmarks/`; this directory holds documentation and `baselines.json` for the CI regression gate.
 
 Repeatable local measurements for parse, bulk export, and search hot paths.
 
@@ -26,6 +26,7 @@ The memory test also runs as part of the normal `pytest` suite (timing benchmark
 | parse | `parse_session` on 10 / 500 / 5000+ line JSONL |
 | export | `run_bulk_export` over 10 / 50 / 100 sessions |
 | search | `GET /api/search` over a 50-session synthetic corpus |
+| cache | cold vs warm `get_cached_session` (informational; not gated) |
 
 Large JSONL files (5000+ lines) are generated at test session scope under pytest's temp directory — not committed to git.
 
@@ -33,10 +34,23 @@ Corpora repeat one row from `tests/fixtures/session_with_tools.jsonl`, so parse/
 
 The memory test (`test_parse_memory.py`) is intentionally **not** skipped by `--benchmark-skip`; it runs in the main `pytest` job and builds the session-scoped 5000-line fixture once per session.
 
-## CI
+## CI gate
 
-The `benchmarks` workflow job uploads `benchmark-results.json` as a downloadable artifact. There is no regression gate yet.
+The `benchmarks` job on **ubuntu-latest** runs pytest-benchmark, then `scripts/check_benchmark_regression.py`. CI fails when any gated benchmark mean exceeds its baseline by more than **20%**. Benchmarks without a baseline entry (e.g. new `cache` group) print a warning and do not fail the gate.
 
 ## Refresh baselines
 
-After intentional performance work, copy key means from a local run into `baselines.json` with a date and machine note. This file is informational only; CI does not compare against it.
+After intentional performance work on ubuntu (same OS as CI):
+
+```bash
+make update-baselines
+```
+
+Or manually:
+
+```bash
+pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts=
+python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json
+```
+
+Use `--slack 1.25` on `reduce_baselines.py` when capturing on a faster host than CI to absorb cross-machine variance.
diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json
index 123a2b4..97db608 100644
--- a/benchmarks/baselines.json
+++ b/benchmarks/baselines.json
@@ -1,10 +1,20 @@
 {
-  "_note": "Informational snapshot only — CI does not gate on these values.",
-  "updated": null,
-  "machine": null,
+  "_note": "CI gates the ubuntu benchmarks job when mean exceeds baseline by >20%.",
+  "updated": "2026-06-17T20:33:56Z",
+  "machine": "Windows",
   "groups": {
-    "parse": {},
-    "export": {},
-    "search": {}
+    "parse": {
+      "test_parse_session_small": 8.711556942772768e-05,
+      "test_parse_session_medium": 0.0019041118800302193,
+      "test_parse_session_large": 0.018555900882518687
+    },
+    "export": {
+      "test_bulk_export_session_count[sessions-10]": 0.0034216649980517108,
+      "test_bulk_export_session_count[sessions-50]": 0.017290590000629893,
+      "test_bulk_export_session_count[sessions-100]": 0.03397804391996553
+    },
+    "search": {
+      "test_search_full_corpus": 0.002600985144447307
+    }
   }
 }
diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py
new file mode 100644
index 0000000..230a345
--- /dev/null
+++ b/scripts/check_benchmark_regression.py
@@ -0,0 +1,73 @@
+"""Compare pytest-benchmark JSON output against stored baselines."""
+
+from __future__ import annotations
+
+import json
+import sys
+from pathlib import Path
+
+THRESHOLD = 1.20
+
+
+def load_results(results_path: str | Path) -> dict[str, float]:
+    data = json.loads(Path(results_path).read_text(encoding="utf-8"))
+    return {entry["name"]: float(entry["stats"]["mean"]) for entry in data["benchmarks"]}
+
+
+def load_baseline_means(baselines_path: str | Path) -> dict[str, float]:
+    data = json.loads(Path(baselines_path).read_text(encoding="utf-8"))
+    groups = data.get("groups", data)
+    means: dict[str, float] = {}
+    for key, value in groups.items():
+        if not isinstance(value, dict):
+            continue
+        for name, mean in value.items():
+            means[name] = float(mean)
+    return means
+
+
+def check_regression(
+    results_path: str | Path,
+    baselines_path: str | Path,
+    *,
+    threshold: float = THRESHOLD,
+) -> int:
+    """Return 0 when within threshold; 1 when any gated benchmark regresses."""
+    flat = load_results(results_path)
+    baseline_means = load_baseline_means(baselines_path)
+
+    failures: list[str] = []
+    for name, base in baseline_means.items():
+        cur = flat.get(name)
+        if cur is None:
+            print(f"WARN: no current result for baseline {name!r}; skipping")
+            continue
+        ratio = cur / base
+        tag = "FAIL" if ratio > threshold else "ok"
+        print(f"[{tag}] {name}: {cur:.6f}s vs {base:.6f}s ({ratio:.2f}x)")
+        if ratio > threshold:
+            failures.append(name)
+
+    for name in flat:
+        if name not in baseline_means:
+            print(f"WARN: {name!r} has no baseline yet; not gated")
+
+    if failures:
+        print(f"\nREGRESSION: {len(failures)} benchmark(s) exceeded {threshold:.0%}")
+        return 1
+    return 0
+
+
+def main(argv: list[str] | None = None) -> int:
+    argv = sys.argv[1:] if argv is None else argv
+    if len(argv) != 2:
+        print(
+            "usage: check_benchmark_regression.py <results.json> <baselines.json>",
+            file=sys.stderr,
+        )
+        return 2
+    return check_regression(argv[0], argv[1])
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/scripts/reduce_baselines.py b/scripts/reduce_baselines.py
new file mode 100644
index 0000000..c8c7ec7
--- /dev/null
+++ b/scripts/reduce_baselines.py
@@ -0,0 +1,56 @@
+"""Reduce pytest-benchmark JSON into benchmarks/baselines.json."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from datetime import UTC, datetime
+from pathlib import Path
+
+GATED_GROUPS = ("parse", "export", "search")
+
+
+def reduce_baselines(
+    raw_path: str | Path,
+    out_path: str | Path,
+    *,
+    slack: float = 1.0,
+) -> dict[str, object]:
+    raw = json.loads(Path(raw_path).read_text(encoding="utf-8"))
+    groups: dict[str, dict[str, float]] = {group: {} for group in GATED_GROUPS}
+    for entry in raw["benchmarks"]:
+        group = entry.get("group")
+        if group not in GATED_GROUPS:
+            continue
+        groups[group][entry["name"]] = float(entry["stats"]["mean"]) * slack
+
+    machine_info = raw.get("machine_info", {})
+    output: dict[str, object] = {
+        "_note": "CI gates the ubuntu benchmarks job when mean exceeds baseline by >20%.",
+        "updated": datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ"),
+        "machine": machine_info.get("system"),
+        "groups": groups,
+    }
+    path = Path(out_path)
+    path.write_text(json.dumps(output, indent=2) + "\n", encoding="utf-8")
+    return output
+
+
+def main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("raw_path", help="pytest-benchmark --benchmark-json output")
+    parser.add_argument("out_path", help="destination baselines.json path")
+    parser.add_argument(
+        "--slack",
+        type=float,
+        default=1.0,
+        help="multiply means by this factor (e.g. 1.25 when capturing on a faster host)",
+    )
+    args = parser.parse_args(argv)
+    reduce_baselines(args.raw_path, args.out_path, slack=args.slack)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/test_check_benchmark_regression.py b/tests/test_check_benchmark_regression.py
new file mode 100644
index 0000000..d22edf0
--- /dev/null
+++ b/tests/test_check_benchmark_regression.py
@@ -0,0 +1,77 @@
+"""Tests for scripts/check_benchmark_regression.py."""
+
+from __future__ import annotations
+
+import json
+
+import pytest
+
+from scripts.check_benchmark_regression import check_regression
+
+
+def _write_results(path, benchmarks: list[dict]) -> None:
+    path.write_text(
+        json.dumps({"benchmarks": benchmarks}, indent=2),
+        encoding="utf-8",
+    )
+
+
+def _write_baselines(path, groups: dict[str, dict[str, float]]) -> None:
+    path.write_text(
+        json.dumps({"groups": groups}, indent=2),
+        encoding="utf-8",
+    )
+
+
+def test_missing_baseline_warns_without_failing(
+    tmp_path, capsys: pytest.CaptureFixture[str]
+) -> None:
+    results = tmp_path / "results.json"
+    baselines = tmp_path / "baselines.json"
+    _write_results(
+        results,
+        [
+            {"name": "test_new_bench", "stats": {"mean": 0.01}},
+            {"name": "test_parse_session_small", "stats": {"mean": 0.0001}},
+        ],
+    )
+    _write_baselines(
+        baselines,
+        {"parse": {"test_parse_session_small": 0.0001}},
+    )
+
+    assert check_regression(results, baselines) == 0
+    out = capsys.readouterr().out
+    assert "WARN: 'test_new_bench' has no baseline yet" in out
+
+
+def test_regression_over_threshold_fails(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
+    results = tmp_path / "results.json"
+    baselines = tmp_path / "baselines.json"
+    _write_results(
+        results,
+        [{"name": "test_parse_session_small", "stats": {"mean": 0.0002}}],
+    )
+    _write_baselines(
+        baselines,
+        {"parse": {"test_parse_session_small": 0.0001}},
+    )
+
+    assert check_regression(results, baselines) == 1
+    out = capsys.readouterr().out
+    assert "REGRESSION" in out
+
+
+def test_within_threshold_passes(tmp_path) -> None:
+    results = tmp_path / "results.json"
+    baselines = tmp_path / "baselines.json"
+    _write_results(
+        results,
+        [{"name": "test_parse_session_small", "stats": {"mean": 0.00011}}],
+    )
+    _write_baselines(
+        baselines,
+        {"parse": {"test_parse_session_small": 0.0001}},
+    )
+
+    assert check_regression(results, baselines) == 0

From b9d0707568bf5cddafa9b3a1a11223e84f2df47f Mon Sep 17 00:00:00 2001
From: chen <clean6378@gmail.com>
Date: Thu, 18 Jun 2026 04:45:43 +0800
Subject: [PATCH 02/10] fix: set benchmark baselines from ubuntu-latest CI
 means

---
 benchmarks/README.md      |  4 ++--
 benchmarks/baselines.json | 20 ++++++++++----------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 8519e97..a3467e8 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -40,7 +40,7 @@ The `benchmarks` job on **ubuntu-latest** runs pytest-benchmark, then `scripts/c
 
 ## Refresh baselines
 
-After intentional performance work on ubuntu (same OS as CI):
+After intentional performance work, capture on **ubuntu-latest** (same OS as the gated CI job):
 
 ```bash
 make update-baselines
@@ -53,4 +53,4 @@ pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json
 python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json
 ```
 
-Use `--slack 1.25` on `reduce_baselines.py` when capturing on a faster host than CI to absorb cross-machine variance.
+Do not capture baselines on Windows/macOS for commit — ubuntu runners are slower and the gate will fail. Download `benchmark-results.json` from a green CI artifact to seed baselines if needed.
diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json
index 97db608..42e70e3 100644
--- a/benchmarks/baselines.json
+++ b/benchmarks/baselines.json
@@ -1,20 +1,20 @@
 {
-  "_note": "CI gates the ubuntu benchmarks job when mean exceeds baseline by >20%.",
-  "updated": "2026-06-17T20:33:56Z",
-  "machine": "Windows",
+  "_note": "Means captured from ubuntu-latest CI post-cache (PR #90). Refresh via make update-baselines on ubuntu.",
+  "updated": "2026-06-17T21:00:00Z",
+  "machine": "Linux",
   "groups": {
     "parse": {
-      "test_parse_session_small": 8.711556942772768e-05,
-      "test_parse_session_medium": 0.0019041118800302193,
-      "test_parse_session_large": 0.018555900882518687
+      "test_parse_session_small": 0.000105,
+      "test_parse_session_medium": 0.002956,
+      "test_parse_session_large": 0.029678
     },
     "export": {
-      "test_bulk_export_session_count[sessions-10]": 0.0034216649980517108,
-      "test_bulk_export_session_count[sessions-50]": 0.017290590000629893,
-      "test_bulk_export_session_count[sessions-100]": 0.03397804391996553
+      "test_bulk_export_session_count[sessions-10]": 0.004278,
+      "test_bulk_export_session_count[sessions-50]": 0.021144,
+      "test_bulk_export_session_count[sessions-100]": 0.042003
     },
     "search": {
-      "test_search_full_corpus": 0.002600985144447307
+      "test_search_full_corpus": 0.001092
     }
   }
 }

From 2482951d0ffba03eeafc2b6ceb697d0c26f85636 Mon Sep 17 00:00:00 2001
From: chen <clean6378@gmail.com>
Date: Thu, 18 Jun 2026 04:49:50 +0800
Subject: [PATCH 03/10] fix: harden benchmark gate scripts with input
 validation

Raise BenchmarkDataError for malformed JSON or missing keys; skip zero
baselines safely; reject non-positive --slack values.
---
 scripts/check_benchmark_regression.py    | 55 +++++++++++++++++++++---
 scripts/reduce_baselines.py              | 11 ++++-
 tests/test_check_benchmark_regression.py | 32 +++++++++++++-
 3 files changed, 90 insertions(+), 8 deletions(-)

diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py
index 230a345..c8ca359 100644
--- a/scripts/check_benchmark_regression.py
+++ b/scripts/check_benchmark_regression.py
@@ -9,20 +9,62 @@
 THRESHOLD = 1.20
 
 
+class BenchmarkDataError(ValueError):
+    """Raised when benchmark JSON input is malformed or missing required fields."""
+
+
 def load_results(results_path: str | Path) -> dict[str, float]:
-    data = json.loads(Path(results_path).read_text(encoding="utf-8"))
-    return {entry["name"]: float(entry["stats"]["mean"]) for entry in data["benchmarks"]}
+    path = Path(results_path)
+    try:
+        data = json.loads(path.read_text(encoding="utf-8"))
+    except json.JSONDecodeError as exc:
+        raise BenchmarkDataError(f"invalid JSON in {path}: {exc}") from exc
+    try:
+        benchmarks = data["benchmarks"]
+    except (KeyError, TypeError) as exc:
+        raise BenchmarkDataError(f"{path} missing top-level 'benchmarks' array") from exc
+    if not isinstance(benchmarks, list):
+        raise BenchmarkDataError(f"{path} 'benchmarks' must be an array")
+
+    results: dict[str, float] = {}
+    for index, entry in enumerate(benchmarks):
+        if not isinstance(entry, dict):
+            raise BenchmarkDataError(f"{path} benchmarks[{index}] must be an object")
+        try:
+            name = entry["name"]
+            mean = entry["stats"]["mean"]
+        except (KeyError, TypeError) as exc:
+            raise BenchmarkDataError(
+                f"{path} benchmarks[{index}] missing 'name' or 'stats.mean'"
+            ) from exc
+        results[str(name)] = float(mean)
+    return results
 
 
 def load_baseline_means(baselines_path: str | Path) -> dict[str, float]:
-    data = json.loads(Path(baselines_path).read_text(encoding="utf-8"))
+    path = Path(baselines_path)
+    try:
+        data = json.loads(path.read_text(encoding="utf-8"))
+    except json.JSONDecodeError as exc:
+        raise BenchmarkDataError(f"invalid JSON in {path}: {exc}") from exc
+    if not isinstance(data, dict):
+        raise BenchmarkDataError(f"{path} root value must be an object")
+
     groups = data.get("groups", data)
+    if not isinstance(groups, dict):
+        raise BenchmarkDataError(f"{path} missing 'groups' object")
+
     means: dict[str, float] = {}
-    for key, value in groups.items():
+    for group_name, value in groups.items():
         if not isinstance(value, dict):
             continue
         for name, mean in value.items():
-            means[name] = float(mean)
+            try:
+                means[str(name)] = float(mean)
+            except (TypeError, ValueError) as exc:
+                raise BenchmarkDataError(
+                    f"{path} groups[{group_name!r}][{name!r}] is not a numeric mean"
+                ) from exc
     return means
 
 
@@ -42,6 +84,9 @@ def check_regression(
         if cur is None:
             print(f"WARN: no current result for baseline {name!r}; skipping")
             continue
+        if base == 0:
+            print(f"WARN: baseline for {name!r} is zero; skipping ratio check")
+            continue
         ratio = cur / base
         tag = "FAIL" if ratio > threshold else "ok"
         print(f"[{tag}] {name}: {cur:.6f}s vs {base:.6f}s ({ratio:.2f}x)")
diff --git a/scripts/reduce_baselines.py b/scripts/reduce_baselines.py
index c8c7ec7..8ad28a6 100644
--- a/scripts/reduce_baselines.py
+++ b/scripts/reduce_baselines.py
@@ -11,6 +11,13 @@
 GATED_GROUPS = ("parse", "export", "search")
 
 
+def _positive_float(value: str) -> float:
+    parsed = float(value)
+    if parsed <= 0:
+        raise argparse.ArgumentTypeError("slack must be greater than zero")
+    return parsed
+
+
 def reduce_baselines(
     raw_path: str | Path,
     out_path: str | Path,
@@ -43,9 +50,9 @@ def main(argv: list[str] | None = None) -> int:
     parser.add_argument("out_path", help="destination baselines.json path")
     parser.add_argument(
         "--slack",
-        type=float,
+        type=_positive_float,
         default=1.0,
-        help="multiply means by this factor (e.g. 1.25 when capturing on a faster host)",
+        help="multiply means by this factor (must be > 0)",
     )
     args = parser.parse_args(argv)
     reduce_baselines(args.raw_path, args.out_path, slack=args.slack)
diff --git a/tests/test_check_benchmark_regression.py b/tests/test_check_benchmark_regression.py
index d22edf0..4cd370b 100644
--- a/tests/test_check_benchmark_regression.py
+++ b/tests/test_check_benchmark_regression.py
@@ -6,7 +6,7 @@
 
 import pytest
 
-from scripts.check_benchmark_regression import check_regression
+from scripts.check_benchmark_regression import BenchmarkDataError, check_regression, load_results
 
 
 def _write_results(path, benchmarks: list[dict]) -> None:
@@ -75,3 +75,33 @@ def test_within_threshold_passes(tmp_path) -> None:
     )
 
     assert check_regression(results, baselines) == 0
+
+
+def test_load_results_rejects_malformed_json(tmp_path) -> None:
+    path = tmp_path / "bad.json"
+    path.write_text("{not json", encoding="utf-8")
+    with pytest.raises(BenchmarkDataError, match="invalid JSON"):
+        load_results(path)
+
+
+def test_load_results_requires_benchmarks_array(tmp_path) -> None:
+    path = tmp_path / "results.json"
+    path.write_text("{}", encoding="utf-8")
+    with pytest.raises(BenchmarkDataError, match="'benchmarks' array"):
+        load_results(path)
+
+
+def test_zero_baseline_skips_ratio_check(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
+    results = tmp_path / "results.json"
+    baselines = tmp_path / "baselines.json"
+    _write_results(
+        results,
+        [{"name": "test_parse_session_small", "stats": {"mean": 0.0002}}],
+    )
+    _write_baselines(
+        baselines,
+        {"parse": {"test_parse_session_small": 0.0}},
+    )
+
+    assert check_regression(results, baselines) == 0
+    assert "baseline for 'test_parse_session_small' is zero" in capsys.readouterr().out

From b4bf963047da51da3a03168302cb3fbaac67715f Mon Sep 17 00:00:00 2001
From: chen <clean6378@gmail.com>
Date: Thu, 18 Jun 2026 04:59:45 +0800
Subject: [PATCH 04/10] fix: harden benchmark gate per PR #91 review

Upload benchmark-results.json even when the gate fails; add BenchmarkDataError
handling to reduce_baselines; require explicit groups key; expose --threshold
CLI flag; add reduce_baselines and boundary tests; clarify README.
---
 .github/workflows/ci.yml                 |  1 +
 Makefile                                 |  5 +-
 benchmarks/README.md                     |  2 +-
 scripts/check_benchmark_regression.py    | 30 ++++++---
 scripts/reduce_baselines.py              | 39 ++++++++++--
 tests/test_check_benchmark_regression.py | 42 +++++++++++++
 tests/test_reduce_baselines.py           | 78 ++++++++++++++++++++++++
 7 files changed, 181 insertions(+), 16 deletions(-)
 create mode 100644 tests/test_reduce_baselines.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 60242c4..6611ad2 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -239,6 +239,7 @@ jobs:
         run: python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json
 
       - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
+        if: always()
         with:
           name: benchmark-results
           path: benchmark-results.json
diff --git a/Makefile b/Makefile
index 9df379a..b14f3d6 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: update-baselines check-benchmarks
+.PHONY: update-baselines check-benchmarks clean-benchmark-artifacts
 
 update-baselines:
 	pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts=
@@ -7,3 +7,6 @@ update-baselines:
 check-benchmarks:
 	pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmark-results.json -o addopts=
 	python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json
+
+clean-benchmark-artifacts:
+	rm -f benchmarks/_raw.json benchmark-results.json
diff --git a/benchmarks/README.md b/benchmarks/README.md
index a3467e8..f746525 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -53,4 +53,4 @@ pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json
 python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json
 ```
 
-Do not capture baselines on Windows/macOS for commit — ubuntu runners are slower and the gate will fail. Download `benchmark-results.json` from a green CI artifact to seed baselines if needed.
+Baselines must be captured on **ubuntu-latest** to match the gated CI runner. Cross-OS variance causes spurious failures. Download `benchmark-results.json` from a CI artifact to seed baselines if needed.
diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py
index c8ca359..0bc8258 100644
--- a/scripts/check_benchmark_regression.py
+++ b/scripts/check_benchmark_regression.py
@@ -2,6 +2,7 @@
 
 from __future__ import annotations
 
+import argparse
 import json
 import sys
 from pathlib import Path
@@ -50,9 +51,11 @@ def load_baseline_means(baselines_path: str | Path) -> dict[str, float]:
     if not isinstance(data, dict):
         raise BenchmarkDataError(f"{path} root value must be an object")
 
-    groups = data.get("groups", data)
+    if "groups" not in data:
+        raise BenchmarkDataError(f"{path} missing required 'groups' key")
+    groups = data["groups"]
     if not isinstance(groups, dict):
-        raise BenchmarkDataError(f"{path} missing 'groups' object")
+        raise BenchmarkDataError(f"{path} 'groups' must be an object")
 
     means: dict[str, float] = {}
     for group_name, value in groups.items():
@@ -104,14 +107,25 @@ def check_regression(
 
 
 def main(argv: list[str] | None = None) -> int:
-    argv = sys.argv[1:] if argv is None else argv
-    if len(argv) != 2:
-        print(
-            "usage: check_benchmark_regression.py <results.json> <baselines.json>",
-            file=sys.stderr,
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("results_path", help="pytest-benchmark --benchmark-json output")
+    parser.add_argument("baselines_path", help="path to benchmarks/baselines.json")
+    parser.add_argument(
+        "--threshold",
+        type=float,
+        default=THRESHOLD,
+        help="fail when current mean exceeds baseline by more than this ratio (default: 1.20)",
+    )
+    args = parser.parse_args(argv)
+    try:
+        return check_regression(
+            args.results_path,
+            args.baselines_path,
+            threshold=args.threshold,
         )
+    except BenchmarkDataError as exc:
+        print(f"ERROR: {exc}", file=sys.stderr)
         return 2
-    return check_regression(argv[0], argv[1])
 
 
 if __name__ == "__main__":
diff --git a/scripts/reduce_baselines.py b/scripts/reduce_baselines.py
index 8ad28a6..c2367fc 100644
--- a/scripts/reduce_baselines.py
+++ b/scripts/reduce_baselines.py
@@ -8,6 +8,8 @@
 from datetime import UTC, datetime
 from pathlib import Path
 
+from scripts.check_benchmark_regression import BenchmarkDataError
+
 GATED_GROUPS = ("parse", "export", "search")
 
 
@@ -24,13 +26,34 @@ def reduce_baselines(
     *,
     slack: float = 1.0,
 ) -> dict[str, object]:
-    raw = json.loads(Path(raw_path).read_text(encoding="utf-8"))
+    path = Path(raw_path)
+    try:
+        raw = json.loads(path.read_text(encoding="utf-8"))
+    except json.JSONDecodeError as exc:
+        raise BenchmarkDataError(f"invalid JSON in {path}: {exc}") from exc
+
+    try:
+        entries = raw["benchmarks"]
+    except (KeyError, TypeError) as exc:
+        raise BenchmarkDataError(f"{path} missing top-level 'benchmarks' array") from exc
+    if not isinstance(entries, list):
+        raise BenchmarkDataError(f"{path} 'benchmarks' must be an array")
+
     groups: dict[str, dict[str, float]] = {group: {} for group in GATED_GROUPS}
-    for entry in raw["benchmarks"]:
+    for index, entry in enumerate(entries):
+        if not isinstance(entry, dict):
+            raise BenchmarkDataError(f"{path} benchmarks[{index}] must be an object")
+        try:
+            name = entry["name"]
+            mean = float(entry["stats"]["mean"])
+        except (KeyError, TypeError, ValueError) as exc:
+            raise BenchmarkDataError(
+                f"{path} benchmarks[{index}] missing 'name' or 'stats.mean'"
+            ) from exc
         group = entry.get("group")
         if group not in GATED_GROUPS:
             continue
-        groups[group][entry["name"]] = float(entry["stats"]["mean"]) * slack
+        groups[group][str(name)] = mean * slack
 
     machine_info = raw.get("machine_info", {})
     output: dict[str, object] = {
@@ -39,8 +62,8 @@ def reduce_baselines(
         "machine": machine_info.get("system"),
         "groups": groups,
     }
-    path = Path(out_path)
-    path.write_text(json.dumps(output, indent=2) + "\n", encoding="utf-8")
+    out = Path(out_path)
+    out.write_text(json.dumps(output, indent=2) + "\n", encoding="utf-8")
     return output
 
 
@@ -55,7 +78,11 @@ def main(argv: list[str] | None = None) -> int:
         help="multiply means by this factor (must be > 0)",
     )
     args = parser.parse_args(argv)
-    reduce_baselines(args.raw_path, args.out_path, slack=args.slack)
+    try:
+        reduce_baselines(args.raw_path, args.out_path, slack=args.slack)
+    except BenchmarkDataError as exc:
+        print(f"ERROR: {exc}", file=sys.stderr)
+        return 2
     return 0
 
 
diff --git a/tests/test_check_benchmark_regression.py b/tests/test_check_benchmark_regression.py
index 4cd370b..7e4864d 100644
--- a/tests/test_check_benchmark_regression.py
+++ b/tests/test_check_benchmark_regression.py
@@ -105,3 +105,45 @@ def test_zero_baseline_skips_ratio_check(tmp_path, capsys: pytest.CaptureFixture
 
     assert check_regression(results, baselines) == 0
     assert "baseline for 'test_parse_session_small' is zero" in capsys.readouterr().out
+
+
+def test_exactly_at_threshold_passes(tmp_path) -> None:
+    results = tmp_path / "results.json"
+    baselines = tmp_path / "baselines.json"
+    _write_results(
+        results,
+        [{"name": "test_parse_session_small", "stats": {"mean": 0.00012}}],
+    )
+    _write_baselines(
+        baselines,
+        {"parse": {"test_parse_session_small": 0.0001}},
+    )
+
+    assert check_regression(results, baselines) == 0
+
+
+def test_missing_current_result_warns_without_failing(
+    tmp_path, capsys: pytest.CaptureFixture[str]
+) -> None:
+    results = tmp_path / "results.json"
+    baselines = tmp_path / "baselines.json"
+    _write_results(results, [])
+    _write_baselines(
+        baselines,
+        {"parse": {"test_parse_session_small": 0.0001}},
+    )
+
+    assert check_regression(results, baselines) == 0
+    assert "no current result for baseline" in capsys.readouterr().out
+
+
+def test_main_reports_benchmark_data_error(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
+    from scripts.check_benchmark_regression import main
+
+    bad = tmp_path / "bad.json"
+    bad.write_text("{}", encoding="utf-8")
+    baselines = tmp_path / "baselines.json"
+    _write_baselines(baselines, {"parse": {"test_parse_session_small": 0.0001}})
+
+    assert main([str(bad), str(baselines)]) == 2
+    assert "ERROR:" in capsys.readouterr().err
diff --git a/tests/test_reduce_baselines.py b/tests/test_reduce_baselines.py
new file mode 100644
index 0000000..a48b359
--- /dev/null
+++ b/tests/test_reduce_baselines.py
@@ -0,0 +1,78 @@
+"""Tests for scripts/reduce_baselines.py."""
+
+from __future__ import annotations
+
+import json
+
+import pytest
+
+from scripts.check_benchmark_regression import BenchmarkDataError
+from scripts.reduce_baselines import reduce_baselines
+
+
+def _write_raw(path, benchmarks: list[dict], *, machine: str = "Linux") -> None:
+    path.write_text(
+        json.dumps(
+            {
+                "machine_info": {"system": machine},
+                "benchmarks": benchmarks,
+            },
+            indent=2,
+        ),
+        encoding="utf-8",
+    )
+
+
+def test_reduce_baselines_writes_gated_groups_only(tmp_path) -> None:
+    raw = tmp_path / "raw.json"
+    out = tmp_path / "baselines.json"
+    _write_raw(
+        raw,
+        [
+            {"group": "parse", "name": "test_parse_session_small", "stats": {"mean": 0.0001}},
+            {"group": "cache", "name": "test_cache_warm_hit", "stats": {"mean": 1e-05}},
+        ],
+    )
+
+    output = reduce_baselines(raw, out)
+
+    assert output["machine"] == "Linux"
+    assert "test_parse_session_small" in output["groups"]["parse"]
+    assert "cache" not in output["groups"]
+    written = json.loads(out.read_text(encoding="utf-8"))
+    assert "test_cache_warm_hit" not in written["groups"]["parse"]
+
+
+def test_reduce_baselines_applies_slack(tmp_path) -> None:
+    raw = tmp_path / "raw.json"
+    out = tmp_path / "baselines.json"
+    _write_raw(
+        raw,
+        [{"group": "search", "name": "test_search_full_corpus", "stats": {"mean": 0.002}}],
+    )
+
+    reduce_baselines(raw, out, slack=1.5)
+    data = json.loads(out.read_text(encoding="utf-8"))
+
+    assert data["groups"]["search"]["test_search_full_corpus"] == pytest.approx(0.003)
+
+
+def test_reduce_baselines_rejects_missing_benchmarks_key(tmp_path) -> None:
+    raw = tmp_path / "raw.json"
+    raw.write_text("{}", encoding="utf-8")
+
+    with pytest.raises(BenchmarkDataError, match="'benchmarks' array"):
+        reduce_baselines(raw, tmp_path / "out.json")
+
+
+def test_reduce_baselines_cli_rejects_non_positive_slack(tmp_path) -> None:
+    from scripts.reduce_baselines import main
+
+    raw = tmp_path / "raw.json"
+    _write_raw(
+        raw,
+        [{"group": "parse", "name": "test_parse_session_small", "stats": {"mean": 0.0001}}],
+    )
+
+    with pytest.raises(SystemExit):
+        main([str(raw), str(tmp_path / "out.json"), "--slack", "0"])

From 6a6cb6ee6b6aefeeeaf6ead9577d5076940dfe57 Mon Sep 17 00:00:00 2001
From: chen <clean6378@gmail.com>
Date: Thu, 18 Jun 2026 05:11:29 +0800
Subject: [PATCH 05/10] fix: harden benchmark scripts against filesystem and
 type errors

Wrap reduce_baselines I/O in BenchmarkDataError; guard machine_info type;
move float(mean) into try block in load_results; assert argparse exit code 2.
---
 scripts/check_benchmark_regression.py |  6 +++---
 scripts/reduce_baselines.py           | 12 +++++++++---
 tests/test_reduce_baselines.py        |  3 ++-
 3 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py
index 0bc8258..5632fc0 100644
--- a/scripts/check_benchmark_regression.py
+++ b/scripts/check_benchmark_regression.py
@@ -33,12 +33,12 @@ def load_results(results_path: str | Path) -> dict[str, float]:
             raise BenchmarkDataError(f"{path} benchmarks[{index}] must be an object")
         try:
             name = entry["name"]
-            mean = entry["stats"]["mean"]
-        except (KeyError, TypeError) as exc:
+            mean = float(entry["stats"]["mean"])
+        except (KeyError, TypeError, ValueError) as exc:
             raise BenchmarkDataError(
                 f"{path} benchmarks[{index}] missing 'name' or 'stats.mean'"
             ) from exc
-        results[str(name)] = float(mean)
+        results[str(name)] = mean
     return results
 
 
diff --git a/scripts/reduce_baselines.py b/scripts/reduce_baselines.py
index c2367fc..ee3791c 100644
--- a/scripts/reduce_baselines.py
+++ b/scripts/reduce_baselines.py
@@ -31,6 +31,8 @@ def reduce_baselines(
         raw = json.loads(path.read_text(encoding="utf-8"))
     except json.JSONDecodeError as exc:
         raise BenchmarkDataError(f"invalid JSON in {path}: {exc}") from exc
+    except OSError as exc:
+        raise BenchmarkDataError(f"cannot read {path}: {exc}") from exc
 
     try:
         entries = raw["benchmarks"]
@@ -55,15 +57,19 @@ def reduce_baselines(
             continue
         groups[group][str(name)] = mean * slack
 
-    machine_info = raw.get("machine_info", {})
+    machine_info = raw.get("machine_info")
+    machine = machine_info.get("system") if isinstance(machine_info, dict) else None
     output: dict[str, object] = {
         "_note": "CI gates the ubuntu benchmarks job when mean exceeds baseline by >20%.",
         "updated": datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ"),
-        "machine": machine_info.get("system"),
+        "machine": machine,
         "groups": groups,
     }
     out = Path(out_path)
-    out.write_text(json.dumps(output, indent=2) + "\n", encoding="utf-8")
+    try:
+        out.write_text(json.dumps(output, indent=2) + "\n", encoding="utf-8")
+    except OSError as exc:
+        raise BenchmarkDataError(f"cannot write {out}: {exc}") from exc
     return output
 
 
diff --git a/tests/test_reduce_baselines.py b/tests/test_reduce_baselines.py
index a48b359..637791f 100644
--- a/tests/test_reduce_baselines.py
+++ b/tests/test_reduce_baselines.py
@@ -74,5 +74,6 @@ def test_reduce_baselines_cli_rejects_non_positive_slack(tmp_path) -> None:
         [{"group": "parse", "name": "test_parse_session_small", "stats": {"mean": 0.0001}}],
     )
 
-    with pytest.raises(SystemExit):
+    with pytest.raises(SystemExit) as exc_info:
         main([str(raw), str(tmp_path / "out.json"), "--slack", "0"])
+    assert exc_info.value.code == 2

From 5d816d5e1407066c4101fc30f4e6ff3b372a49a3 Mon Sep 17 00:00:00 2001
From: chen <clean6378@gmail.com>
Date: Thu, 18 Jun 2026 05:40:12 +0800
Subject: [PATCH 06/10] fix: allow reduce_baselines to run as a direct script

---
 scripts/reduce_baselines.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/scripts/reduce_baselines.py b/scripts/reduce_baselines.py
index ee3791c..a0d6f4a 100644
--- a/scripts/reduce_baselines.py
+++ b/scripts/reduce_baselines.py
@@ -8,7 +8,10 @@
 from datetime import UTC, datetime
 from pathlib import Path
 
-from scripts.check_benchmark_regression import BenchmarkDataError
+try:
+    from scripts.check_benchmark_regression import BenchmarkDataError
+except ModuleNotFoundError:
+    from check_benchmark_regression import BenchmarkDataError
 
 GATED_GROUPS = ("parse", "export", "search")
 

From 001c5f0268a6ee583063e003195c91d5f34f383f Mon Sep 17 00:00:00 2001
From: chen <clean6378@gmail.com>
Date: Thu, 18 Jun 2026 07:25:23 +0800
Subject: [PATCH 07/10] fix: exclude noisy micro-benchmarks from regression
 gate

Drop test_parse_session_small and test_search_full_corpus from gated
baselines; clarify ubuntu CI provenance in _note; detect duplicate
benchmark names; expand reduce_baselines tests.
---
 benchmarks/README.md                     |  6 ++++-
 benchmarks/baselines.json                |  7 ++---
 scripts/check_benchmark_regression.py    | 20 ++++++++++++--
 scripts/reduce_baselines.py              | 11 +++++---
 tests/test_check_benchmark_regression.py | 21 ++++++++++++++-
 tests/test_reduce_baselines.py           | 33 ++++++++++++++++++++----
 6 files changed, 81 insertions(+), 17 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index f746525..59f70fc 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -36,7 +36,11 @@ The memory test (`test_parse_memory.py`) is intentionally **not** skipped by `--
 
 ## CI gate
 
-The `benchmarks` job on **ubuntu-latest** runs pytest-benchmark, then `scripts/check_benchmark_regression.py`. CI fails when any gated benchmark mean exceeds its baseline by more than **20%**. Benchmarks without a baseline entry (e.g. new `cache` group) print a warning and do not fail the gate.
+The `benchmarks` job on **ubuntu-latest** runs pytest-benchmark (`--benchmark-json=benchmark-results.json`), then `scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json`. CI fails when any **gated** benchmark mean exceeds its baseline by more than **20%**.
+
+**Gated:** parse medium/large, export 10/50/100 sessions.
+
+**Not gated (informational only):** `test_parse_session_small`, `test_search_full_corpus` (sub-ms CI noise), and the `cache` group. Benchmarks without a baseline entry print a warning and do not fail the gate.
 
 ## Refresh baselines
 
diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json
index 42e70e3..813a72f 100644
--- a/benchmarks/baselines.json
+++ b/benchmarks/baselines.json
@@ -1,10 +1,9 @@
 {
-  "_note": "Means captured from ubuntu-latest CI post-cache (PR #90). Refresh via make update-baselines on ubuntu.",
+  "_note": "Gated means from ubuntu-latest CI benchmark-results.json (post-cache PR #90). Excluded from gate: test_parse_session_small, test_search_full_corpus (sub-ms CI noise). Refresh via make update-baselines on ubuntu.",
   "updated": "2026-06-17T21:00:00Z",
   "machine": "Linux",
   "groups": {
     "parse": {
-      "test_parse_session_small": 0.000105,
       "test_parse_session_medium": 0.002956,
       "test_parse_session_large": 0.029678
     },
@@ -13,8 +12,6 @@
       "test_bulk_export_session_count[sessions-50]": 0.021144,
       "test_bulk_export_session_count[sessions-100]": 0.042003
     },
-    "search": {
-      "test_search_full_corpus": 0.001092
-    }
+    "search": {}
   }
 }
diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py
index 5632fc0..13bb56a 100644
--- a/scripts/check_benchmark_regression.py
+++ b/scripts/check_benchmark_regression.py
@@ -9,6 +9,14 @@
 
 THRESHOLD = 1.20
 
+# Sub-ms timings are too noisy for a fixed 20% gate on ubuntu CI.
+EXCLUDED_FROM_GATE = frozenset(
+    {
+        "test_parse_session_small",
+        "test_search_full_corpus",
+    }
+)
+
 
 class BenchmarkDataError(ValueError):
     """Raised when benchmark JSON input is malformed or missing required fields."""
@@ -38,7 +46,10 @@ def load_results(results_path: str | Path) -> dict[str, float]:
             raise BenchmarkDataError(
                 f"{path} benchmarks[{index}] missing 'name' or 'stats.mean'"
             ) from exc
-        results[str(name)] = mean
+        name = str(name)
+        if name in results:
+            raise BenchmarkDataError(f"{path} duplicate benchmark name {name!r}")
+        results[name] = mean
     return results
 
 
@@ -62,8 +73,13 @@ def load_baseline_means(baselines_path: str | Path) -> dict[str, float]:
         if not isinstance(value, dict):
             continue
         for name, mean in value.items():
+            name = str(name)
+            if name in means:
+                raise BenchmarkDataError(
+                    f"{path} duplicate benchmark name {name!r} across groups"
+                )
             try:
-                means[str(name)] = float(mean)
+                means[name] = float(mean)
             except (TypeError, ValueError) as exc:
                 raise BenchmarkDataError(
                     f"{path} groups[{group_name!r}][{name!r}] is not a numeric mean"
diff --git a/scripts/reduce_baselines.py b/scripts/reduce_baselines.py
index a0d6f4a..88be0eb 100644
--- a/scripts/reduce_baselines.py
+++ b/scripts/reduce_baselines.py
@@ -9,9 +9,9 @@
 from pathlib import Path
 
 try:
-    from scripts.check_benchmark_regression import BenchmarkDataError
+    from scripts.check_benchmark_regression import EXCLUDED_FROM_GATE, BenchmarkDataError
 except ModuleNotFoundError:
-    from check_benchmark_regression import BenchmarkDataError
+    from check_benchmark_regression import EXCLUDED_FROM_GATE, BenchmarkDataError
 
 GATED_GROUPS = ("parse", "export", "search")
 
@@ -58,12 +58,17 @@ def reduce_baselines(
         group = entry.get("group")
         if group not in GATED_GROUPS:
             continue
+        if str(name) in EXCLUDED_FROM_GATE:
+            continue
         groups[group][str(name)] = mean * slack
 
     machine_info = raw.get("machine_info")
     machine = machine_info.get("system") if isinstance(machine_info, dict) else None
     output: dict[str, object] = {
-        "_note": "CI gates the ubuntu benchmarks job when mean exceeds baseline by >20%.",
+        "_note": (
+            "Gated means from ubuntu-latest CI (post-cache). "
+            "Excluded from gate: test_parse_session_small, test_search_full_corpus (CI noise)."
+        ),
         "updated": datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ"),
         "machine": machine,
         "groups": groups,
diff --git a/tests/test_check_benchmark_regression.py b/tests/test_check_benchmark_regression.py
index 7e4864d..98fb2dd 100644
--- a/tests/test_check_benchmark_regression.py
+++ b/tests/test_check_benchmark_regression.py
@@ -6,7 +6,12 @@
 
 import pytest
 
-from scripts.check_benchmark_regression import BenchmarkDataError, check_regression, load_results
+from scripts.check_benchmark_regression import (
+    BenchmarkDataError,
+    check_regression,
+    load_baseline_means,
+    load_results,
+)
 
 
 def _write_results(path, benchmarks: list[dict]) -> None:
@@ -147,3 +152,17 @@ def test_main_reports_benchmark_data_error(tmp_path, capsys: pytest.CaptureFixtu
 
     assert main([str(bad), str(baselines)]) == 2
     assert "ERROR:" in capsys.readouterr().err
+
+
+def test_duplicate_baseline_name_raises(tmp_path) -> None:
+    baselines = tmp_path / "baselines.json"
+    _write_baselines(
+        baselines,
+        {
+            "parse": {"test_parse_session_medium": 0.002},
+            "export": {"test_parse_session_medium": 0.003},
+        },
+    )
+
+    with pytest.raises(BenchmarkDataError, match="duplicate benchmark name"):
+        load_baseline_means(baselines)
diff --git a/tests/test_reduce_baselines.py b/tests/test_reduce_baselines.py
index 637791f..8919b84 100644
--- a/tests/test_reduce_baselines.py
+++ b/tests/test_reduce_baselines.py
@@ -29,6 +29,7 @@ def test_reduce_baselines_writes_gated_groups_only(tmp_path) -> None:
     _write_raw(
         raw,
         [
+            {"group": "parse", "name": "test_parse_session_medium", "stats": {"mean": 0.002}},
             {"group": "parse", "name": "test_parse_session_small", "stats": {"mean": 0.0001}},
             {"group": "cache", "name": "test_cache_warm_hit", "stats": {"mean": 1e-05}},
         ],
@@ -37,10 +38,9 @@ def test_reduce_baselines_writes_gated_groups_only(tmp_path) -> None:
     output = reduce_baselines(raw, out)
 
     assert output["machine"] == "Linux"
-    assert "test_parse_session_small" in output["groups"]["parse"]
+    assert "test_parse_session_medium" in output["groups"]["parse"]
+    assert "test_parse_session_small" not in output["groups"]["parse"]
     assert "cache" not in output["groups"]
-    written = json.loads(out.read_text(encoding="utf-8"))
-    assert "test_cache_warm_hit" not in written["groups"]["parse"]
 
 
 def test_reduce_baselines_applies_slack(tmp_path) -> None:
@@ -48,13 +48,13 @@ def test_reduce_baselines_applies_slack(tmp_path) -> None:
     out = tmp_path / "baselines.json"
     _write_raw(
         raw,
-        [{"group": "search", "name": "test_search_full_corpus", "stats": {"mean": 0.002}}],
+        [{"group": "parse", "name": "test_parse_session_medium", "stats": {"mean": 0.002}}],
     )
 
     reduce_baselines(raw, out, slack=1.5)
     data = json.loads(out.read_text(encoding="utf-8"))
 
-    assert data["groups"]["search"]["test_search_full_corpus"] == pytest.approx(0.003)
+    assert data["groups"]["parse"]["test_parse_session_medium"] == pytest.approx(0.003)
 
 
 def test_reduce_baselines_rejects_missing_benchmarks_key(tmp_path) -> None:
@@ -77,3 +77,26 @@ def test_reduce_baselines_cli_rejects_non_positive_slack(tmp_path) -> None:
     with pytest.raises(SystemExit) as exc_info:
         main([str(raw), str(tmp_path / "out.json"), "--slack", "0"])
     assert exc_info.value.code == 2
+
+
+def test_reduce_baselines_machine_info_non_dict(tmp_path) -> None:
+    raw = tmp_path / "raw.json"
+    raw.write_text(
+        json.dumps(
+            {
+                "machine_info": "not-a-dict",
+                "benchmarks": [
+                    {
+                        "group": "parse",
+                        "name": "test_parse_session_medium",
+                        "stats": {"mean": 0.002},
+                    }
+                ],
+            }
+        ),
+        encoding="utf-8",
+    )
+
+    output = reduce_baselines(raw, tmp_path / "out.json")
+
+    assert output["machine"] is None

From 0f5540070dc66f01b1728649ee219f18f1c4f30f Mon Sep 17 00:00:00 2001
From: chen <clean6378@gmail.com>
Date: Thu, 18 Jun 2026 07:32:10 +0800
Subject: [PATCH 08/10] fix: ruff format check_benchmark_regression.pyfix: ruff
 format check_benchmark_regression.py

---
 scripts/check_benchmark_regression.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py
index 13bb56a..88dd251 100644
--- a/scripts/check_benchmark_regression.py
+++ b/scripts/check_benchmark_regression.py
@@ -75,9 +75,7 @@ def load_baseline_means(baselines_path: str | Path) -> dict[str, float]:
         for name, mean in value.items():
             name = str(name)
             if name in means:
-                raise BenchmarkDataError(
-                    f"{path} duplicate benchmark name {name!r} across groups"
-                )
+                raise BenchmarkDataError(f"{path} duplicate benchmark name {name!r} across groups")
             try:
                 means[name] = float(mean)
             except (TypeError, ValueError) as exc:

From bdce5ae32fd3962b0d7d3e7e10c84e527a0598d7 Mon Sep 17 00:00:00 2001
From: chen <clean6378@gmail.com>
Date: Thu, 18 Jun 2026 07:40:46 +0800
Subject: [PATCH 09/10] fix: enforce EXCLUDED_FROM_GATE at check time and
 harden file reads

---
 scripts/check_benchmark_regression.py    |  7 ++++
 tests/test_check_benchmark_regression.py | 45 ++++++++++++++++++------
 2 files changed, 42 insertions(+), 10 deletions(-)

diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py
index 88dd251..74365d2 100644
--- a/scripts/check_benchmark_regression.py
+++ b/scripts/check_benchmark_regression.py
@@ -26,6 +26,8 @@ def load_results(results_path: str | Path) -> dict[str, float]:
     path = Path(results_path)
     try:
         data = json.loads(path.read_text(encoding="utf-8"))
+    except OSError as exc:
+        raise BenchmarkDataError(f"cannot read {path}: {exc}") from exc
     except json.JSONDecodeError as exc:
         raise BenchmarkDataError(f"invalid JSON in {path}: {exc}") from exc
     try:
@@ -57,6 +59,7 @@ def load_baseline_means(baselines_path: str | Path) -> dict[str, float]:
     path = Path(baselines_path)
     try:
         data = json.loads(path.read_text(encoding="utf-8"))
+    except OSError as exc:
     except json.JSONDecodeError as exc:
         raise BenchmarkDataError(f"invalid JSON in {path}: {exc}") from exc
     if not isinstance(data, dict):
@@ -97,6 +100,8 @@ def check_regression(
 
     failures: list[str] = []
     for name, base in baseline_means.items():
+        if name in EXCLUDED_FROM_GATE:
+            continue
         cur = flat.get(name)
         if cur is None:
             print(f"WARN: no current result for baseline {name!r}; skipping")
@@ -111,6 +116,8 @@ def check_regression(
             failures.append(name)
 
     for name in flat:
+        if name in EXCLUDED_FROM_GATE:
+            continue
         if name not in baseline_means:
             print(f"WARN: {name!r} has no baseline yet; not gated")
 
diff --git a/tests/test_check_benchmark_regression.py b/tests/test_check_benchmark_regression.py
index 98fb2dd..49c4f49 100644
--- a/tests/test_check_benchmark_regression.py
+++ b/tests/test_check_benchmark_regression.py
@@ -13,6 +13,8 @@
     load_results,
 )
 
+GATED_BENCH = "test_parse_session_medium"
+
 
 def _write_results(path, benchmarks: list[dict]) -> None:
     path.write_text(
@@ -55,11 +57,11 @@ def test_regression_over_threshold_fails(tmp_path, capsys: pytest.CaptureFixture
     baselines = tmp_path / "baselines.json"
     _write_results(
         results,
-        [{"name": "test_parse_session_small", "stats": {"mean": 0.0002}}],
+        [{"name": GATED_BENCH, "stats": {"mean": 0.0025}}],
     )
     _write_baselines(
         baselines,
-        {"parse": {"test_parse_session_small": 0.0001}},
+        {"parse": {GATED_BENCH: 0.002}},
     )
 
     assert check_regression(results, baselines) == 1
@@ -72,11 +74,11 @@ def test_within_threshold_passes(tmp_path) -> None:
     baselines = tmp_path / "baselines.json"
     _write_results(
         results,
-        [{"name": "test_parse_session_small", "stats": {"mean": 0.00011}}],
+        [{"name": GATED_BENCH, "stats": {"mean": 0.0022}}],
     )
     _write_baselines(
         baselines,
-        {"parse": {"test_parse_session_small": 0.0001}},
+        {"parse": {GATED_BENCH: 0.002}},
     )
 
     assert check_regression(results, baselines) == 0
@@ -96,20 +98,25 @@ def test_load_results_requires_benchmarks_array(tmp_path) -> None:
         load_results(path)
 
 
+def test_load_results_rejects_missing_file(tmp_path) -> None:
+    with pytest.raises(BenchmarkDataError, match="cannot read"):
+        load_results(tmp_path / "missing.json")
+
+
 def test_zero_baseline_skips_ratio_check(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
     results = tmp_path / "results.json"
     baselines = tmp_path / "baselines.json"
     _write_results(
         results,
-        [{"name": "test_parse_session_small", "stats": {"mean": 0.0002}}],
+        [{"name": GATED_BENCH, "stats": {"mean": 0.0025}}],
     )
     _write_baselines(
         baselines,
-        {"parse": {"test_parse_session_small": 0.0}},
+        {"parse": {GATED_BENCH: 0.0}},
     )
 
     assert check_regression(results, baselines) == 0
-    assert "baseline for 'test_parse_session_small' is zero" in capsys.readouterr().out
+    assert f"baseline for '{GATED_BENCH}' is zero" in capsys.readouterr().out
 
 
 def test_exactly_at_threshold_passes(tmp_path) -> None:
@@ -117,7 +124,24 @@ def test_exactly_at_threshold_passes(tmp_path) -> None:
     baselines = tmp_path / "baselines.json"
     _write_results(
         results,
-        [{"name": "test_parse_session_small", "stats": {"mean": 0.00012}}],
+        [{"name": GATED_BENCH, "stats": {"mean": 0.0024}}],
+    )
+    _write_baselines(
+        baselines,
+        {"parse": {GATED_BENCH: 0.002}},
+    )
+
+    assert check_regression(results, baselines) == 0
+
+
+def test_excluded_benchmark_in_baselines_is_not_gated(
+    tmp_path, capsys: pytest.CaptureFixture[str]
+) -> None:
+    results = tmp_path / "results.json"
+    baselines = tmp_path / "baselines.json"
+    _write_results(
+        results,
+        [{"name": "test_parse_session_small", "stats": {"mean": 0.001}}],
     )
     _write_baselines(
         baselines,
@@ -125,6 +149,7 @@ def test_exactly_at_threshold_passes(tmp_path) -> None:
     )
 
     assert check_regression(results, baselines) == 0
+    assert "REGRESSION" not in capsys.readouterr().out
 
 
 def test_missing_current_result_warns_without_failing(
@@ -135,7 +160,7 @@ def test_missing_current_result_warns_without_failing(
     _write_results(results, [])
     _write_baselines(
         baselines,
-        {"parse": {"test_parse_session_small": 0.0001}},
+        {"parse": {GATED_BENCH: 0.002}},
     )
 
     assert check_regression(results, baselines) == 0
@@ -148,7 +173,7 @@ def test_main_reports_benchmark_data_error(tmp_path, capsys: pytest.CaptureFixtu
     bad = tmp_path / "bad.json"
     bad.write_text("{}", encoding="utf-8")
     baselines = tmp_path / "baselines.json"
-    _write_baselines(baselines, {"parse": {"test_parse_session_small": 0.0001}})
+    _write_baselines(baselines, {"parse": {GATED_BENCH: 0.002}})
 
     assert main([str(bad), str(baselines)]) == 2
     assert "ERROR:" in capsys.readouterr().err

From a79866ffb9eb833de27004d9af751884b177ee9e Mon Sep 17 00:00:00 2001
From: chen <clean6378@gmail.com>
Date: Thu, 18 Jun 2026 07:41:11 +0800
Subject: [PATCH 10/10] fix: enforce EXCLUDED_FROM_GATE at check time and
 harden file reads

---
 scripts/check_benchmark_regression.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py
index 74365d2..7842021 100644
--- a/scripts/check_benchmark_regression.py
+++ b/scripts/check_benchmark_regression.py
@@ -60,6 +60,7 @@ def load_baseline_means(baselines_path: str | Path) -> dict[str, float]:
     try:
         data = json.loads(path.read_text(encoding="utf-8"))
     except OSError as exc:
+        raise BenchmarkDataError(f"cannot read {path}: {exc}") from exc
     except json.JSONDecodeError as exc:
         raise BenchmarkDataError(f"invalid JSON in {path}: {exc}") from exc
     if not isinstance(data, dict):