cppalliance · clean6378-max-it · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026 · Jun 17, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -207,7 +207,7 @@ jobs:
       - run: npm run test:coverage
 
   benchmarks:
-    name: Performance benchmarks (informational)
+    name: Performance benchmarks (gated)
     runs-on: ubuntu-latest
     permissions:
       contents: read
@@ -235,7 +235,11 @@ jobs:
           --benchmark-columns=min,max,mean,stddev,rounds
           -o addopts=
 
+      - name: Regression gate
+        run: python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json
+
       - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
+        if: always()
         with:
           name: benchmark-results
           path: benchmark-results.json
diff --git a/.gitignore b/.gitignore
@@ -14,3 +14,5 @@ node_modules/
 .coverage
 coverage/
 coverage.xml
+benchmark-results.json
+benchmarks/_raw.json
diff --git a/Makefile b/Makefile
@@ -0,0 +1,12 @@
+.PHONY: update-baselines check-benchmarks clean-benchmark-artifacts
+
+# Run the benchmark suite into benchmarks/_raw.json, then reduce that raw
+# report into benchmarks/baselines.json.
+update-baselines:
+	pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts=
+	python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json
+
+# Run the benchmark suite into benchmark-results.json, then compare it
+# against benchmarks/baselines.json.
+check-benchmarks:
+	pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmark-results.json -o addopts=
+	python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json
+
+# Delete the JSON files generated by the two targets above.
+clean-benchmark-artifacts:
+	rm -f benchmarks/_raw.json benchmark-results.json
diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -1,6 +1,6 @@
 # Performance benchmarks
 
-Test files live under `tests/benchmarks/`; this directory holds only documentation and the informational `baselines.json` snapshot.
+Test files live under `tests/benchmarks/`; this directory holds documentation and `baselines.json` for the CI regression gate.
 
 Repeatable local measurements for parse, bulk export, and search hot paths.
 
@@ -26,17 +26,35 @@ The memory test also runs as part of the normal `pytest` suite (timing benchmark
 | parse | `parse_session` on 10 / 500 / 5000+ line JSONL |
 | export | `run_bulk_export` over 10 / 50 / 100 sessions |
 | search | `GET /api/search` over a 50-session synthetic corpus |
+| cache | cold vs warm `get_cached_session` (informational; not gated) |
 
 Large JSONL files (5000+ lines) are generated at test session scope under pytest's temp directory — not committed to git.
 
 Corpora repeat one row from `tests/fixtures/session_with_tools.jsonl`, so parse/export numbers measure steady-state throughput on a narrow schema slice — not full parser branch coverage. Treat as v1 baselines, not exhaustive perf proof.
 
 The memory test (`test_parse_memory.py`) is intentionally **not** skipped by `--benchmark-skip`; it runs in the main `pytest` job and builds the session-scoped 5000-line fixture once per session.
 
-## CI
+## CI gate
 
-The `benchmarks` workflow job uploads `benchmark-results.json` as a downloadable artifact. There is no regression gate yet.
+The `benchmarks` job on **ubuntu-latest** runs pytest-benchmark (`--benchmark-json=benchmark-results.json`), then `scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json`. CI fails when any **gated** benchmark mean exceeds its baseline by more than **20%**.
+
+**Gated:** parse medium/large, export 10/50/100 sessions.
+
+**Not gated (informational only):** `test_parse_session_small`, `test_search_full_corpus` (sub-ms CI noise), and the `cache` group. Benchmarks without a baseline entry print a warning and do not fail the gate.
 
 ## Refresh baselines
 
-After intentional performance work, copy key means from a local run into `baselines.json` with a date and machine note. This file is informational only; CI does not compare against it.
+After intentional performance work, capture on **ubuntu-latest** (same OS as the gated CI job):
+
+```bash
+make update-baselines
+```
+
+Or manually:
+
+```bash
+pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts=
+python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json
+```
+
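+Baselines must be captured on **ubuntu-latest** to match the gated CI runner; cross-OS variance causes spurious failures. If you cannot run on ubuntu locally, download the `benchmark-results` artifact from a CI run and reduce it with `python scripts/reduce_baselines.py benchmark-results.json benchmarks/baselines.json`.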
diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json
@@ -1,10 +1,17 @@
 {
-  "_note": "Informational snapshot only — CI does not gate on these values.",
-  "updated": null,
-  "machine": null,
+  "_note": "Gated means from ubuntu-latest CI benchmark-results.json (post-cache PR #90). Excluded from gate: test_parse_session_small, test_search_full_corpus (sub-ms CI noise). Refresh via make update-baselines on ubuntu.",
+  "updated": "2026-06-17T21:00:00Z",
+  "machine": "Linux",
   "groups": {
-    "parse": {},
-    "export": {},
+    "parse": {
+      "test_parse_session_medium": 0.002956,
+      "test_parse_session_large": 0.029678
+    },
+    "export": {
+      "test_bulk_export_session_count[sessions-10]": 0.004278,
+      "test_bulk_export_session_count[sessions-50]": 0.021144,
+      "test_bulk_export_session_count[sessions-100]": 0.042003
+    },
     "search": {}
   }
 }
diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py
@@ -0,0 +1,154 @@
+"""Compare pytest-benchmark JSON output against stored baselines."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+# Default maximum allowed ratio of current mean to baseline mean; a ratio
+# above 1.20 (more than 20% over baseline) is reported as a regression.
+THRESHOLD = 1.20
+
+# Sub-ms timings are too noisy for a fixed 20% gate on ubuntu CI.
+# Names listed here are skipped by the regression check even when a
+# baseline or a current result exists for them.
+EXCLUDED_FROM_GATE = frozenset(
+    {
+        "test_parse_session_small",
+        "test_search_full_corpus",
+    }
+)
+
+
+class BenchmarkDataError(ValueError):
+    """Signal that a benchmark JSON document is unreadable, malformed, or incomplete.
+
+    The command-line entry points catch this error, print it to stderr, and
+    return exit status 2.
+    """
+
+
+def load_results(results_path: str | Path) -> dict[str, float]:
+    """Load per-benchmark mean timings from a pytest-benchmark JSON report.
+
+    Args:
+        results_path: Path to the file written by ``--benchmark-json``.
+
+    Returns:
+        Mapping of benchmark name to its ``stats.mean`` value as a float.
+
+    Raises:
+        BenchmarkDataError: If the file cannot be read or decoded, is not
+            valid JSON, lacks a top-level ``benchmarks`` array, contains an
+            entry without ``name`` or a numeric ``stats.mean``, or repeats a
+            benchmark name.
+    """
+    path = Path(results_path)
+    try:
+        data = json.loads(path.read_text(encoding="utf-8"))
+    except OSError as exc:
+        raise BenchmarkDataError(f"cannot read {path}: {exc}") from exc
+    except UnicodeDecodeError as exc:
+        # read_text() raises this (a ValueError, not an OSError) for bytes
+        # that are not UTF-8; report it as bad input instead of a traceback.
+        raise BenchmarkDataError(f"cannot decode {path} as UTF-8: {exc}") from exc
+    except json.JSONDecodeError as exc:
+        raise BenchmarkDataError(f"invalid JSON in {path}: {exc}") from exc
+    try:
+        benchmarks = data["benchmarks"]
+    except (KeyError, TypeError) as exc:
+        raise BenchmarkDataError(f"{path} missing top-level 'benchmarks' array") from exc
+    if not isinstance(benchmarks, list):
+        raise BenchmarkDataError(f"{path} 'benchmarks' must be an array")
+
+    results: dict[str, float] = {}
+    for index, entry in enumerate(benchmarks):
+        if not isinstance(entry, dict):
+            raise BenchmarkDataError(f"{path} benchmarks[{index}] must be an object")
+        try:
+            name = entry["name"]
+            mean = float(entry["stats"]["mean"])
+        except (KeyError, TypeError, ValueError) as exc:
+            raise BenchmarkDataError(
+                f"{path} benchmarks[{index}] missing 'name' or 'stats.mean'"
+            ) from exc
+        name = str(name)
+        # A repeated name would make the comparison ambiguous, so reject it.
+        if name in results:
+            raise BenchmarkDataError(f"{path} duplicate benchmark name {name!r}")
+        results[name] = mean
+    return results
+
+
+def load_baseline_means(baselines_path: str | Path) -> dict[str, float]:
+    """Flatten the ``groups`` section of a baselines file into name -> mean.
+
+    Args:
+        baselines_path: Path to the baselines JSON file.
+
+    Returns:
+        Mapping of benchmark name to its baseline mean, merged across groups.
+
+    Raises:
+        BenchmarkDataError: If the file cannot be read or decoded, is not
+            valid JSON, its root or ``groups`` value is not an object, a name
+            appears in more than one group, or a mean is not a finite,
+            non-negative number.
+    """
+    import math
+
+    path = Path(baselines_path)
+    try:
+        data = json.loads(path.read_text(encoding="utf-8"))
+    except OSError as exc:
+        raise BenchmarkDataError(f"cannot read {path}: {exc}") from exc
+    except UnicodeDecodeError as exc:
+        # read_text() raises this (a ValueError, not an OSError) for bytes
+        # that are not UTF-8; report it as bad input instead of a traceback.
+        raise BenchmarkDataError(f"cannot decode {path} as UTF-8: {exc}") from exc
+    except json.JSONDecodeError as exc:
+        raise BenchmarkDataError(f"invalid JSON in {path}: {exc}") from exc
+    if not isinstance(data, dict):
+        raise BenchmarkDataError(f"{path} root value must be an object")
+
+    if "groups" not in data:
+        raise BenchmarkDataError(f"{path} missing required 'groups' key")
+    groups = data["groups"]
+    if not isinstance(groups, dict):
+        raise BenchmarkDataError(f"{path} 'groups' must be an object")
+
+    means: dict[str, float] = {}
+    for group_name, value in groups.items():
+        if not isinstance(value, dict):
+            # Group values that are not objects are ignored, not rejected.
+            continue
+        for name, mean in value.items():
+            name = str(name)
+            if name in means:
+                raise BenchmarkDataError(f"{path} duplicate benchmark name {name!r} across groups")
+            try:
+                parsed = float(mean)
+            except (TypeError, ValueError) as exc:
+                raise BenchmarkDataError(
+                    f"{path} groups[{group_name!r}][{name!r}] is not a numeric mean"
+                ) from exc
+            # json.loads accepts NaN/Infinity; those (and negative values) make
+            # every "ratio > threshold" comparison false, which would silently
+            # switch the gate off for this benchmark.
+            if not math.isfinite(parsed) or parsed < 0:
+                raise BenchmarkDataError(
+                    f"{path} groups[{group_name!r}][{name!r}] must be a finite, non-negative mean"
+                )
+            means[name] = parsed
+    return means
+
+
+def check_regression(
+    results_path: str | Path,
+    baselines_path: str | Path,
+    *,
+    threshold: float = THRESHOLD,
+) -> int:
+    """Compare current benchmark means with their baselines and report the outcome.
+
+    One line is printed per compared benchmark, plus a warning for every
+    baseline without a current result, every zero baseline, and every current
+    result without a baseline. Names in ``EXCLUDED_FROM_GATE`` are ignored.
+
+    Args:
+        results_path: pytest-benchmark JSON report for the current run.
+        baselines_path: Baselines JSON file to compare against.
+        threshold: Largest accepted ``current / baseline`` ratio.
+
+    Returns:
+        0 when no compared benchmark exceeds ``threshold``, otherwise 1.
+    """
+    current_means = load_results(results_path)
+    baselines = load_baseline_means(baselines_path)
+
+    regressed: list[str] = []
+    for bench, baseline in baselines.items():
+        if bench in EXCLUDED_FROM_GATE:
+            continue
+        current = current_means.get(bench)
+        if current is None:
+            print(f"WARN: no current result for baseline {bench!r}; skipping")
+        elif baseline == 0:
+            # A zero baseline cannot be used as a divisor.
+            print(f"WARN: baseline for {bench!r} is zero; skipping ratio check")
+        else:
+            ratio = current / baseline
+            too_slow = ratio > threshold
+            label = "FAIL" if too_slow else "ok"
+            print(f"[{label}] {bench}: {current:.6f}s vs {baseline:.6f}s ({ratio:.2f}x)")
+            if too_slow:
+                regressed.append(bench)
+
+    # Results that have no baseline are reported but never fail the gate.
+    unbaselined = [
+        bench
+        for bench in current_means
+        if bench not in EXCLUDED_FROM_GATE and bench not in baselines
+    ]
+    for bench in unbaselined:
+        print(f"WARN: {bench!r} has no baseline yet; not gated")
+
+    if not regressed:
+        return 0
+    print(f"\nREGRESSION: {len(regressed)} benchmark(s) exceeded {threshold:.0%}")
+    return 1
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Parse command-line arguments and run the regression check.
+
+    Args:
+        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.
+
+    Returns:
+        The status from ``check_regression``, or 2 when the input files are
+        unreadable or malformed.
+    """
+    cli = argparse.ArgumentParser(description=__doc__)
+    cli.add_argument("results_path", help="pytest-benchmark --benchmark-json output")
+    cli.add_argument("baselines_path", help="path to benchmarks/baselines.json")
+    cli.add_argument(
+        "--threshold",
+        type=float,
+        default=THRESHOLD,
+        help="fail when current mean exceeds baseline by more than this ratio (default: 1.20)",
+    )
+    options = cli.parse_args(argv)
+    try:
+        status = check_regression(
+            options.results_path,
+            options.baselines_path,
+            threshold=options.threshold,
+        )
+    except BenchmarkDataError as exc:
+        # Bad input is reported on stderr with a status distinct from "regressed".
+        print(f"ERROR: {exc}", file=sys.stderr)
+        return 2
+    return status
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/scripts/reduce_baselines.py b/scripts/reduce_baselines.py
@@ -0,0 +1,104 @@
+"""Reduce pytest-benchmark JSON into benchmarks/baselines.json."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from datetime import UTC, datetime
+from pathlib import Path
+
+try:
+    from scripts.check_benchmark_regression import EXCLUDED_FROM_GATE, BenchmarkDataError
+except ModuleNotFoundError:
+    from check_benchmark_regression import EXCLUDED_FROM_GATE, BenchmarkDataError
+
+# Benchmark groups written to the baselines file; entries whose "group" is
+# not listed here are dropped during reduction.
+GATED_GROUPS = ("parse", "export", "search")
+
+
+def _positive_float(value: str) -> float:
+    """Parse an argparse option value as a finite float greater than zero.
+
+    Args:
+        value: Raw command-line text.
+
+    Returns:
+        The parsed float.
+
+    Raises:
+        ValueError: If ``value`` is not a float literal (argparse turns this
+            into its own "invalid value" usage error).
+        argparse.ArgumentTypeError: If the number is NaN, infinite, zero, or
+            negative.
+    """
+    import math
+
+    parsed = float(value)
+    # float() accepts "nan" and "inf", and "nan <= 0" is False, so check
+    # finiteness explicitly before the sign test.
+    if not math.isfinite(parsed):
+        raise argparse.ArgumentTypeError("slack must be a finite number greater than zero")
+    if parsed <= 0:
+        raise argparse.ArgumentTypeError("slack must be greater than zero")
+    return parsed
+
+
+def reduce_baselines(
+    raw_path: str | Path,
+    out_path: str | Path,
+    *,
+    slack: float = 1.0,
+) -> dict[str, object]:
+    """Reduce a pytest-benchmark JSON report to a baselines file.
+
+    Only entries whose ``group`` is in ``GATED_GROUPS`` and whose name is not
+    in ``EXCLUDED_FROM_GATE`` are kept; each kept mean is multiplied by
+    ``slack``.
+
+    Args:
+        raw_path: pytest-benchmark ``--benchmark-json`` output to read.
+        out_path: Destination baselines JSON path (overwritten).
+        slack: Factor applied to every kept mean.
+
+    Returns:
+        The document that was written to ``out_path``.
+
+    Raises:
+        BenchmarkDataError: If the input cannot be read or decoded, is not
+            valid JSON, lacks a ``benchmarks`` array, contains an entry
+            without ``name`` or a numeric ``stats.mean``, repeats a kept
+            benchmark name, or the output cannot be written.
+    """
+    path = Path(raw_path)
+    try:
+        raw = json.loads(path.read_text(encoding="utf-8"))
+    except json.JSONDecodeError as exc:
+        raise BenchmarkDataError(f"invalid JSON in {path}: {exc}") from exc
+    except UnicodeDecodeError as exc:
+        # read_text() raises this (a ValueError, not an OSError) for bytes
+        # that are not UTF-8; report it as bad input instead of a traceback.
+        raise BenchmarkDataError(f"cannot decode {path} as UTF-8: {exc}") from exc
+    except OSError as exc:
+        raise BenchmarkDataError(f"cannot read {path}: {exc}") from exc
+
+    try:
+        entries = raw["benchmarks"]
+    except (KeyError, TypeError) as exc:
+        raise BenchmarkDataError(f"{path} missing top-level 'benchmarks' array") from exc
+    if not isinstance(entries, list):
+        raise BenchmarkDataError(f"{path} 'benchmarks' must be an array")
+
+    groups: dict[str, dict[str, float]] = {group: {} for group in GATED_GROUPS}
+    kept_names: set[str] = set()
+    for index, entry in enumerate(entries):
+        if not isinstance(entry, dict):
+            raise BenchmarkDataError(f"{path} benchmarks[{index}] must be an object")
+        try:
+            name = entry["name"]
+            mean = float(entry["stats"]["mean"])
+        except (KeyError, TypeError, ValueError) as exc:
+            raise BenchmarkDataError(
+                f"{path} benchmarks[{index}] missing 'name' or 'stats.mean'"
+            ) from exc
+        key = str(name)
+        group = entry.get("group")
+        if group not in GATED_GROUPS or key in EXCLUDED_FROM_GATE:
+            continue
+        # Names must be unique across all groups: the regression checker
+        # refuses a baselines file that repeats a name, and overwriting an
+        # earlier entry would silently discard a measurement.
+        if key in kept_names:
+            raise BenchmarkDataError(f"{path} duplicate benchmark name {key!r}")
+        kept_names.add(key)
+        groups[group][key] = mean * slack
+
+    machine_info = raw.get("machine_info")
+    machine = machine_info.get("system") if isinstance(machine_info, dict) else None
+    output: dict[str, object] = {
+        "_note": (
+            "Gated means from ubuntu-latest CI (post-cache). "
+            "Excluded from gate: test_parse_session_small, test_search_full_corpus (CI noise)."
+        ),
+        "updated": datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ"),
+        "machine": machine,
+        "groups": groups,
+    }
+    out = Path(out_path)
+    try:
+        out.write_text(json.dumps(output, indent=2) + "\n", encoding="utf-8")
+    except OSError as exc:
+        raise BenchmarkDataError(f"cannot write {out}: {exc}") from exc
+    return output
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Parse command-line arguments and write the reduced baselines file.
+
+    Args:
+        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.
+
+    Returns:
+        0 on success, or 2 when the input or output file cannot be processed.
+    """
+    cli = argparse.ArgumentParser(description=__doc__)
+    cli.add_argument("raw_path", help="pytest-benchmark --benchmark-json output")
+    cli.add_argument("out_path", help="destination baselines.json path")
+    cli.add_argument(
+        "--slack",
+        type=_positive_float,
+        default=1.0,
+        help="multiply means by this factor (must be > 0)",
+    )
+    options = cli.parse_args(argv)
+    try:
+        reduce_baselines(options.raw_path, options.out_path, slack=options.slack)
+    except BenchmarkDataError as exc:
+        # Bad input or an unwritable destination is reported on stderr.
+        print(f"ERROR: {exc}", file=sys.stderr)
+        return 2
+    else:
+        return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())