diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 60c75bb..6611ad2 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -207,7 +207,7 @@ jobs:
       - run: npm run test:coverage
 
   benchmarks:
-    name: Performance benchmarks (informational)
+    name: Performance benchmarks (gated)
     runs-on: ubuntu-latest
     permissions:
       contents: read
@@ -235,7 +235,11 @@ jobs:
           --benchmark-columns=min,max,mean,stddev,rounds
           -o addopts=
 
+      - name: Regression gate  # non-zero exit fails the job: 1 = regression, 2 = bad input JSON
+        run: python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json
+
       - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4
+        if: always()
         with:
           name: benchmark-results
           path: benchmark-results.json
diff --git a/.gitignore b/.gitignore
index 8707cd8..27a84ea 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,3 +14,5 @@ node_modules/
 .coverage
 coverage/
 coverage.xml
+benchmark-results.json
+benchmarks/_raw.json
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..b14f3d6
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,12 @@
+.PHONY: update-baselines check-benchmarks clean-benchmark-artifacts
+
+update-baselines:  # regenerate benchmarks/baselines.json from a fresh local benchmark run
+	pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts=
+	python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json
+
+check-benchmarks:  # run the benchmarks, then apply the regression gate script that CI uses
+	pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmark-results.json -o addopts=
+	python scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json
+
+clean-benchmark-artifacts:  # remove the git-ignored benchmark JSON outputs
+	rm -f benchmarks/_raw.json benchmark-results.json
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 75ff9cc..59f70fc 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -1,6 +1,6 @@
 # Performance benchmarks
 
-Test files live under `tests/benchmarks/`; this directory holds only documentation and the informational `baselines.json` snapshot.
+Test files live under `tests/benchmarks/`; this directory holds documentation and `baselines.json` for the CI regression gate.
 
 Repeatable local measurements for parse, bulk export, and search hot paths.
 
@@ -26,6 +26,7 @@ The memory test also runs as part of the normal `pytest` suite (timing benchmark
 | parse | `parse_session` on 10 / 500 / 5000+ line JSONL |
 | export | `run_bulk_export` over 10 / 50 / 100 sessions |
 | search | `GET /api/search` over a 50-session synthetic corpus |
+| cache | cold vs warm `get_cached_session` (informational; not gated) |
 
 Large JSONL files (5000+ lines) are generated at test session scope under pytest's temp directory — not committed to git.
 
@@ -33,10 +34,27 @@ Corpora repeat one row from `tests/fixtures/session_with_tools.jsonl`, so parse/
 
 The memory test (`test_parse_memory.py`) is intentionally **not** skipped by `--benchmark-skip`; it runs in the main `pytest` job and builds the session-scoped 5000-line fixture once per session.
 
-## CI
+## CI gate
 
-The `benchmarks` workflow job uploads `benchmark-results.json` as a downloadable artifact. There is no regression gate yet.
+The `benchmarks` job on **ubuntu-latest** runs pytest-benchmark (`--benchmark-json=benchmark-results.json`), then `scripts/check_benchmark_regression.py benchmark-results.json benchmarks/baselines.json`. CI fails when any **gated** benchmark mean exceeds its baseline by more than **20%**.
+
+**Gated:** parse medium/large, export 10/50/100 sessions.
+
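+**Not gated (informational only):** `test_parse_session_small`, `test_search_full_corpus` (sub-ms CI noise), and the `cache` group. Benchmarks without a baseline entry print a warning and do not fail the gate. A baseline with no matching result in the current run is likewise skipped with a warning, so renaming or removing a gated benchmark requires refreshing `baselines.json`.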
 
 ## Refresh baselines
 
-After intentional performance work, copy key means from a local run into `baselines.json` with a date and machine note. This file is informational only; CI does not compare against it.
+After intentional performance work, capture new baselines on **ubuntu-latest** (same OS as the gated CI job):
+
+```bash
+make update-baselines
+```
+
+Or manually:
+
+```bash
+pytest tests/benchmarks/ --benchmark-only --benchmark-json=benchmarks/_raw.json -o addopts=
+python scripts/reduce_baselines.py benchmarks/_raw.json benchmarks/baselines.json
+```
+
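+Baselines must be captured on **ubuntu-latest** to match the gated CI runner; cross-OS variance causes spurious gate failures. If you cannot run on ubuntu-latest locally, download the `benchmark-results.json` artifact from a CI run and pass it to `scripts/reduce_baselines.py` as the raw input instead of `benchmarks/_raw.json`.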
diff --git a/benchmarks/baselines.json b/benchmarks/baselines.json
index 123a2b4..813a72f 100644
--- a/benchmarks/baselines.json
+++ b/benchmarks/baselines.json
@@ -1,10 +1,17 @@
 {
-  "_note": "Informational snapshot only — CI does not gate on these values.",
-  "updated": null,
-  "machine": null,
+  "_note": "Gated means from ubuntu-latest CI benchmark-results.json (post-cache PR #90). Excluded from gate: test_parse_session_small, test_search_full_corpus (sub-ms CI noise). Refresh via make update-baselines on ubuntu.",
+  "updated": "2026-06-17T21:00:00Z",
+  "machine": "Linux",
   "groups": {
-    "parse": {},
-    "export": {},
+    "parse": {
+      "test_parse_session_medium": 0.002956,
+      "test_parse_session_large": 0.029678
+    },
+    "export": {
+      "test_bulk_export_session_count[sessions-10]": 0.004278,
+      "test_bulk_export_session_count[sessions-50]": 0.021144,
+      "test_bulk_export_session_count[sessions-100]": 0.042003
+    },
     "search": {}
   }
 }
diff --git a/scripts/check_benchmark_regression.py b/scripts/check_benchmark_regression.py
new file mode 100644
index 0000000..7842021
--- /dev/null
+++ b/scripts/check_benchmark_regression.py
@@ -0,0 +1,154 @@
+"""Compare pytest-benchmark JSON output against stored baselines."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+THRESHOLD = 1.20  # max allowed current/baseline mean ratio; above this the gate fails
+
+# Sub-ms timings are too noisy for a fixed 20% gate on ubuntu CI.
+EXCLUDED_FROM_GATE = frozenset(
+    {
+        "test_parse_session_small",
+        "test_search_full_corpus",
+    }
+)
+
+
+class BenchmarkDataError(ValueError):
+    """Raised when benchmark JSON is unreadable, malformed, or missing required fields."""
+
+
+def load_results(results_path: str | Path) -> dict[str, float]:
+    """Map each benchmark name to its ``stats.mean`` from pytest-benchmark JSON output."""
+    path = Path(results_path)
+    try:
+        data = json.loads(path.read_text(encoding="utf-8"))
+    except OSError as exc:  # missing or unreadable file
+        raise BenchmarkDataError(f"cannot read {path}: {exc}") from exc
+    except json.JSONDecodeError as exc:
+        raise BenchmarkDataError(f"invalid JSON in {path}: {exc}") from exc
+    try:
+        benchmarks = data["benchmarks"]
+    except (KeyError, TypeError) as exc:  # TypeError: the root value is not a JSON object
+        raise BenchmarkDataError(f"{path} missing top-level 'benchmarks' array") from exc
+    if not isinstance(benchmarks, list):
+        raise BenchmarkDataError(f"{path} 'benchmarks' must be an array")
+    results: dict[str, float] = {}  # benchmark name -> mean
+    for index, entry in enumerate(benchmarks):
+        if not isinstance(entry, dict):
+            raise BenchmarkDataError(f"{path} benchmarks[{index}] must be an object")
+        try:
+            name = entry["name"]
+            mean = float(entry["stats"]["mean"])
+        except (KeyError, TypeError, ValueError) as exc:
+            raise BenchmarkDataError(
+                f"{path} benchmarks[{index}] missing 'name' or 'stats.mean'"
+            ) from exc
+        name = str(name)
+        if name in results:  # a repeated name would silently overwrite the earlier mean
+            raise BenchmarkDataError(f"{path} duplicate benchmark name {name!r}")
+        results[name] = mean
+    return results
+
+
+def load_baseline_means(baselines_path: str | Path) -> dict[str, float]:
+    """Flatten the ``groups`` object of a baselines file into ``{benchmark name: mean}``."""
+    path = Path(baselines_path)
+    try:
+        data = json.loads(path.read_text(encoding="utf-8"))
+    except OSError as exc:
+        raise BenchmarkDataError(f"cannot read {path}: {exc}") from exc
+    except json.JSONDecodeError as exc:
+        raise BenchmarkDataError(f"invalid JSON in {path}: {exc}") from exc
+    if not isinstance(data, dict):
+        raise BenchmarkDataError(f"{path} root value must be an object")
+    if "groups" not in data:
+        raise BenchmarkDataError(f"{path} missing required 'groups' key")
+    groups = data["groups"]
+    if not isinstance(groups, dict):
+        raise BenchmarkDataError(f"{path} 'groups' must be an object")
+
+    means: dict[str, float] = {}  # flattened across groups, so names must be unique file-wide
+    for group_name, value in groups.items():
+        if not isinstance(value, dict):  # NOTE(review): non-object groups are silently ignored
+            continue
+        for name, mean in value.items():
+            name = str(name)
+            if name in means:
+                raise BenchmarkDataError(f"{path} duplicate benchmark name {name!r} across groups")
+            try:
+                means[name] = float(mean)
+            except (TypeError, ValueError) as exc:
+                raise BenchmarkDataError(
+                    f"{path} groups[{group_name!r}][{name!r}] is not a numeric mean"
+                ) from exc
+    return means
+
+
+def check_regression(
+    results_path: str | Path,
+    baselines_path: str | Path,
+    *,
+    threshold: float = THRESHOLD,
+) -> int:
+    """Return 0 when every gated benchmark is within ``threshold``; 1 when any regresses.
+
+    Regressed means current / baseline > threshold; unmatched or zero baselines only warn.
+    """
+    flat = load_results(results_path)  # both loaders raise BenchmarkDataError on bad input
+    baseline_means = load_baseline_means(baselines_path)
+    failures: list[str] = []
+    for name, base in baseline_means.items():
+        if name in EXCLUDED_FROM_GATE:
+            continue
+        cur = flat.get(name)
+        if cur is None:
+            print(f"WARN: no current result for baseline {name!r}; skipping")
+            continue
+        if base == 0:  # guard the division below
+            print(f"WARN: baseline for {name!r} is zero; skipping ratio check")
+            continue
+        ratio = cur / base
+        tag = "FAIL" if ratio > threshold else "ok"  # strict ">": exactly at threshold passes
+        print(f"[{tag}] {name}: {cur:.6f}s vs {base:.6f}s ({ratio:.2f}x)")
+        if ratio > threshold:
+            failures.append(name)
+
+    for name in flat:  # report-only: benchmarks that ran but have no baseline to gate against
+        if name not in EXCLUDED_FROM_GATE and name not in baseline_means:
+            print(f"WARN: {name!r} has no baseline yet; not gated")
+
+    if failures:
+        print(f"\nREGRESSION: {len(failures)} benchmark(s) exceeded {threshold:.0%} of baseline")
+        return 1
+    return 0
+
+
+def main(argv: list[str] | None = None) -> int:
+    """Run the regression gate from the command line and return the process exit status.
+
+    0 = within threshold, 1 = regression detected, 2 = unreadable or malformed input file.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("results_path", help="pytest-benchmark --benchmark-json output")
+    parser.add_argument("baselines_path", help="path to benchmarks/baselines.json")
+    parser.add_argument(
+        "--threshold",
+        type=float,
+        default=THRESHOLD,
+        help=f"fail when current/baseline mean ratio exceeds this (default: {THRESHOLD:.2f})",
+    )
+    args = parser.parse_args(argv)
+    try:
+        return check_regression(args.results_path, args.baselines_path, threshold=args.threshold)
+    except BenchmarkDataError as exc:
+        print(f"ERROR: {exc}", file=sys.stderr)
+        return 2
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/scripts/reduce_baselines.py b/scripts/reduce_baselines.py
new file mode 100644
index 0000000..88be0eb
--- /dev/null
+++ b/scripts/reduce_baselines.py
@@ -0,0 +1,104 @@
+"""Reduce pytest-benchmark JSON into benchmarks/baselines.json."""
+
+from __future__ import annotations
+
+import argparse
+import json
+import sys
+from datetime import UTC, datetime
+from pathlib import Path
+
+try:
+    from scripts.check_benchmark_regression import EXCLUDED_FROM_GATE, BenchmarkDataError
+except ModuleNotFoundError:
+    from check_benchmark_regression import EXCLUDED_FROM_GATE, BenchmarkDataError
+
+GATED_GROUPS = ("parse", "export", "search")  # entry "group" values kept; all others are dropped
+
+
+def _positive_float(value: str) -> float:  # argparse ``type=`` callback for --slack
+    parsed = float(value)  # a ValueError here is reported by argparse as an invalid value
+    if not 0 < parsed < float("inf"):  # nan fails both comparisons, so it is rejected too
+        raise argparse.ArgumentTypeError("slack must be a finite number greater than zero")
+    return parsed
+
+
+def reduce_baselines(
+    raw_path: str | Path,
+    out_path: str | Path,
+    *,
+    slack: float = 1.0,
+) -> dict[str, object]:
+    """Reduce pytest-benchmark JSON at ``raw_path`` to a baselines file at ``out_path``."""
+    path = Path(raw_path)
+    try:
+        raw = json.loads(path.read_text(encoding="utf-8"))
+    except json.JSONDecodeError as exc:
+        raise BenchmarkDataError(f"invalid JSON in {path}: {exc}") from exc
+    except OSError as exc:
+        raise BenchmarkDataError(f"cannot read {path}: {exc}") from exc
+    try:
+        entries = raw["benchmarks"]
+    except (KeyError, TypeError) as exc:
+        raise BenchmarkDataError(f"{path} missing top-level 'benchmarks' array") from exc
+    if not isinstance(entries, list):
+        raise BenchmarkDataError(f"{path} 'benchmarks' must be an array")
+    # Keep only entries in a gated group whose name is not excluded from the gate.
+    groups: dict[str, dict[str, float]] = {group: {} for group in GATED_GROUPS}
+    for index, entry in enumerate(entries):
+        if not isinstance(entry, dict):
+            raise BenchmarkDataError(f"{path} benchmarks[{index}] must be an object")
+        try:
+            name = entry["name"]
+            mean = float(entry["stats"]["mean"])
+        except (KeyError, TypeError, ValueError) as exc:
+            raise BenchmarkDataError(
+                f"{path} benchmarks[{index}] missing 'name' or 'stats.mean'"
+            ) from exc
+        group = entry.get("group")
+        if group not in GATED_GROUPS:  # e.g. the informational "cache" group
+            continue
+        if str(name) in EXCLUDED_FROM_GATE:
+            continue
+        groups[group][str(name)] = mean * slack  # slack > 1 stores a looser baseline
+    # Stamp the output with the reduction time and the reported machine_info.system.
+    machine_info = raw.get("machine_info")
+    machine = machine_info.get("system") if isinstance(machine_info, dict) else None
+    output: dict[str, object] = {
+        "_note": (  # NOTE(review): fixed text, written even if the run was not on ubuntu-latest
+            "Gated means from ubuntu-latest CI (post-cache). "
+            "Excluded from gate: test_parse_session_small, test_search_full_corpus (CI noise)."
+        ),
+        "updated": datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%SZ"),
+        "machine": machine,
+        "groups": groups,
+    }
+    out = Path(out_path)
+    try:
+        out.write_text(json.dumps(output, indent=2) + "\n", encoding="utf-8")
+    except OSError as exc:
+        raise BenchmarkDataError(f"cannot write {out}: {exc}") from exc
+    return output
+
+
+def main(argv: list[str] | None = None) -> int:  # exit status: 0 = written, 2 = BenchmarkDataError
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("raw_path", help="pytest-benchmark --benchmark-json output")
+    parser.add_argument("out_path", help="destination baselines.json path")
+    parser.add_argument(
+        "--slack",
+        type=_positive_float,
+        default=1.0,
+        help="multiply means by this factor (must be > 0)",
+    )
+    args = parser.parse_args(argv)  # argparse itself exits with status 2 on an invalid --slack
+    try:
+        reduce_baselines(args.raw_path, args.out_path, slack=args.slack)
+    except BenchmarkDataError as exc:
+        print(f"ERROR: {exc}", file=sys.stderr)
+        return 2
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/tests/test_check_benchmark_regression.py b/tests/test_check_benchmark_regression.py
new file mode 100644
index 0000000..49c4f49
--- /dev/null
+++ b/tests/test_check_benchmark_regression.py
@@ -0,0 +1,193 @@
+"""Tests for scripts/check_benchmark_regression.py."""
+
+from __future__ import annotations
+
+import json
+
+import pytest
+
+from scripts.check_benchmark_regression import (
+    BenchmarkDataError,
+    check_regression,
+    load_baseline_means,
+    load_results,
+)
+
+GATED_BENCH = "test_parse_session_medium"
+
+
+def _write_results(path, benchmarks: list[dict]) -> None:
+    path.write_text(
+        json.dumps({"benchmarks": benchmarks}, indent=2),
+        encoding="utf-8",
+    )
+
+
+def _write_baselines(path, groups: dict[str, dict[str, float]]) -> None:
+    path.write_text(
+        json.dumps({"groups": groups}, indent=2),
+        encoding="utf-8",
+    )
+
+
+def test_missing_baseline_warns_without_failing(
+    tmp_path, capsys: pytest.CaptureFixture[str]
+) -> None:
+    results = tmp_path / "results.json"
+    baselines = tmp_path / "baselines.json"
+    _write_results(
+        results,
+        [
+            {"name": "test_new_bench", "stats": {"mean": 0.01}},
+            {"name": "test_parse_session_small", "stats": {"mean": 0.0001}},
+        ],
+    )
+    _write_baselines(
+        baselines,
+        {"parse": {"test_parse_session_small": 0.0001}},
+    )
+
+    assert check_regression(results, baselines) == 0
+    out = capsys.readouterr().out
+    assert "WARN: 'test_new_bench' has no baseline yet" in out
+
+
+def test_regression_over_threshold_fails(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
+    results = tmp_path / "results.json"
+    baselines = tmp_path / "baselines.json"
+    _write_results(
+        results,
+        [{"name": GATED_BENCH, "stats": {"mean": 0.0025}}],
+    )
+    _write_baselines(
+        baselines,
+        {"parse": {GATED_BENCH: 0.002}},
+    )
+
+    assert check_regression(results, baselines) == 1
+    out = capsys.readouterr().out
+    assert "REGRESSION" in out
+
+
+def test_within_threshold_passes(tmp_path) -> None:
+    results = tmp_path / "results.json"
+    baselines = tmp_path / "baselines.json"
+    _write_results(
+        results,
+        [{"name": GATED_BENCH, "stats": {"mean": 0.0022}}],
+    )
+    _write_baselines(
+        baselines,
+        {"parse": {GATED_BENCH: 0.002}},
+    )
+
+    assert check_regression(results, baselines) == 0
+
+
+def test_load_results_rejects_malformed_json(tmp_path) -> None:
+    path = tmp_path / "bad.json"
+    path.write_text("{not json", encoding="utf-8")
+    with pytest.raises(BenchmarkDataError, match="invalid JSON"):
+        load_results(path)
+
+
+def test_load_results_requires_benchmarks_array(tmp_path) -> None:
+    path = tmp_path / "results.json"
+    path.write_text("{}", encoding="utf-8")
+    with pytest.raises(BenchmarkDataError, match="'benchmarks' array"):
+        load_results(path)
+
+
+def test_load_results_rejects_missing_file(tmp_path) -> None:
+    with pytest.raises(BenchmarkDataError, match="cannot read"):
+        load_results(tmp_path / "missing.json")
+
+
+def test_zero_baseline_skips_ratio_check(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
+    results = tmp_path / "results.json"
+    baselines = tmp_path / "baselines.json"
+    _write_results(
+        results,
+        [{"name": GATED_BENCH, "stats": {"mean": 0.0025}}],
+    )
+    _write_baselines(
+        baselines,
+        {"parse": {GATED_BENCH: 0.0}},
+    )
+
+    assert check_regression(results, baselines) == 0
+    assert f"baseline for '{GATED_BENCH}' is zero" in capsys.readouterr().out
+
+
+def test_exactly_at_threshold_passes(tmp_path) -> None:  # gate comparison is strict ">"
+    results = tmp_path / "results.json"
+    baselines = tmp_path / "baselines.json"
+    _write_results(
+        results,
+        [{"name": GATED_BENCH, "stats": {"mean": 0.0024}}],  # 0.0024 / 0.002 == 1.2 exactly
+    )
+    _write_baselines(
+        baselines,
+        {"parse": {GATED_BENCH: 0.002}},
+    )
+
+    assert check_regression(results, baselines) == 0
+
+
+def test_excluded_benchmark_in_baselines_is_not_gated(
+    tmp_path, capsys: pytest.CaptureFixture[str]
+) -> None:
+    results = tmp_path / "results.json"
+    baselines = tmp_path / "baselines.json"
+    _write_results(
+        results,
+        [{"name": "test_parse_session_small", "stats": {"mean": 0.001}}],
+    )
+    _write_baselines(
+        baselines,
+        {"parse": {"test_parse_session_small": 0.0001}},
+    )
+
+    assert check_regression(results, baselines) == 0
+    assert "REGRESSION" not in capsys.readouterr().out
+
+
+def test_missing_current_result_warns_without_failing(
+    tmp_path, capsys: pytest.CaptureFixture[str]
+) -> None:
+    results = tmp_path / "results.json"
+    baselines = tmp_path / "baselines.json"
+    _write_results(results, [])
+    _write_baselines(
+        baselines,
+        {"parse": {GATED_BENCH: 0.002}},
+    )
+
+    assert check_regression(results, baselines) == 0
+    assert "no current result for baseline" in capsys.readouterr().out
+
+
+def test_main_reports_benchmark_data_error(tmp_path, capsys: pytest.CaptureFixture[str]) -> None:
+    from scripts.check_benchmark_regression import main
+
+    bad = tmp_path / "bad.json"
+    bad.write_text("{}", encoding="utf-8")
+    baselines = tmp_path / "baselines.json"
+    _write_baselines(baselines, {"parse": {GATED_BENCH: 0.002}})
+
+    assert main([str(bad), str(baselines)]) == 2
+    assert "ERROR:" in capsys.readouterr().err
+
+
+def test_duplicate_baseline_name_raises(tmp_path) -> None:
+    baselines = tmp_path / "baselines.json"
+    _write_baselines(
+        baselines,
+        {
+            "parse": {"test_parse_session_medium": 0.002},
+            "export": {"test_parse_session_medium": 0.003},
+        },
+    )
+
+    with pytest.raises(BenchmarkDataError, match="duplicate benchmark name"):
+        load_baseline_means(baselines)
diff --git a/tests/test_reduce_baselines.py b/tests/test_reduce_baselines.py
new file mode 100644
index 0000000..8919b84
--- /dev/null
+++ b/tests/test_reduce_baselines.py
@@ -0,0 +1,102 @@
+"""Tests for scripts/reduce_baselines.py."""
+
+from __future__ import annotations
+
+import json
+
+import pytest
+
+from scripts.check_benchmark_regression import BenchmarkDataError
+from scripts.reduce_baselines import reduce_baselines
+
+
+def _write_raw(path, benchmarks: list[dict], *, machine: str = "Linux") -> None:
+    path.write_text(
+        json.dumps(
+            {
+                "machine_info": {"system": machine},
+                "benchmarks": benchmarks,
+            },
+            indent=2,
+        ),
+        encoding="utf-8",
+    )
+
+
+def test_reduce_baselines_writes_gated_groups_only(tmp_path) -> None:
+    raw = tmp_path / "raw.json"
+    out = tmp_path / "baselines.json"
+    _write_raw(
+        raw,
+        [
+            {"group": "parse", "name": "test_parse_session_medium", "stats": {"mean": 0.002}},
+            {"group": "parse", "name": "test_parse_session_small", "stats": {"mean": 0.0001}},
+            {"group": "cache", "name": "test_cache_warm_hit", "stats": {"mean": 1e-05}},
+        ],
+    )
+
+    output = reduce_baselines(raw, out)
+
+    assert output["machine"] == "Linux"
+    assert "test_parse_session_medium" in output["groups"]["parse"]
+    assert "test_parse_session_small" not in output["groups"]["parse"]
+    assert "cache" not in output["groups"]
+
+
+def test_reduce_baselines_applies_slack(tmp_path) -> None:
+    raw = tmp_path / "raw.json"
+    out = tmp_path / "baselines.json"
+    _write_raw(
+        raw,
+        [{"group": "parse", "name": "test_parse_session_medium", "stats": {"mean": 0.002}}],
+    )
+
+    reduce_baselines(raw, out, slack=1.5)
+    data = json.loads(out.read_text(encoding="utf-8"))
+
+    assert data["groups"]["parse"]["test_parse_session_medium"] == pytest.approx(0.003)
+
+
+def test_reduce_baselines_rejects_missing_benchmarks_key(tmp_path) -> None:
+    raw = tmp_path / "raw.json"
+    raw.write_text("{}", encoding="utf-8")
+
+    with pytest.raises(BenchmarkDataError, match="'benchmarks' array"):
+        reduce_baselines(raw, tmp_path / "out.json")
+
+
+def test_reduce_baselines_cli_rejects_non_positive_slack(tmp_path) -> None:
+    from scripts.reduce_baselines import main
+
+    raw = tmp_path / "raw.json"
+    _write_raw(
+        raw,
+        [{"group": "parse", "name": "test_parse_session_small", "stats": {"mean": 0.0001}}],
+    )
+
+    with pytest.raises(SystemExit) as exc_info:  # argparse exits rather than returning
+        main([str(raw), str(tmp_path / "out.json"), "--slack", "0"])
+    assert exc_info.value.code == 2  # argparse's exit status for an invalid argument
+
+
+def test_reduce_baselines_machine_info_non_dict(tmp_path) -> None:
+    raw = tmp_path / "raw.json"
+    raw.write_text(
+        json.dumps(
+            {
+                "machine_info": "not-a-dict",
+                "benchmarks": [
+                    {
+                        "group": "parse",
+                        "name": "test_parse_session_medium",
+                        "stats": {"mean": 0.002},
+                    }
+                ],
+            }
+        ),
+        encoding="utf-8",
+    )
+
+    output = reduce_baselines(raw, tmp_path / "out.json")
+
+    assert output["machine"] is None