From 8c908af116846334eaab6b2c35a65e08041737fc Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Wed, 27 May 2026 23:04:37 +0200
Subject: [PATCH 01/68] docs:  update benchmark readme

---
 benchmarks/README.md | 114 ++++++++++++++++++-------------------------
 1 file changed, 48 insertions(+), 66 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 22ac73ce..c2d1df04 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -1,94 +1,76 @@
 # Internal Performance Benchmarks
 
-Measures linopy's own performance (build time, LP write speed, memory usage) across problem sizes using [pytest-benchmark](https://pytest-benchmark.readthedocs.io/) and [pytest-memray](https://pytest-memray.readthedocs.io/). Use these to check whether a code change introduces a regression or improvement.
+This suite benchmarks the **linopy part end-to-end** in two phases:
 
-> **Note:** The `benchmark/` directory (singular) contains *external* benchmarks comparing linopy against other modeling frameworks. This directory (`benchmarks/`) is for *internal* performance tracking only.
+1. **Build**: construct the linopy model.
+2. **Solver handoff**: convert a built model into solver-consumable form.
 
-## Setup
-
-```bash
-pip install -e ".[benchmarks]"
-```
-
-## Running benchmarks
+> **Note:** `benchmark/` (singular) is for external framework comparisons. `benchmarks/` is only for internal linopy performance tracking.
 
-```bash
-# Quick smoke test (small sizes only)
-pytest benchmarks/ --quick
-
-# Full timing benchmarks
-pytest benchmarks/test_build.py benchmarks/test_lp_write.py benchmarks/test_matrices.py
+## What is covered
 
-# Run a specific model
-pytest benchmarks/test_build.py -k basic
-```
+- **Build** (`benchmarks/test_build.py`): variable creation, expression construction, constraints, objective.
+- **Solver handoff**:
+  - canonical in-memory (`benchmarks/test_matrices.py`) via `A`, `b`, `c`, bounds, labels (**required**),
+  - file handoff (`benchmarks/test_lp_write.py`) via LP serialization (**optional**),
+  - direct API handoff (e.g. `to_highspy`) when enabled (**optional**, solver-specific).
 
-## Comparing timing between branches
+## What is not covered
 
-```bash
-# Save baseline results on master
-git checkout master
-pytest benchmarks/test_build.py --benchmark-save=master
+- Solver algorithm performance (optimize/solve runtime).
+- Cross-solver ranking.
+- Nonlinear/quadratic benchmark suites.
 
-# Switch to feature branch and compare
-git checkout my-feature
-pytest benchmarks/test_build.py --benchmark-save=my-feature --benchmark-compare=0001_master
+## Models
 
-# Compare saved results without re-running
-pytest-benchmark compare 0001_master 0002_my-feature --columns=median,iqr
-```
+Core models:
 
-Results are stored in `.benchmarks/` (gitignored).
+- `basic`
+- `knapsack`
+- `expression_arithmetic`
+- `sparse_network`
 
-## Memory benchmarks
+Extended (optional dependency):
 
-`memory.py` runs each test in a separate process with pytest-memray to get accurate per-test peak memory (including C/numpy allocations). Results are saved as JSON and can be compared across branches.
+- `pypsa_scigrid`
 
-By default, only the build phase (`test_build.py`) is measured. Unlike timing benchmarks where `benchmark()` isolates the measured function, memray tracks all allocations within a test — including model construction in setup. This means LP write and matrix tests would report build + phase memory combined, making the phase-specific contribution impossible to isolate. Since model construction dominates memory usage, measuring build alone gives the most actionable numbers.
+## Setup
 
 ```bash
-# Save baseline on master
-git checkout master
-python benchmarks/memory.py save master
+pip install -e ".[benchmarks]"
+```
 
-# Save feature branch
-git checkout my-feature
-python benchmarks/memory.py save my-feature
+## Run benchmarks
 
-# Compare
-python benchmarks/memory.py compare master my-feature
+```bash
+# Quick smoke run
+pytest benchmarks/ --quick
 
-# Quick mode (smaller sizes, faster)
-python benchmarks/memory.py save master --quick
+# Full timing run (build + handoff)
+pytest benchmarks/test_build.py benchmarks/test_matrices.py benchmarks/test_lp_write.py
 
-# Measure a specific phase (includes build overhead)
-python benchmarks/memory.py save master --test-path benchmarks/test_lp_write.py
+# Single model
+pytest benchmarks/test_build.py -k basic
 ```
 
-Results are stored in `.benchmarks/memory/` (gitignored). Requires Linux or macOS (memray is not available on Windows).
-
-> **Note:** Small tests (~5 MiB) are near the import-overhead floor and may show noise of ~1 MiB between runs. Focus on larger tests for meaningful memory comparisons. Do not combine `--memray` with timing benchmarks — memray adds ~2x overhead that invalidates timing results.
+## Metrics
 
-## Models
+- **Time**: pytest-benchmark median runtime (IQR for stability).
+- **Memory**: pytest-memray peak RSS (MiB), primarily tracked for Build.
 
-| Model | Description | Sizes |
-|-------|-------------|-------|
-| `basic` | Dense N*N model, 2*N^2 vars/cons | 10 — 1600 |
-| `knapsack` | N binary variables, 1 constraint | 100 — 1M |
-| `expression_arithmetic` | Broadcasting, scaling, summation across dims | 10 — 1000 |
-| `sparse_network` | Ring network with mismatched bus/line coords | 10 — 1000 |
-| `pypsa_scigrid` | Real power system (requires `pypsa`) | 10 — 200 snapshots |
+## Results and history
 
-## Phases
+- Raw outputs live in `.benchmarks/` (gitignored).
+- Store comparison snapshots as JSON and compare to a rolling `master` baseline.
 
-| Phase | File | What it measures |
-|-------|------|------------------|
-| Build | `test_build.py` | Model construction (add_variables, add_constraints, add_objective) |
-| LP write | `test_lp_write.py` | Writing the model to an LP file |
-| Matrices | `test_matrices.py` | Generating sparse matrices (A, b, c, bounds) from the model |
+```bash
+# Timing snapshot
+pytest benchmarks/test_build.py benchmarks/test_matrices.py benchmarks/test_lp_write.py \
+  --benchmark-json ".benchmarks/timing-$(date +%Y%m%d-%H%M%S).json"
 
-## Adding a new model
+# Memory snapshot (Build by default)
+python benchmarks/memory.py save "$(git rev-parse --short HEAD)"
 
-1. Create `benchmarks/models/my_model.py` with a `build_my_model(n)` function and a `SIZES` list
-2. Add parametrized tests in the relevant `test_*.py` files
-3. Add a quick threshold in `conftest.py`
+# Compare memory snapshots
+python benchmarks/memory.py compare <baseline-label> <candidate-label>
+```

From 413f1c6adbf146809748efad3cea418578d32738 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 08:37:30 +0200
Subject: [PATCH 02/68] benchmarks: reusable model registry, new model types,
 new phases, CI smoke

Refactors the internal benchmark suite around a reusable ModelSpec /
REGISTRY pattern so adding a model is one self-registering file with
metadata (features, applicable phases, sizes, optional deps). Other tests
and scripts can import it via `from benchmarks import REGISTRY`.

New model specs cover gaps in the existing coverage:
- milp: general (non-binary) integers (capacitated facility location)
- qp: continuous quadratic objective (diagonal portfolio)
- sos: SOS1 multi-mode generation (Model.add_sos_constraints)
- piecewise: piecewise-linear fuel cost (Model.add_piecewise_formulation)
- masked: sparse-route transportation using mask= on add_variables

SOS and piecewise specs gate their own registration on API availability,
so the suite stays runnable on older linopy.

New phase tests:
- test_solver_handoff.py: parametrizes lp.io.to_highspy/to_gurobipy/
  to_mosek/to_xpress across applicable models, skipping per-solver when
  the solver isn't installed. Uses stable lp.io wrappers (not the new
  Solver.from_name API) for backward compatibility.
- test_netcdf.py: separate to_netcdf / read_netcdf benchmarks.

CI: new benchmark-smoke.yml runs the suite under --quick
--benchmark-disable on PRs, so refactors that break a model spec get
caught early. Timings stay off CI (~35s smoke locally, no regression
tracking).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/benchmark-smoke.yml      |  41 ++++++
 benchmarks/README.md                       | 115 ++++++++++-----
 benchmarks/__init__.py                     |  84 ++++++++++-
 benchmarks/conftest.py                     |  24 ++-
 benchmarks/models/__init__.py              |  32 +++-
 benchmarks/models/basic.py                 |  16 +-
 benchmarks/models/expression_arithmetic.py |  14 +-
 benchmarks/models/knapsack.py              |  17 ++-
 benchmarks/models/masked.py                |  90 ++++++++++++
 benchmarks/models/milp.py                  |  79 ++++++++++
 benchmarks/models/piecewise.py             |  92 ++++++++++++
 benchmarks/models/pypsa_scigrid.py         |  18 ++-
 benchmarks/models/qp.py                    |  65 ++++++++
 benchmarks/models/sos.py                   |  99 +++++++++++++
 benchmarks/models/sparse_network.py        |  14 +-
 benchmarks/registry.py                     | 163 +++++++++++++++++++++
 benchmarks/test_build.py                   |  51 +------
 benchmarks/test_lp_write.py                |  59 +-------
 benchmarks/test_matrices.py                |  39 ++---
 benchmarks/test_netcdf.py                  |  34 +++++
 benchmarks/test_solver_handoff.py          |  61 ++++++++
 21 files changed, 1027 insertions(+), 180 deletions(-)
 create mode 100644 .github/workflows/benchmark-smoke.yml
 create mode 100644 benchmarks/models/masked.py
 create mode 100644 benchmarks/models/milp.py
 create mode 100644 benchmarks/models/piecewise.py
 create mode 100644 benchmarks/models/qp.py
 create mode 100644 benchmarks/models/sos.py
 create mode 100644 benchmarks/registry.py
 create mode 100644 benchmarks/test_netcdf.py
 create mode 100644 benchmarks/test_solver_handoff.py

diff --git a/.github/workflows/benchmark-smoke.yml b/.github/workflows/benchmark-smoke.yml
new file mode 100644
index 00000000..c6b37028
--- /dev/null
+++ b/.github/workflows/benchmark-smoke.yml
@@ -0,0 +1,41 @@
+name: Benchmark smoke
+
+# Runs the internal benchmark suite under --quick --benchmark-disable so every
+# model spec is built and every phase fires at least once, but no timings are
+# recorded. The goal is "did refactor X break a model spec?" — not regression
+# tracking, which is done out-of-CI on dedicated hardware.
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ '*' ]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  smoke:
+    name: Benchmark smoke (quick)
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v6
+      with:
+        fetch-depth: 0  # setuptools_scm
+
+    - name: Set up Python 3.12
+      uses: actions/setup-python@v6
+      with:
+        python-version: "3.12"
+
+    - name: Install package and benchmark dependencies
+      run: |
+        python -m pip install uv
+        # [dev] for pytest + netcdf4; [benchmarks] for pytest-benchmark + pypsa.
+        uv pip install --system -e ".[dev,benchmarks]"
+
+    - name: Run benchmark smoke
+      run: |
+        pytest benchmarks/ --quick --benchmark-disable -q
diff --git a/benchmarks/README.md b/benchmarks/README.md
index c2d1df04..59cc0594 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -1,71 +1,120 @@
 # Internal Performance Benchmarks
 
-This suite benchmarks the **linopy part end-to-end** in two phases:
+This suite benchmarks the **linopy side of the workflow end-to-end** (everything except the solve itself) across three phases:
 
-1. **Build**: construct the linopy model.
-2. **Solver handoff**: convert a built model into solver-consumable form.
+1. **Build** — construct the linopy model.
+2. **Solver handoff** — convert a built model into solver-consumable form
+   (in-memory matrices, LP file, native solver instance).
+3. **Persistence round-trip** — `to_netcdf` / `read_netcdf`.
 
-> **Note:** `benchmark/` (singular) is for external framework comparisons. `benchmarks/` is only for internal linopy performance tracking.
+> **Note:** `benchmark/` (singular) is for external framework comparisons.
+> `benchmarks/` is only for internal linopy performance tracking.
 
 ## What is covered
 
-- **Build** (`benchmarks/test_build.py`): variable creation, expression construction, constraints, objective.
-- **Solver handoff**:
-  - canonical in-memory (`benchmarks/test_matrices.py`) via `A`, `b`, `c`, bounds, labels (**required**),
-  - file handoff (`benchmarks/test_lp_write.py`) via LP serialization (**optional**),
-  - direct API handoff (e.g. `to_highspy`) when enabled (**optional**, solver-specific).
+| Phase                 | Test file                       | Notes                                              |
+| --------------------- | ------------------------------- | -------------------------------------------------- |
+| Build                 | `test_build.py`                 | variables / expressions / constraints / objective  |
+| Matrices              | `test_matrices.py`              | `A`, `b`, `c`, bounds, labels, `Q` for QP          |
+| LP write              | `test_lp_write.py`              | `model.to_file(...)`                               |
+| netCDF write/read     | `test_netcdf.py`                | `to_netcdf` / `read_netcdf`                        |
+| Solver handoff        | `test_solver_handoff.py`        | `lp.io.to_highspy / to_gurobipy / to_mosek / to_xpress` — skipped per-solver when not installed |
+| PyPSA carbon handoff  | `test_pypsa_carbon_management.py` | `set_names=True/False`, `freeze_constraints=True/False` |
 
-## What is not covered
-
-- Solver algorithm performance (optimize/solve runtime).
-- Cross-solver ranking.
-- Nonlinear/quadratic benchmark suites.
+What we *don't* cover: solver algorithm performance (`Solver.solve()`
+runtime), cross-solver ranking, nonlinear / general-quadratic constraint
+suites.
 
 ## Models
 
-Core models:
+The suite is driven by a **reusable model registry**. Each model file under
+`benchmarks/models/` exposes a `build_<name>(size) -> linopy.Model` callable
+and a module-level `SPEC` describing features, applicable phases, default
+sizes, and optional dependencies.
+
+| Name                    | Features            | Typical use                                         |
+| ----------------------- | ------------------- | --------------------------------------------------- |
+| `basic`                 | continuous          | dense LP scaling                                    |
+| `knapsack`              | binary              | MIP binary-section path                             |
+| `expression_arithmetic` | continuous          | stresses `+`, `*`, `sum`, broadcasting              |
+| `sparse_network`        | continuous          | mismatched-coordinate / sparse coefficient handling |
+| `milp`                  | integer             | general-integer (non-binary) MIP path               |
+| `qp`                    | quadratic           | continuous QP / `matrices.Q` path                   |
+| `sos` *(linopy ≥ recent)* | sos              | `Model.add_sos_constraints` + LP SOS section        |
+| `piecewise` *(linopy ≥ recent)* | piecewise  | `Model.add_piecewise_formulation`                    |
+| `masked`                | masked              | `mask=` on `add_variables` / `add_constraints`      |
+| `pypsa_scigrid` *(optional)* | continuous     | real PyPSA model                                    |
+
+The `sos` and `piecewise` specs are skipped automatically if the underlying
+APIs aren't present in the installed linopy.
+
+### Reusing the registry outside the suite
+
+The registry is a plain importable object — use it from any test, script,
+or profiling session:
+
+```python
+from benchmarks import REGISTRY
 
-- `basic`
-- `knapsack`
-- `expression_arithmetic`
-- `sparse_network`
+# Look up by name
+model = REGISTRY["basic"].build(100)
 
-Extended (optional dependency):
+# Iterate (e.g. parametrize your own test)
+for spec in REGISTRY.values():
+    m = spec.build(spec.sizes[0])
+    ...
 
-- `pypsa_scigrid`
+# Filter by feature or phase
+from benchmarks import filter_by, QUADRATIC, TO_GUROBIPY
+
+qp_specs = filter_by(has_feature=QUADRATIC)
+gurobi_specs = filter_by(has_phase=TO_GUROBIPY)
+```
+
+To add a new model, drop a file under `benchmarks/models/`, expose a
+`build_<name>(size)`, and call `register(ModelSpec(...))`. Import it from
+`benchmarks/models/__init__.py` so the registration fires.
 
 ## Setup
 
 ```bash
-pip install -e ".[benchmarks]"
+uv sync --extra dev --extra benchmarks --extra solvers
+source .venv/bin/activate
 ```
 
 ## Run benchmarks
 
 ```bash
-# Quick smoke run
-pytest benchmarks/ --quick
+# Quick smoke run (small sizes only, no timing)
+pytest benchmarks/ --quick --benchmark-disable
+
+# Full timing run
+pytest benchmarks/ --benchmark-only
+
+# A single phase
+pytest benchmarks/test_build.py
 
-# Full timing run (build + handoff)
-pytest benchmarks/test_build.py benchmarks/test_matrices.py benchmarks/test_lp_write.py
+# A single model across all phases
+pytest benchmarks/ -k basic
 
-# Single model
-pytest benchmarks/test_build.py -k basic
+# A single (phase, model) pair
+pytest benchmarks/test_lp_write.py -k "knapsack and n=1000"
 ```
 
 ## Metrics
 
-- **Time**: pytest-benchmark median runtime (IQR for stability).
-- **Memory**: pytest-memray peak RSS (MiB), primarily tracked for Build.
+- **Time** — pytest-benchmark median runtime (IQR for stability).
+- **Memory** — pytest-memray peak allocated memory (MiB), tracked for Build only
+  because later-phase tests also count build allocations, making attribution unreliable.
 
 ## Results and history
 
-- Raw outputs live in `.benchmarks/` (gitignored).
-- Store comparison snapshots as JSON and compare to a rolling `master` baseline.
+Raw outputs live in `.benchmarks/` (gitignored). Store comparison snapshots
+as JSON and compare to a rolling `master` baseline:
 
 ```bash
 # Timing snapshot
-pytest benchmarks/test_build.py benchmarks/test_matrices.py benchmarks/test_lp_write.py \
+pytest benchmarks/ \
   --benchmark-json ".benchmarks/timing-$(date +%Y%m%d-%H%M%S).json"
 
 # Memory snapshot (Build by default)
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
index 6bf202cc..ceaaff77 100644
--- a/benchmarks/__init__.py
+++ b/benchmarks/__init__.py
@@ -1 +1,83 @@
-"""Linopy benchmark suite — run with ``pytest benchmarks/`` (use ``--quick`` for smaller sizes)."""
+"""
+Linopy benchmark suite.
+
+Run with ``pytest benchmarks/`` (use ``--quick`` for smaller sizes).
+
+This package also exposes a **reusable model registry** for any test, profiling
+session, or example that wants ready-made linopy models of varying sizes and
+features. Each entry exposes a ``build(size) -> linopy.Model`` callable plus
+metadata::
+
+    from benchmarks import REGISTRY, QUADRATIC
+
+    # Look up by name
+    model = REGISTRY["basic"].build(100)
+
+    # Iterate / filter
+    for spec in REGISTRY.values():
+        m = spec.build(spec.sizes[0])
+        ...
+
+    from benchmarks import filter_by
+    qp_specs = filter_by(has_feature=QUADRATIC)
+"""
+
+# Importing the models package triggers each module's ``register(...)`` call.
+from benchmarks import models  # noqa: F401, E402
+from benchmarks.registry import (  # noqa: F401 — re-export
+    ALL_FEATURES,
+    ALL_PHASES,
+    BINARY,
+    BUILD,
+    CONTINUOUS,
+    DEFAULT_PHASES,
+    INTEGER,
+    LP_WRITE,
+    MASKED,
+    MATRICES,
+    NETCDF,
+    PIECEWISE,
+    QUADRATIC,
+    REGISTRY,
+    SOLVER_BUILD,
+    SOS,
+    TO_GUROBIPY,
+    TO_HIGHSPY,
+    TO_MOSEK,
+    TO_XPRESS,
+    ModelSpec,
+    filter_by,
+    get,
+    iter_params,
+    param_ids,
+    register,
+)
+
+__all__ = [
+    "ALL_FEATURES",
+    "ALL_PHASES",
+    "BINARY",
+    "BUILD",
+    "CONTINUOUS",
+    "DEFAULT_PHASES",
+    "INTEGER",
+    "LP_WRITE",
+    "MASKED",
+    "MATRICES",
+    "ModelSpec",
+    "NETCDF",
+    "PIECEWISE",
+    "QUADRATIC",
+    "REGISTRY",
+    "SOLVER_BUILD",
+    "SOS",
+    "TO_GUROBIPY",
+    "TO_HIGHSPY",
+    "TO_MOSEK",
+    "TO_XPRESS",
+    "filter_by",
+    "get",
+    "iter_params",
+    "param_ids",
+    "register",
+]
diff --git a/benchmarks/conftest.py b/benchmarks/conftest.py
index 6f9a9467..f5c31df2 100644
--- a/benchmarks/conftest.py
+++ b/benchmarks/conftest.py
@@ -4,13 +4,7 @@
 
 import pytest
 
-QUICK_THRESHOLD = {
-    "basic": 100,
-    "knapsack": 10_000,
-    "pypsa_scigrid": 50,
-    "expression_arithmetic": 100,
-    "sparse_network": 100,
-}
+from benchmarks.registry import ModelSpec
 
 
 def pytest_addoption(parser):
@@ -22,9 +16,13 @@ def pytest_addoption(parser):
     )
 
 
-def skip_if_quick(request, model: str, size: int):
-    """Skip large sizes when --quick is passed."""
-    if request.config.getoption("--quick"):
-        threshold = QUICK_THRESHOLD.get(model, float("inf"))
-        if size > threshold:
-            pytest.skip(f"--quick: skipping {model} size {size}")
+def maybe_skip(request: pytest.FixtureRequest, spec: ModelSpec, size: int) -> None:
+    """
+    Apply ``--quick`` size cap and ``spec.requires`` importorskips.
+
+    Centralised so every phase test stays a one-liner.
+    """
+    for mod in spec.requires:
+        pytest.importorskip(mod)
+    if request.config.getoption("--quick") and size > spec.quick_threshold:
+        pytest.skip(f"--quick: skipping {spec.name} size {size}")
diff --git a/benchmarks/models/__init__.py b/benchmarks/models/__init__.py
index fcff9caf..8e5b9ca2 100644
--- a/benchmarks/models/__init__.py
+++ b/benchmarks/models/__init__.py
@@ -1,4 +1,10 @@
-"""Model builders for benchmarks."""
+"""
+Model builders for benchmarks.
+
+Importing this package registers every model in :data:`benchmarks.registry.REGISTRY`.
+Each module exposes a ``build_<name>(size) -> linopy.Model`` callable and a
+module-level ``SPEC`` :class:`~benchmarks.registry.ModelSpec`.
+"""
 
 from benchmarks.models.basic import SIZES as BASIC_SIZES
 from benchmarks.models.basic import build_basic
@@ -6,6 +12,18 @@
 from benchmarks.models.expression_arithmetic import build_expression_arithmetic
 from benchmarks.models.knapsack import SIZES as KNAPSACK_SIZES
 from benchmarks.models.knapsack import build_knapsack
+from benchmarks.models.masked import SIZES as MASKED_SIZES
+from benchmarks.models.masked import build_masked
+from benchmarks.models.milp import SIZES as MILP_SIZES
+from benchmarks.models.milp import build_milp
+from benchmarks.models.piecewise import SIZES as PIECEWISE_SIZES
+from benchmarks.models.piecewise import build_piecewise
+from benchmarks.models.pypsa_scigrid import SIZES as PYPSA_SIZES
+from benchmarks.models.pypsa_scigrid import build_pypsa_scigrid
+from benchmarks.models.qp import SIZES as QP_SIZES
+from benchmarks.models.qp import build_qp
+from benchmarks.models.sos import SIZES as SOS_SIZES
+from benchmarks.models.sos import build_sos
 from benchmarks.models.sparse_network import SIZES as SPARSE_SIZES
 from benchmarks.models.sparse_network import build_sparse_network
 
@@ -13,9 +31,21 @@
     "BASIC_SIZES",
     "EXPR_SIZES",
     "KNAPSACK_SIZES",
+    "MASKED_SIZES",
+    "MILP_SIZES",
+    "PIECEWISE_SIZES",
+    "PYPSA_SIZES",
+    "QP_SIZES",
+    "SOS_SIZES",
     "SPARSE_SIZES",
     "build_basic",
     "build_expression_arithmetic",
     "build_knapsack",
+    "build_masked",
+    "build_milp",
+    "build_piecewise",
+    "build_pypsa_scigrid",
+    "build_qp",
+    "build_sos",
     "build_sparse_network",
 ]
diff --git a/benchmarks/models/basic.py b/benchmarks/models/basic.py
index 2aea49d9..4f1205a7 100644
--- a/benchmarks/models/basic.py
+++ b/benchmarks/models/basic.py
@@ -1,10 +1,11 @@
-"""Basic benchmark model: 2*N^2 variables and constraints."""
+"""Basic benchmark model: 2*N^2 variables and constraints (continuous LP)."""
 
 from __future__ import annotations
 
 import linopy
+from benchmarks.registry import CONTINUOUS, ModelSpec, register
 
-SIZES = [10, 50, 100, 250, 500, 1000, 1600]
+SIZES = (10, 50, 100, 250, 500, 1000, 1600)
 
 
 def build_basic(n: int) -> linopy.Model:
@@ -16,3 +17,14 @@ def build_basic(n: int) -> linopy.Model:
     m.add_constraints(x - y >= -5, name="lower")
     m.add_objective(x.sum() + 2 * y.sum())
     return m
+
+
+SPEC = register(
+    ModelSpec(
+        name="basic",
+        build=build_basic,
+        sizes=SIZES,
+        features=frozenset({CONTINUOUS}),
+        quick_threshold=100,
+    )
+)
diff --git a/benchmarks/models/expression_arithmetic.py b/benchmarks/models/expression_arithmetic.py
index 339c651d..795fce59 100644
--- a/benchmarks/models/expression_arithmetic.py
+++ b/benchmarks/models/expression_arithmetic.py
@@ -5,8 +5,9 @@
 import numpy as np
 
 import linopy
+from benchmarks.registry import CONTINUOUS, ModelSpec, register
 
-SIZES = [10, 50, 100, 250, 500, 1000]
+SIZES = (10, 50, 100, 250, 500, 1000)
 
 
 def build_expression_arithmetic(n: int) -> linopy.Model:
@@ -28,3 +29,14 @@ def build_expression_arithmetic(n: int) -> linopy.Model:
     m.add_constraints(expr1.sum("j") >= -10, name="row_sum")
     m.add_objective(combined.sum())
     return m
+
+
+SPEC = register(
+    ModelSpec(
+        name="expression_arithmetic",
+        build=build_expression_arithmetic,
+        sizes=SIZES,
+        features=frozenset({CONTINUOUS}),
+        quick_threshold=100,
+    )
+)
diff --git a/benchmarks/models/knapsack.py b/benchmarks/models/knapsack.py
index 83ce7394..20aa35ec 100644
--- a/benchmarks/models/knapsack.py
+++ b/benchmarks/models/knapsack.py
@@ -1,12 +1,13 @@
-"""Knapsack benchmark model: N binary variables, 1 constraint."""
+"""Knapsack benchmark model: N binary variables, 1 constraint (MILP, binary)."""
 
 from __future__ import annotations
 
 import numpy as np
 
 import linopy
+from benchmarks.registry import BINARY, DEFAULT_PHASES, ModelSpec, register
 
-SIZES = [100, 1_000, 10_000, 100_000, 1_000_000]
+SIZES = (100, 1_000, 10_000, 100_000, 1_000_000)
 
 
 def build_knapsack(n: int) -> linopy.Model:
@@ -21,3 +22,15 @@ def build_knapsack(n: int) -> linopy.Model:
     m.add_constraints((x * weights).sum() <= capacity, name="capacity")
     m.add_objective(-(x * values).sum())
     return m
+
+
+SPEC = register(
+    ModelSpec(
+        name="knapsack",
+        build=build_knapsack,
+        sizes=SIZES,
+        features=frozenset({BINARY}),
+        phases=DEFAULT_PHASES,  # HiGHS handles binary; matrices handles MILP
+        quick_threshold=10_000,
+    )
+)
diff --git a/benchmarks/models/masked.py b/benchmarks/models/masked.py
new file mode 100644
index 00000000..190b4031
--- /dev/null
+++ b/benchmarks/models/masked.py
@@ -0,0 +1,90 @@
+"""
+Masked-variables benchmark: transportation with sparse allowed routes.
+
+A standard transportation LP, but only a sparse subset of (origin, dest) pairs
+are valid routes. The ``mask=`` keyword on ``add_variables`` skips the rest,
+keeping the variable count sub-quadratic.
+
+Decision variables:
+    x[origin, dest] >= 0   continuous, only created for allowed routes
+
+Constraints:
+    sum_dest x[o, .]   <= supply[o]
+    sum_orig x[., d]   == demand[d]
+
+Objective:
+    minimize  sum cost[o, d] * x[o, d]
+
+The mask is dense at small sizes and sparser at large sizes, mimicking
+real-world transport networks where each origin only serves a fixed
+fan-out regardless of total node count.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import xarray as xr
+
+import linopy
+from benchmarks.registry import (
+    CONTINUOUS,
+    DEFAULT_PHASES,
+    MASKED,
+    ModelSpec,
+    register,
+)
+
+SIZES = (10, 50, 100, 500, 1000)
+
+
+def build_masked(n: int) -> linopy.Model:
+    rng = np.random.default_rng(42)
+    origins = np.arange(n)
+    dests = np.arange(n)
+
+    # Each origin serves exactly min(20, n) distinct destinations.
+    fan_out = min(20, n)
+    mask_np = np.zeros((n, n), dtype=bool)
+    for o in range(n):
+        # Deterministic fan-out so size determines connectivity.
+        targets = rng.choice(n, size=fan_out, replace=False)
+        mask_np[o, targets] = True
+
+    mask = xr.DataArray(mask_np, coords=[("origin", origins), ("dest", dests)])
+    cost = xr.DataArray(
+        rng.uniform(1, 10, size=(n, n)),
+        coords=[("origin", origins), ("dest", dests)],
+    )
+
+    # Supply scaled so the problem stays feasible at any size:
+    # each origin can ship up to ``demand_per_dest * n`` units (the total demand).
+    demand_per_dest = 5.0
+    supply_per_origin = demand_per_dest * n  # plenty of slack
+    supply = xr.DataArray(np.full(n, supply_per_origin), coords=[("origin", origins)])
+    demand = xr.DataArray(np.full(n, demand_per_dest), coords=[("dest", dests)])
+
+    m = linopy.Model()
+    x = m.add_variables(
+        lower=0,
+        coords=[("origin", origins), ("dest", dests)],
+        mask=mask,
+        name="x",
+    )
+
+    m.add_constraints(x.sum("dest") <= supply, name="supply", mask=mask.any("dest"))
+    m.add_constraints(x.sum("origin") == demand, name="demand", mask=mask.any("origin"))
+
+    m.add_objective((cost * x).sum())
+    return m
+
+
+SPEC = register(
+    ModelSpec(
+        name="masked",
+        build=build_masked,
+        sizes=SIZES,
+        features=frozenset({CONTINUOUS, MASKED}),
+        phases=DEFAULT_PHASES,
+        quick_threshold=100,
+    )
+)
diff --git a/benchmarks/models/milp.py b/benchmarks/models/milp.py
new file mode 100644
index 00000000..bdad39ab
--- /dev/null
+++ b/benchmarks/models/milp.py
@@ -0,0 +1,79 @@
+"""
+MILP benchmark: capacitated facility location with general integers.
+
+Decision variables:
+    y_f  in {0,1,...,K}      integer "modules" to open at facility f
+    x_{f,c} >= 0             continuous flow from facility f to customer c
+
+Constraints:
+    sum_c x_{f,c}  <=  cap * y_f       (capacity per facility)
+    sum_f x_{f,c}  ==  d_c             (demand at each customer)
+
+Objective:
+    minimize  sum_{f,c} t_{f,c} * x_{f,c}  +  sum_f f_f * y_f
+
+The general-integer ``y`` exercises the matrix accessor's MIP integer-section
+path and the LP-writer's general-integer block — neither the binary knapsack
+nor the continuous LPs hit those paths.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+
+import linopy
+from benchmarks.registry import (
+    CONTINUOUS,
+    DEFAULT_PHASES,
+    INTEGER,
+    ModelSpec,
+    register,
+)
+
+SIZES = (10, 25, 50, 100, 200)
+
+
+def build_milp(n: int) -> linopy.Model:
+    rng = np.random.default_rng(42)
+    facilities = np.arange(n)
+    customers = np.arange(n)
+
+    cap = 100.0  # capacity per module
+    Y_MAX = 5  # max modules per facility
+    transport = rng.uniform(1, 20, size=(n, n))  # per-unit shipping cost
+    fixed = rng.uniform(50, 200, size=n)  # cost per facility module
+    demand = rng.uniform(20, 80, size=n)  # demand at each customer
+
+    m = linopy.Model()
+    y = m.add_variables(
+        lower=0,
+        upper=Y_MAX,
+        coords=[facilities],
+        dims=["facility"],
+        integer=True,
+        name="y",
+    )
+    x = m.add_variables(
+        lower=0,
+        coords=[facilities, customers],
+        dims=["facility", "customer"],
+        name="x",
+    )
+
+    m.add_constraints(x.sum("customer") - cap * y <= 0, name="capacity")
+    m.add_constraints(x.sum("facility") == demand, name="demand")
+
+    m.add_objective((transport * x).sum() + (fixed * y).sum())
+    return m
+
+
+SPEC = register(
+    ModelSpec(
+        name="milp",
+        build=build_milp,
+        sizes=SIZES,
+        features=frozenset({INTEGER, CONTINUOUS}),
+        phases=DEFAULT_PHASES,
+        quick_threshold=25,
+    )
+)
diff --git a/benchmarks/models/piecewise.py b/benchmarks/models/piecewise.py
new file mode 100644
index 00000000..38cfef84
--- /dev/null
+++ b/benchmarks/models/piecewise.py
@@ -0,0 +1,92 @@
+"""
+Piecewise-linear benchmark: generation with piecewise fuel-cost curves.
+
+Each generator has a piecewise fuel cost curve pinned via
+``add_piecewise_formulation``. The default ``method="auto"`` picks an
+SOS2 or incremental expansion, generating auxiliary variables and
+constraints — that overhead is what we want to measure.
+
+Decision variables:
+    power[gen]  in [0, 100]      (continuous)
+    fuel[gen]   in [0, inf)      (continuous, pinned to piecewise curve)
+
+Constraints:
+    sum_gen power[gen]  >=  demand
+    piecewise:  fuel[gen] = f(power[gen])    for each gen
+
+Objective:
+    minimize  sum_gen fuel[gen]
+"""
+
+from __future__ import annotations
+
+import warnings
+
+import linopy
+from benchmarks.registry import (
+    CONTINUOUS,
+    DEFAULT_PHASES,
+    PIECEWISE,
+    ModelSpec,
+    register,
+)
+
+SIZES = (10, 100, 1_000, 5_000)
+
+_API_AVAILABLE = hasattr(linopy.Model, "add_piecewise_formulation") and hasattr(
+    linopy, "EvolvingAPIWarning"
+)
+
+
+def build_piecewise(n_gens: int) -> linopy.Model:
+    # Shared breakpoints, broadcast across generators.
+    x_pts = [0.0, 30.0, 60.0, 100.0]
+    y_pts = [0.0, 36.0, 84.0, 170.0]  # convex-ish fuel curve
+
+    m = linopy.Model()
+    power = m.add_variables(
+        lower=0,
+        upper=100,
+        coords=[range(n_gens)],
+        dims=["gen"],
+        name="power",
+    )
+    fuel = m.add_variables(
+        lower=0,
+        coords=[range(n_gens)],
+        dims=["gen"],
+        name="fuel",
+    )
+
+    demand = 0.5 * n_gens * x_pts[-1]
+    m.add_constraints(power.sum() >= demand, name="demand")
+
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", category=linopy.EvolvingAPIWarning)
+        m.add_piecewise_formulation(
+            (power, x_pts),
+            (fuel, y_pts),
+        )
+
+    m.add_objective(fuel.sum())
+    return m
+
+
+# ``add_piecewise_formulation`` is a recent (still-evolving) API. Skip
+# registration silently on older linopy so the rest of the suite stays usable.
+if _API_AVAILABLE:
+    SPEC = register(
+        ModelSpec(
+            name="piecewise",
+            build=build_piecewise,
+            sizes=SIZES,
+            features=frozenset({CONTINUOUS, PIECEWISE}),
+            # Monotonic breakpoints + ``method="auto"`` → incremental
+            # reformulation (pure MILP with binaries), which every supported
+            # solver handles.
+            phases=DEFAULT_PHASES,
+            quick_threshold=100,
+        )
+    )
+else:
+    SPEC = None
diff --git a/benchmarks/models/pypsa_scigrid.py b/benchmarks/models/pypsa_scigrid.py
index 2fcce217..29c3e450 100644
--- a/benchmarks/models/pypsa_scigrid.py
+++ b/benchmarks/models/pypsa_scigrid.py
@@ -1,13 +1,15 @@
-"""PyPSA SciGrid-DE benchmark model."""
+"""PyPSA SciGrid-DE benchmark model (requires pypsa)."""
 
 from __future__ import annotations
 
 from typing import TYPE_CHECKING
 
+from benchmarks.registry import CONTINUOUS, ModelSpec, register
+
 if TYPE_CHECKING:
     import linopy
 
-SIZES = [10, 50, 100, 200]
+SIZES = (10, 50, 100, 200)
 
 
 def build_pypsa_scigrid(snapshots: int = 100) -> linopy.Model:
@@ -18,3 +20,15 @@ def build_pypsa_scigrid(snapshots: int = 100) -> linopy.Model:
     n.set_snapshots(n.snapshots[:snapshots])
     n.optimize.create_model()
     return n.model
+
+
+SPEC = register(
+    ModelSpec(
+        name="pypsa_scigrid",
+        build=build_pypsa_scigrid,
+        sizes=SIZES,
+        features=frozenset({CONTINUOUS}),
+        quick_threshold=50,
+        requires=("pypsa",),
+    )
+)
diff --git a/benchmarks/models/qp.py b/benchmarks/models/qp.py
new file mode 100644
index 00000000..6e6517d2
--- /dev/null
+++ b/benchmarks/models/qp.py
@@ -0,0 +1,65 @@
+"""
+QP benchmark: continuous quadratic objective on a portfolio-style model.
+
+Decision variables:
+    x_i  >= 0   (weight on asset i, continuous)
+
+Constraints:
+    sum_i x_i  == 1
+    x_i        <= 0.3        (variable upper bound; no asset > 30%)
+
+Objective:
+    minimize  sum_i q_i * x_i^2  -  sum_i r_i * x_i
+
+A pure diagonal quadratic — enough to exercise the QP build / write / matrix
+paths without paying for cross-terms. Cross-term coupling needs single-term
+factors on both sides (see ``LinearExpression._multiply_by_linear_expression``),
+which is awkward to set up cleanly via the public API.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+
+import linopy
+from benchmarks.registry import (
+    CONTINUOUS,
+    DEFAULT_PHASES,
+    QUADRATIC,
+    ModelSpec,
+    register,
+)
+
+SIZES = (10, 100, 1_000, 5_000, 20_000)
+
+
+def build_qp(n_assets: int) -> linopy.Model:
+    rng = np.random.default_rng(42)
+    q = rng.uniform(0.5, 2.0, size=n_assets)
+    r = rng.uniform(0.05, 0.15, size=n_assets)
+
+    m = linopy.Model()
+    x = m.add_variables(
+        lower=0,
+        upper=0.3,
+        coords=[range(n_assets)],
+        dims=["asset"],
+        name="x",
+    )
+
+    m.add_constraints(x.sum() == 1, name="budget")
+
+    m.add_objective((q * x**2).sum() - (r * x).sum())
+    return m
+
+
+SPEC = register(
+    ModelSpec(
+        name="qp",
+        build=build_qp,
+        sizes=SIZES,
+        features=frozenset({CONTINUOUS, QUADRATIC}),
+        phases=DEFAULT_PHASES,
+        quick_threshold=100,
+    )
+)
diff --git a/benchmarks/models/sos.py b/benchmarks/models/sos.py
new file mode 100644
index 00000000..31cca2ea
--- /dev/null
+++ b/benchmarks/models/sos.py
@@ -0,0 +1,99 @@
+"""
+SOS1 benchmark: multi-mode generation with at-most-one-mode-per-generator.
+
+Each generator has ``n_modes`` operating modes (different cap/cost tradeoff).
+SOS1 over the ``mode`` dimension enforces that each generator picks at most
+one mode.
+
+Decision variables:
+    y[gen, mode]  >= 0     continuous output per (generator, mode)
+
+Constraints:
+    y[gen, mode]  <= cap[mode]
+    sum_{gen,mode} y  >= demand_total
+    SOS1 over "mode" for each gen
+
+This benchmark exercises ``Model.add_sos_constraints`` (commits be6d3a3 /
+8aa8d0c) and the LP-writer's SOS section. In linopy, native SOS support is
+declared by Gurobi / Cplex / Xpress only (see ``SolverFeature.SOS_CONSTRAINTS``).
+HiGHS and Mosek would need ``apply_sos_reformulation()`` first.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import xarray as xr
+
+import linopy
+from benchmarks.registry import (
+    BUILD,
+    CONTINUOUS,
+    LP_WRITE,
+    MATRICES,
+    NETCDF,
+    SOLVER_BUILD,
+    SOS,
+    TO_GUROBIPY,
+    TO_XPRESS,
+    ModelSpec,
+    register,
+)
+
+SIZES = (10, 100, 1_000, 10_000)
+
+_N_MODES = 5
+_API_AVAILABLE = hasattr(linopy.Model, "add_sos_constraints")
+
+
+def build_sos(n_gens: int) -> linopy.Model:
+    modes = np.arange(_N_MODES)
+    cap = xr.DataArray(np.linspace(20.0, 100.0, _N_MODES), coords=[("mode", modes)])
+    cost = xr.DataArray(np.linspace(1.0, 8.0, _N_MODES), coords=[("mode", modes)])
+
+    m = linopy.Model()
+    y = m.add_variables(
+        lower=0,
+        upper=float(cap.max()),
+        coords=[range(n_gens), modes],
+        dims=["gen", "mode"],
+        name="y",
+    )
+
+    m.add_constraints(y <= cap, name="mode_cap")
+    demand_total = 0.4 * n_gens * float(cap.max())
+    m.add_constraints(y.sum() >= demand_total, name="demand")
+
+    m.add_sos_constraints(y, sos_type=1, sos_dim="mode")
+
+    m.add_objective((cost * y).sum())
+    return m
+
+
+# ``add_sos_constraints`` is a recent API. On older linopy we silently skip
+# registering this model — the rest of the suite stays usable.
+if _API_AVAILABLE:
+    SPEC = register(
+        ModelSpec(
+            name="sos",
+            build=build_sos,
+            sizes=SIZES,
+            features=frozenset({CONTINUOUS, SOS}),
+            # HiGHS / Mosek lack native SOS in linopy — would need
+            # ``reformulate_sos=True``, which mutates the model and defeats
+            # the benchmark. Only solvers with native SOS appear here.
+            phases=frozenset(
+                {
+                    BUILD,
+                    MATRICES,
+                    LP_WRITE,
+                    NETCDF,
+                    SOLVER_BUILD,
+                    TO_GUROBIPY,
+                    TO_XPRESS,
+                }
+            ),
+            quick_threshold=100,
+        )
+    )
+else:
+    SPEC = None
diff --git a/benchmarks/models/sparse_network.py b/benchmarks/models/sparse_network.py
index afc6be06..a750ae90 100644
--- a/benchmarks/models/sparse_network.py
+++ b/benchmarks/models/sparse_network.py
@@ -7,8 +7,9 @@
 import xarray as xr
 
 import linopy
+from benchmarks.registry import CONTINUOUS, ModelSpec, register
 
-SIZES = [10, 50, 100, 250, 500, 1000]
+SIZES = (10, 50, 100, 250, 500, 1000)
 
 
 def build_sparse_network(n_buses: int) -> linopy.Model:
@@ -48,3 +49,14 @@ def build_sparse_network(n_buses: int) -> linopy.Model:
 
     m.add_objective(gen.sum())
     return m
+
+
+SPEC = register(
+    ModelSpec(
+        name="sparse_network",
+        build=build_sparse_network,
+        sizes=SIZES,
+        features=frozenset({CONTINUOUS}),
+        quick_threshold=100,
+    )
+)
diff --git a/benchmarks/registry.py b/benchmarks/registry.py
new file mode 100644
index 00000000..07e6c322
--- /dev/null
+++ b/benchmarks/registry.py
@@ -0,0 +1,163 @@
+"""
+Reusable registry of benchmark models.
+
+A :class:`ModelSpec` captures everything needed to drive a model through the
+benchmark suite *and* to use it from any other test or script:
+
+- ``build(size) -> linopy.Model``  the actual builder
+- ``sizes``                        canonical sizes the model has been tuned for
+- ``features``                     what kinds of variables / constraints it uses
+- ``phases``                       which benchmark phases apply (lp_write, to_highspy, ...)
+- ``quick_threshold``              max size to keep under ``pytest --quick``
+- ``requires``                     extra modules to ``pytest.importorskip``
+
+Pattern for downstream use::
+
+    from benchmarks import REGISTRY
+    model = REGISTRY["basic"].build(100)
+
+    # Or pick a subset by feature/phase:
+    from benchmarks import filter_by, QUADRATIC
+    qp_specs = filter_by(has_feature=QUADRATIC)
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable, Iterator
+from dataclasses import dataclass
+
+import linopy
+
+# --- Feature tags -----------------------------------------------------------
+
+CONTINUOUS = "continuous"
+BINARY = "binary"
+INTEGER = "integer"
+QUADRATIC = "quadratic"
+SOS = "sos"
+PIECEWISE = "piecewise"
+MASKED = "masked"
+
+ALL_FEATURES = frozenset(
+    {CONTINUOUS, BINARY, INTEGER, QUADRATIC, SOS, PIECEWISE, MASKED}
+)
+
+# --- Phase tags -------------------------------------------------------------
+
+BUILD = "build"
+MATRICES = "matrices"
+LP_WRITE = "lp_write"
+NETCDF = "netcdf"
+SOLVER_BUILD = "solver_build"  # generic Solver.from_name(..., io_api="direct")
+TO_HIGHSPY = "to_highspy"
+TO_GUROBIPY = "to_gurobipy"
+TO_MOSEK = "to_mosek"
+TO_XPRESS = "to_xpress"
+
+ALL_PHASES = frozenset(
+    {
+        BUILD,
+        MATRICES,
+        LP_WRITE,
+        NETCDF,
+        SOLVER_BUILD,
+        TO_HIGHSPY,
+        TO_GUROBIPY,
+        TO_MOSEK,
+        TO_XPRESS,
+    }
+)
+
+# Phases every "well-behaved LP / MILP" can do. Models with features the
+# default solvers can't ingest natively (e.g. native SOS for HiGHS) override
+# this with a narrower set.
+DEFAULT_PHASES = frozenset(
+    {
+        BUILD,
+        MATRICES,
+        LP_WRITE,
+        NETCDF,
+        SOLVER_BUILD,
+        TO_HIGHSPY,
+        TO_GUROBIPY,
+        TO_MOSEK,
+        TO_XPRESS,
+    }
+)
+
+
+@dataclass(frozen=True)
+class ModelSpec:
+    """Declarative description of one benchmark model."""
+
+    name: str
+    build: Callable[[int], linopy.Model]
+    sizes: tuple[int, ...]
+    features: frozenset[str] = frozenset({CONTINUOUS})
+    phases: frozenset[str] = DEFAULT_PHASES
+    quick_threshold: int = 10**9
+    requires: tuple[str, ...] = ()
+
+    def applies_to(self, phase: str) -> bool:
+        return phase in self.phases
+
+    def has_feature(self, feature: str) -> bool:
+        return feature in self.features
+
+
+REGISTRY: dict[str, ModelSpec] = {}
+
+
+def register(spec: ModelSpec) -> ModelSpec:
+    """Add ``spec`` to the global registry. Returns the spec for chaining."""
+    if spec.name in REGISTRY:
+        raise ValueError(f"model {spec.name!r} already registered")
+    unknown_features = spec.features - ALL_FEATURES
+    if unknown_features:
+        raise ValueError(
+            f"model {spec.name!r}: unknown features {sorted(unknown_features)}"
+        )
+    unknown_phases = spec.phases - ALL_PHASES
+    if unknown_phases:
+        raise ValueError(
+            f"model {spec.name!r}: unknown phases {sorted(unknown_phases)}"
+        )
+    REGISTRY[spec.name] = spec
+    return spec
+
+
+def get(name: str) -> ModelSpec:
+    return REGISTRY[name]
+
+
+def filter_by(
+    *,
+    has_feature: str | None = None,
+    has_phase: str | None = None,
+) -> list[ModelSpec]:
+    out = []
+    for spec in REGISTRY.values():
+        if has_feature is not None and not spec.has_feature(has_feature):
+            continue
+        if has_phase is not None and not spec.applies_to(has_phase):
+            continue
+        out.append(spec)
+    return out
+
+
+def iter_params(phase: str) -> list[tuple[ModelSpec, int]]:
+    """Pytest parametrize helper — flatten (spec, size) pairs for one phase."""
+    return [
+        (spec, size)
+        for spec in REGISTRY.values()
+        if spec.applies_to(phase)
+        for size in spec.sizes
+    ]
+
+
+def param_ids(params: list[tuple[ModelSpec, int]]) -> list[str]:
+    return [f"{spec.name}-n={size}" for spec, size in params]
+
+
+def __iter__() -> Iterator[ModelSpec]:  # pragma: no cover - not used by iter(module)
+    return iter(REGISTRY.values())
diff --git a/benchmarks/test_build.py b/benchmarks/test_build.py
index f657715e..98559b3c 100644
--- a/benchmarks/test_build.py
+++ b/benchmarks/test_build.py
@@ -4,50 +4,13 @@
 
 import pytest
 
-from benchmarks.conftest import skip_if_quick
-from benchmarks.models import (
-    BASIC_SIZES,
-    EXPR_SIZES,
-    KNAPSACK_SIZES,
-    SPARSE_SIZES,
-    build_basic,
-    build_expression_arithmetic,
-    build_knapsack,
-    build_sparse_network,
-)
-from benchmarks.models.pypsa_scigrid import SIZES as PYPSA_SIZES
+from benchmarks.conftest import maybe_skip
+from benchmarks.registry import BUILD, iter_params, param_ids
 
+_PARAMS = iter_params(BUILD)
 
-@pytest.mark.parametrize("n", BASIC_SIZES, ids=[f"n={n}" for n in BASIC_SIZES])
-def test_build_basic(benchmark, n, request):
-    skip_if_quick(request, "basic", n)
-    benchmark(build_basic, n)
 
-
-@pytest.mark.parametrize("n", KNAPSACK_SIZES, ids=[f"n={n}" for n in KNAPSACK_SIZES])
-def test_build_knapsack(benchmark, n, request):
-    skip_if_quick(request, "knapsack", n)
-    benchmark(build_knapsack, n)
-
-
-@pytest.mark.parametrize("n", EXPR_SIZES, ids=[f"n={n}" for n in EXPR_SIZES])
-def test_build_expression_arithmetic(benchmark, n, request):
-    skip_if_quick(request, "expression_arithmetic", n)
-    benchmark(build_expression_arithmetic, n)
-
-
-@pytest.mark.parametrize("n", SPARSE_SIZES, ids=[f"n={n}" for n in SPARSE_SIZES])
-def test_build_sparse_network(benchmark, n, request):
-    skip_if_quick(request, "sparse_network", n)
-    benchmark(build_sparse_network, n)
-
-
-@pytest.mark.parametrize(
-    "snapshots", PYPSA_SIZES, ids=[f"snapshots={s}" for s in PYPSA_SIZES]
-)
-def test_build_pypsa_scigrid(benchmark, snapshots, request):
-    pytest.importorskip("pypsa")
-    skip_if_quick(request, "pypsa_scigrid", snapshots)
-    from benchmarks.models.pypsa_scigrid import build_pypsa_scigrid
-
-    benchmark(build_pypsa_scigrid, snapshots)
+@pytest.mark.parametrize("spec,size", _PARAMS, ids=param_ids(_PARAMS))
+def test_build(benchmark, spec, size, request):
+    maybe_skip(request, spec, size)
+    benchmark(spec.build, size)
diff --git a/benchmarks/test_lp_write.py b/benchmarks/test_lp_write.py
index 6442ccd6..ea3e04d7 100644
--- a/benchmarks/test_lp_write.py
+++ b/benchmarks/test_lp_write.py
@@ -4,60 +4,15 @@
 
 import pytest
 
-from benchmarks.conftest import skip_if_quick
-from benchmarks.models import (
-    BASIC_SIZES,
-    EXPR_SIZES,
-    KNAPSACK_SIZES,
-    SPARSE_SIZES,
-    build_basic,
-    build_expression_arithmetic,
-    build_knapsack,
-    build_sparse_network,
-)
-from benchmarks.models.pypsa_scigrid import SIZES as PYPSA_SIZES
+from benchmarks.conftest import maybe_skip
+from benchmarks.registry import LP_WRITE, iter_params, param_ids
 
+_PARAMS = iter_params(LP_WRITE)
 
-@pytest.mark.parametrize("n", BASIC_SIZES, ids=[f"n={n}" for n in BASIC_SIZES])
-def test_lp_write_basic(benchmark, n, request, tmp_path):
-    skip_if_quick(request, "basic", n)
-    m = build_basic(n)
-    lp_file = tmp_path / "model.lp"
-    benchmark(m.to_file, lp_file, progress=False)
-
-
-@pytest.mark.parametrize("n", KNAPSACK_SIZES, ids=[f"n={n}" for n in KNAPSACK_SIZES])
-def test_lp_write_knapsack(benchmark, n, request, tmp_path):
-    skip_if_quick(request, "knapsack", n)
-    m = build_knapsack(n)
-    lp_file = tmp_path / "model.lp"
-    benchmark(m.to_file, lp_file, progress=False)
-
-
-@pytest.mark.parametrize("n", EXPR_SIZES, ids=[f"n={n}" for n in EXPR_SIZES])
-def test_lp_write_expression_arithmetic(benchmark, n, request, tmp_path):
-    skip_if_quick(request, "expression_arithmetic", n)
-    m = build_expression_arithmetic(n)
-    lp_file = tmp_path / "model.lp"
-    benchmark(m.to_file, lp_file, progress=False)
-
-
-@pytest.mark.parametrize("n", SPARSE_SIZES, ids=[f"n={n}" for n in SPARSE_SIZES])
-def test_lp_write_sparse_network(benchmark, n, request, tmp_path):
-    skip_if_quick(request, "sparse_network", n)
-    m = build_sparse_network(n)
-    lp_file = tmp_path / "model.lp"
-    benchmark(m.to_file, lp_file, progress=False)
-
-
-@pytest.mark.parametrize(
-    "snapshots", PYPSA_SIZES, ids=[f"snapshots={s}" for s in PYPSA_SIZES]
-)
-def test_lp_write_pypsa_scigrid(benchmark, snapshots, request, tmp_path):
-    pytest.importorskip("pypsa")
-    skip_if_quick(request, "pypsa_scigrid", snapshots)
-    from benchmarks.models.pypsa_scigrid import build_pypsa_scigrid
 
-    m = build_pypsa_scigrid(snapshots)
+@pytest.mark.parametrize("spec,size", _PARAMS, ids=param_ids(_PARAMS))
+def test_lp_write(benchmark, spec, size, request, tmp_path):
+    maybe_skip(request, spec, size)
+    m = spec.build(size)
     lp_file = tmp_path / "model.lp"
     benchmark(m.to_file, lp_file, progress=False)
diff --git a/benchmarks/test_matrices.py b/benchmarks/test_matrices.py
index 352844fb..bd36a467 100644
--- a/benchmarks/test_matrices.py
+++ b/benchmarks/test_matrices.py
@@ -4,19 +4,14 @@
 
 import pytest
 
-from benchmarks.conftest import skip_if_quick
-from benchmarks.models import (
-    BASIC_SIZES,
-    EXPR_SIZES,
-    SPARSE_SIZES,
-    build_basic,
-    build_expression_arithmetic,
-    build_sparse_network,
-)
+from benchmarks.conftest import maybe_skip
+from benchmarks.registry import MATRICES, iter_params, param_ids
+
+_PARAMS = iter_params(MATRICES)
 
 
 def _access_matrices(m):
-    """Access all matrix properties to force computation."""
+    """Touch every matrix property to force computation."""
     matrices = m.matrices
     _ = matrices.A
     _ = matrices.b
@@ -26,24 +21,12 @@ def _access_matrices(m):
     _ = matrices.sense
     _ = matrices.vlabels
     _ = matrices.clabels
+    if m.is_quadratic:
+        _ = matrices.Q  # exercise the QP path when present
 
 
-@pytest.mark.parametrize("n", BASIC_SIZES, ids=[f"n={n}" for n in BASIC_SIZES])
-def test_matrices_basic(benchmark, n, request):
-    skip_if_quick(request, "basic", n)
-    m = build_basic(n)
-    benchmark(_access_matrices, m)
-
-
-@pytest.mark.parametrize("n", EXPR_SIZES, ids=[f"n={n}" for n in EXPR_SIZES])
-def test_matrices_expression_arithmetic(benchmark, n, request):
-    skip_if_quick(request, "expression_arithmetic", n)
-    m = build_expression_arithmetic(n)
-    benchmark(_access_matrices, m)
-
-
-@pytest.mark.parametrize("n", SPARSE_SIZES, ids=[f"n={n}" for n in SPARSE_SIZES])
-def test_matrices_sparse_network(benchmark, n, request):
-    skip_if_quick(request, "sparse_network", n)
-    m = build_sparse_network(n)
+@pytest.mark.parametrize("spec,size", _PARAMS, ids=param_ids(_PARAMS))
+def test_matrices(benchmark, spec, size, request):
+    maybe_skip(request, spec, size)
+    m = spec.build(size)
     benchmark(_access_matrices, m)
diff --git a/benchmarks/test_netcdf.py b/benchmarks/test_netcdf.py
new file mode 100644
index 00000000..f26ae0fc
--- /dev/null
+++ b/benchmarks/test_netcdf.py
@@ -0,0 +1,34 @@
+"""
+Benchmarks for the netCDF persistence round-trip.
+
+We track ``to_netcdf`` and ``read_netcdf`` separately because the cost split
+matters in practice: distributed workflows tend to do many reads of a single
+written artifact.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+from benchmarks.conftest import maybe_skip
+from benchmarks.registry import NETCDF, iter_params, param_ids
+from linopy import read_netcdf
+
+_PARAMS = iter_params(NETCDF)
+
+
+@pytest.mark.parametrize("spec,size", _PARAMS, ids=param_ids(_PARAMS))
+def test_netcdf_write(benchmark, spec, size, request, tmp_path):
+    maybe_skip(request, spec, size)
+    m = spec.build(size)
+    out = tmp_path / "model.nc"
+    benchmark(m.to_netcdf, out)
+
+
+@pytest.mark.parametrize("spec,size", _PARAMS, ids=param_ids(_PARAMS))
+def test_netcdf_read(benchmark, spec, size, request, tmp_path):
+    maybe_skip(request, spec, size)
+    m = spec.build(size)
+    out = tmp_path / "model.nc"
+    m.to_netcdf(out)
+    benchmark(read_netcdf, out)
diff --git a/benchmarks/test_solver_handoff.py b/benchmarks/test_solver_handoff.py
new file mode 100644
index 00000000..c7d649fe
--- /dev/null
+++ b/benchmarks/test_solver_handoff.py
@@ -0,0 +1,61 @@
+"""
+Benchmarks for solver handoff (model -> native solver instance).
+
+Times each ``linopy.io.to_<solver>`` wrapper. These wrappers delegate to the
+same direct-API build path as the new stateful Solver API
+(``Solver.from_name(name, model, io_api="direct")``), so the numbers serve
+double duty: regression tracking for the wrappers, *and* for the underlying
+``Solver._build_direct`` paths. They've also been available for many releases
+— using them keeps the suite runnable on older linopy versions.
+
+The actual ``Solver.solve()`` runtime (i.e. solver-side algorithm time) is
+intentionally not benchmarked.
+"""
+
+from __future__ import annotations
+
+import pytest
+
+import linopy.io as lio
+from benchmarks.conftest import maybe_skip
+from benchmarks.registry import (
+    TO_GUROBIPY,
+    TO_HIGHSPY,
+    TO_MOSEK,
+    TO_XPRESS,
+    iter_params,
+)
+from linopy.solvers import available_solvers
+
+# (solver_name, phase tag, wrapper function)
+_SOLVER_PHASES = [
+    ("highs", TO_HIGHSPY, lio.to_highspy),
+    ("gurobi", TO_GUROBIPY, lio.to_gurobipy),
+    ("mosek", TO_MOSEK, lio.to_mosek),
+    ("xpress", TO_XPRESS, lio.to_xpress),
+]
+
+
+def _make_params():
+    out = []
+    for solver_name, phase, wrapper in _SOLVER_PHASES:
+        for spec, size in iter_params(phase):
+            out.append(
+                pytest.param(
+                    solver_name,
+                    wrapper,
+                    spec,
+                    size,
+                    id=f"{solver_name}-{spec.name}-n={size}",
+                )
+            )
+    return out
+
+
+@pytest.mark.parametrize("solver_name,wrapper,spec,size", _make_params())
+def test_solver_handoff(benchmark, solver_name, wrapper, spec, size, request):
+    if solver_name not in available_solvers:
+        pytest.skip(f"{solver_name} not installed")
+    maybe_skip(request, spec, size)
+    model = spec.build(size)
+    benchmark(wrapper, model)

From a6cc83bb8fdc4d68cf05992e3a694a99562e2f3c Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 10:33:30 +0200
Subject: [PATCH 03/68] benchmarks: add --long flag, gate super-long sizes by
 default
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The default ``pytest benchmarks/`` run now skips the slowest 1-2 sizes per
spec (e.g. knapsack at 1M, basic at 1600, pypsa_scigrid at >50) so a full
timing pass completes in ~2 minutes instead of 20-45 minutes.

ModelSpec grows a ``long_threshold`` mirror of ``quick_threshold``:

- ``--quick``  → ``size <= quick_threshold``  (CI smoke)
- default      → ``size <= long_threshold``   (medium-cost regression)
- ``--long``   → no cap                       (full sweep)

Verified locally:
- --quick: 227 passed / 230 skipped / 35s
- default: 333 passed / 124 skipped / 45s
- --long : 457 passed /   0 skipped / 2m13s

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/README.md                       | 14 ++++++++-
 benchmarks/conftest.py                     | 34 ++++++++++++++++++----
 benchmarks/models/basic.py                 |  1 +
 benchmarks/models/expression_arithmetic.py |  1 +
 benchmarks/models/knapsack.py              |  1 +
 benchmarks/models/masked.py                |  1 +
 benchmarks/models/milp.py                  |  1 +
 benchmarks/models/piecewise.py             |  1 +
 benchmarks/models/pypsa_scigrid.py         |  1 +
 benchmarks/models/qp.py                    |  1 +
 benchmarks/models/sos.py                   |  1 +
 benchmarks/models/sparse_network.py        |  1 +
 benchmarks/registry.py                     | 13 ++++++++-
 13 files changed, 64 insertions(+), 7 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 59cc0594..8d2e3fd1 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -84,13 +84,25 @@ source .venv/bin/activate
 
 ## Run benchmarks
 
+The suite has three size tiers, each spec declaring its own `quick_threshold`
+and `long_threshold`:
+
+| Flag       | Sizes included                        | Typical use                              |
+| ---------- | ------------------------------------- | ---------------------------------------- |
+| `--quick`  | `size <= quick_threshold`             | CI smoke, fast local sanity check        |
+| _(none)_   | `size <= long_threshold`              | Default: medium-cost regression timing   |
+| `--long`   | all sizes                             | Full sweep (the slow stuff — many min)   |
+
 ```bash
 # Quick smoke run (small sizes only, no timing)
 pytest benchmarks/ --quick --benchmark-disable
 
-# Full timing run
+# Default timing run (skips the super-long sizes)
 pytest benchmarks/ --benchmark-only
 
+# Full sweep — every size on every model
+pytest benchmarks/ --benchmark-only --long
+
 # A single phase
 pytest benchmarks/test_build.py
 
diff --git a/benchmarks/conftest.py b/benchmarks/conftest.py
index f5c31df2..de416167 100644
--- a/benchmarks/conftest.py
+++ b/benchmarks/conftest.py
@@ -12,17 +12,41 @@ def pytest_addoption(parser):
         "--quick",
         action="store_true",
         default=False,
-        help="Use smaller problem sizes for quick benchmarking",
+        help="Use smaller problem sizes for quick benchmarking (CI smoke).",
+    )
+    parser.addoption(
+        "--long",
+        action="store_true",
+        default=False,
+        help=(
+            "Include the slowest sizes (above each spec's long_threshold). "
+            "Default runs skip them."
+        ),
     )
 
 
 def maybe_skip(request: pytest.FixtureRequest, spec: ModelSpec, size: int) -> None:
     """
-    Apply ``--quick`` size cap and ``spec.requires`` importorskips.
+    Apply size-tier skips and ``spec.requires`` importorskips.
 
-    Centralised so every phase test stays a one-liner.
+    Tiers (most restrictive first):
+
+    - ``--quick``                 → skip ``size > quick_threshold``
+    - default (no flag)           → skip ``size > long_threshold``
+    - ``--long``                  → no size cap
+
+    If both ``--quick`` and ``--long`` are passed, ``--quick`` wins (the more
+    restrictive mode is honoured).
     """
     for mod in spec.requires:
         pytest.importorskip(mod)
-    if request.config.getoption("--quick") and size > spec.quick_threshold:
-        pytest.skip(f"--quick: skipping {spec.name} size {size}")
+
+    quick = request.config.getoption("--quick")
+    long_ = request.config.getoption("--long")
+
+    if quick:
+        if size > spec.quick_threshold:
+            pytest.skip(f"--quick: skipping {spec.name} size {size}")
+    elif not long_:
+        if size > spec.long_threshold:
+            pytest.skip(f"long size needs --long: skipping {spec.name} size {size}")
diff --git a/benchmarks/models/basic.py b/benchmarks/models/basic.py
index 4f1205a7..3f044a17 100644
--- a/benchmarks/models/basic.py
+++ b/benchmarks/models/basic.py
@@ -26,5 +26,6 @@ def build_basic(n: int) -> linopy.Model:
         sizes=SIZES,
         features=frozenset({CONTINUOUS}),
         quick_threshold=100,
+        long_threshold=500,
     )
 )
diff --git a/benchmarks/models/expression_arithmetic.py b/benchmarks/models/expression_arithmetic.py
index 795fce59..43bbd21b 100644
--- a/benchmarks/models/expression_arithmetic.py
+++ b/benchmarks/models/expression_arithmetic.py
@@ -38,5 +38,6 @@ def build_expression_arithmetic(n: int) -> linopy.Model:
         sizes=SIZES,
         features=frozenset({CONTINUOUS}),
         quick_threshold=100,
+        long_threshold=500,
     )
 )
diff --git a/benchmarks/models/knapsack.py b/benchmarks/models/knapsack.py
index 20aa35ec..5491a6b9 100644
--- a/benchmarks/models/knapsack.py
+++ b/benchmarks/models/knapsack.py
@@ -32,5 +32,6 @@ def build_knapsack(n: int) -> linopy.Model:
         features=frozenset({BINARY}),
         phases=DEFAULT_PHASES,  # HiGHS handles binary; matrices handles MILP
         quick_threshold=10_000,
+        long_threshold=10_000,
     )
 )
diff --git a/benchmarks/models/masked.py b/benchmarks/models/masked.py
index 190b4031..366e3ff8 100644
--- a/benchmarks/models/masked.py
+++ b/benchmarks/models/masked.py
@@ -86,5 +86,6 @@ def build_masked(n: int) -> linopy.Model:
         features=frozenset({CONTINUOUS, MASKED}),
         phases=DEFAULT_PHASES,
         quick_threshold=100,
+        long_threshold=500,
     )
 )
diff --git a/benchmarks/models/milp.py b/benchmarks/models/milp.py
index bdad39ab..7c181f30 100644
--- a/benchmarks/models/milp.py
+++ b/benchmarks/models/milp.py
@@ -75,5 +75,6 @@ def build_milp(n: int) -> linopy.Model:
         features=frozenset({INTEGER, CONTINUOUS}),
         phases=DEFAULT_PHASES,
         quick_threshold=25,
+        long_threshold=100,
     )
 )
diff --git a/benchmarks/models/piecewise.py b/benchmarks/models/piecewise.py
index 38cfef84..389e4669 100644
--- a/benchmarks/models/piecewise.py
+++ b/benchmarks/models/piecewise.py
@@ -86,6 +86,7 @@ def build_piecewise(n_gens: int) -> linopy.Model:
             # solver handles.
             phases=DEFAULT_PHASES,
             quick_threshold=100,
+            long_threshold=1_000,
         )
     )
 else:
diff --git a/benchmarks/models/pypsa_scigrid.py b/benchmarks/models/pypsa_scigrid.py
index 29c3e450..a1428d23 100644
--- a/benchmarks/models/pypsa_scigrid.py
+++ b/benchmarks/models/pypsa_scigrid.py
@@ -29,6 +29,7 @@ def build_pypsa_scigrid(snapshots: int = 100) -> linopy.Model:
         sizes=SIZES,
         features=frozenset({CONTINUOUS}),
         quick_threshold=50,
+        long_threshold=50,
         requires=("pypsa",),
     )
 )
diff --git a/benchmarks/models/qp.py b/benchmarks/models/qp.py
index 6e6517d2..4df793da 100644
--- a/benchmarks/models/qp.py
+++ b/benchmarks/models/qp.py
@@ -61,5 +61,6 @@ def build_qp(n_assets: int) -> linopy.Model:
         features=frozenset({CONTINUOUS, QUADRATIC}),
         phases=DEFAULT_PHASES,
         quick_threshold=100,
+        long_threshold=1_000,
     )
 )
diff --git a/benchmarks/models/sos.py b/benchmarks/models/sos.py
index 31cca2ea..e16f5bef 100644
--- a/benchmarks/models/sos.py
+++ b/benchmarks/models/sos.py
@@ -93,6 +93,7 @@ def build_sos(n_gens: int) -> linopy.Model:
                 }
             ),
             quick_threshold=100,
+            long_threshold=1_000,
         )
     )
 else:
diff --git a/benchmarks/models/sparse_network.py b/benchmarks/models/sparse_network.py
index a750ae90..47417032 100644
--- a/benchmarks/models/sparse_network.py
+++ b/benchmarks/models/sparse_network.py
@@ -58,5 +58,6 @@ def build_sparse_network(n_buses: int) -> linopy.Model:
         sizes=SIZES,
         features=frozenset({CONTINUOUS}),
         quick_threshold=100,
+        long_threshold=500,
     )
 )
diff --git a/benchmarks/registry.py b/benchmarks/registry.py
index 07e6c322..228152b8 100644
--- a/benchmarks/registry.py
+++ b/benchmarks/registry.py
@@ -88,7 +88,17 @@
 
 @dataclass(frozen=True)
 class ModelSpec:
-    """Declarative description of one benchmark model."""
+    """
+    Declarative description of one benchmark model.
+
+    Three size tiers gate the cost of a default ``pytest benchmarks/`` run:
+
+    - ``size <= quick_threshold``: included under ``--quick`` (smoke / CI).
+    - ``size <= long_threshold``: included by default (medium-cost regression).
+    - ``size >  long_threshold``: only included under ``--long`` (full sweep).
+
+    Without explicit values, both thresholds default to "no cap".
+    """
 
     name: str
     build: Callable[[int], linopy.Model]
@@ -96,6 +106,7 @@ class ModelSpec:
     features: frozenset[str] = frozenset({CONTINUOUS})
     phases: frozenset[str] = DEFAULT_PHASES
     quick_threshold: int = 10**9
+    long_threshold: int = 10**9
     requires: tuple[str, ...] = ()
 
     def applies_to(self, phase: str) -> bool:

From 300abb5e39a259c1bdf3d7c1cde5e613986739fd Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 10:41:54 +0200
Subject: [PATCH 04/68] =?UTF-8?q?benchmarks:=20make=20--quick=20truly=20qu?=
 =?UTF-8?q?ick=20(35s=20=E2=86=92=2018s)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Drop pypsa_scigrid from --quick entirely (quick_threshold=0). PyPSA
  import + example loading dominates the smoke wall-clock; the model
  still runs in default and --long modes.
- Lower every other spec's quick_threshold to its smallest size, so
  --quick exercises one size per model across all phases. The default
  tier (which uses long_threshold) still gives broad regression coverage.

Verified locally:
- --quick:  85 passed / 372 skipped / 18.5s   (was 35s)
- default: 333 passed / 124 skipped / 44.8s   (unchanged)
- --long : 457 passed /   0 skipped / 2m11s   (unchanged)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/models/basic.py                 | 2 +-
 benchmarks/models/expression_arithmetic.py | 2 +-
 benchmarks/models/knapsack.py              | 2 +-
 benchmarks/models/masked.py                | 2 +-
 benchmarks/models/milp.py                  | 2 +-
 benchmarks/models/piecewise.py             | 2 +-
 benchmarks/models/pypsa_scigrid.py         | 5 ++++-
 benchmarks/models/qp.py                    | 2 +-
 benchmarks/models/sos.py                   | 2 +-
 benchmarks/models/sparse_network.py        | 2 +-
 10 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/benchmarks/models/basic.py b/benchmarks/models/basic.py
index 3f044a17..6959e188 100644
--- a/benchmarks/models/basic.py
+++ b/benchmarks/models/basic.py
@@ -25,7 +25,7 @@ def build_basic(n: int) -> linopy.Model:
         build=build_basic,
         sizes=SIZES,
         features=frozenset({CONTINUOUS}),
-        quick_threshold=100,
+        quick_threshold=10,
         long_threshold=500,
     )
 )
diff --git a/benchmarks/models/expression_arithmetic.py b/benchmarks/models/expression_arithmetic.py
index 43bbd21b..80590951 100644
--- a/benchmarks/models/expression_arithmetic.py
+++ b/benchmarks/models/expression_arithmetic.py
@@ -37,7 +37,7 @@ def build_expression_arithmetic(n: int) -> linopy.Model:
         build=build_expression_arithmetic,
         sizes=SIZES,
         features=frozenset({CONTINUOUS}),
-        quick_threshold=100,
+        quick_threshold=10,
         long_threshold=500,
     )
 )
diff --git a/benchmarks/models/knapsack.py b/benchmarks/models/knapsack.py
index 5491a6b9..7860f285 100644
--- a/benchmarks/models/knapsack.py
+++ b/benchmarks/models/knapsack.py
@@ -31,7 +31,7 @@ def build_knapsack(n: int) -> linopy.Model:
         sizes=SIZES,
         features=frozenset({BINARY}),
         phases=DEFAULT_PHASES,  # HiGHS handles binary; matrices handles MILP
-        quick_threshold=10_000,
+        quick_threshold=100,
         long_threshold=10_000,
     )
 )
diff --git a/benchmarks/models/masked.py b/benchmarks/models/masked.py
index 366e3ff8..fccac137 100644
--- a/benchmarks/models/masked.py
+++ b/benchmarks/models/masked.py
@@ -85,7 +85,7 @@ def build_masked(n: int) -> linopy.Model:
         sizes=SIZES,
         features=frozenset({CONTINUOUS, MASKED}),
         phases=DEFAULT_PHASES,
-        quick_threshold=100,
+        quick_threshold=10,
         long_threshold=500,
     )
 )
diff --git a/benchmarks/models/milp.py b/benchmarks/models/milp.py
index 7c181f30..e762f207 100644
--- a/benchmarks/models/milp.py
+++ b/benchmarks/models/milp.py
@@ -74,7 +74,7 @@ def build_milp(n: int) -> linopy.Model:
         sizes=SIZES,
         features=frozenset({INTEGER, CONTINUOUS}),
         phases=DEFAULT_PHASES,
-        quick_threshold=25,
+        quick_threshold=10,
         long_threshold=100,
     )
 )
diff --git a/benchmarks/models/piecewise.py b/benchmarks/models/piecewise.py
index 389e4669..0fb393bd 100644
--- a/benchmarks/models/piecewise.py
+++ b/benchmarks/models/piecewise.py
@@ -85,7 +85,7 @@ def build_piecewise(n_gens: int) -> linopy.Model:
             # reformulation (pure MILP with binaries), which every supported
             # solver handles.
             phases=DEFAULT_PHASES,
-            quick_threshold=100,
+            quick_threshold=10,
             long_threshold=1_000,
         )
     )
diff --git a/benchmarks/models/pypsa_scigrid.py b/benchmarks/models/pypsa_scigrid.py
index a1428d23..41d8836b 100644
--- a/benchmarks/models/pypsa_scigrid.py
+++ b/benchmarks/models/pypsa_scigrid.py
@@ -28,7 +28,10 @@ def build_pypsa_scigrid(snapshots: int = 100) -> linopy.Model:
         build=build_pypsa_scigrid,
         sizes=SIZES,
         features=frozenset({CONTINUOUS}),
-        quick_threshold=50,
+        # quick_threshold=0 keeps pypsa_scigrid out of --quick entirely —
+        # PyPSA import + example loading dominates the smoke wall-clock
+        # otherwise. It still runs in default and --long modes.
+        quick_threshold=0,
         long_threshold=50,
         requires=("pypsa",),
     )
diff --git a/benchmarks/models/qp.py b/benchmarks/models/qp.py
index 4df793da..a040df45 100644
--- a/benchmarks/models/qp.py
+++ b/benchmarks/models/qp.py
@@ -60,7 +60,7 @@ def build_qp(n_assets: int) -> linopy.Model:
         sizes=SIZES,
         features=frozenset({CONTINUOUS, QUADRATIC}),
         phases=DEFAULT_PHASES,
-        quick_threshold=100,
+        quick_threshold=10,
         long_threshold=1_000,
     )
 )
diff --git a/benchmarks/models/sos.py b/benchmarks/models/sos.py
index e16f5bef..c5ac1d36 100644
--- a/benchmarks/models/sos.py
+++ b/benchmarks/models/sos.py
@@ -92,7 +92,7 @@ def build_sos(n_gens: int) -> linopy.Model:
                     TO_XPRESS,
                 }
             ),
-            quick_threshold=100,
+            quick_threshold=10,
             long_threshold=1_000,
         )
     )
diff --git a/benchmarks/models/sparse_network.py b/benchmarks/models/sparse_network.py
index 47417032..7ac71db1 100644
--- a/benchmarks/models/sparse_network.py
+++ b/benchmarks/models/sparse_network.py
@@ -57,7 +57,7 @@ def build_sparse_network(n_buses: int) -> linopy.Model:
         build=build_sparse_network,
         sizes=SIZES,
         features=frozenset({CONTINUOUS}),
-        quick_threshold=100,
+        quick_threshold=10,
         long_threshold=500,
     )
 )

From c725c6857842a05212f3753f2ec02c2ece5b96fd Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 10:57:24 +0200
Subject: [PATCH 05/68] benchmarks: add registry-usage notebook + execute in CI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

benchmarks/notebooks/registry_usage.py is the canonical walkthrough for
the model registry. Authored in jupytext percent format so it triples
as:

- runnable Python script (CI executes it on every PR)
- notebook in JupyterLab / VSCode with the jupytext extension
- readable doc on GitHub (markdown cells render directly)

Covers: import, lookup by name, iterate, filter_by feature/phase,
parametrize-your-own-pytest pattern, one-off tracemalloc profiling,
and the three CLI tiers.

CI: benchmark-smoke.yml gains an "Execute registry-usage notebook" step
right after the pytest smoke — so doc rot fails the build instead of
hiding until someone next opens the file.

README: new "Worked walkthrough" subsection points at the notebook.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/benchmark-smoke.yml  |   6 +
 benchmarks/README.md                   |  13 ++
 benchmarks/notebooks/registry_usage.py | 204 +++++++++++++++++++++++++
 3 files changed, 223 insertions(+)
 create mode 100644 benchmarks/notebooks/registry_usage.py

diff --git a/.github/workflows/benchmark-smoke.yml b/.github/workflows/benchmark-smoke.yml
index c6b37028..eeb6d2c5 100644
--- a/.github/workflows/benchmark-smoke.yml
+++ b/.github/workflows/benchmark-smoke.yml
@@ -39,3 +39,9 @@ jobs:
     - name: Run benchmark smoke
       run: |
         pytest benchmarks/ --quick --benchmark-disable -q
+
+    - name: Execute registry-usage notebook
+      # Catches doc rot — the notebook is the canonical "how to use the
+      # registry" walkthrough and must stay runnable end-to-end.
+      run: |
+        python benchmarks/notebooks/registry_usage.py
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 8d2e3fd1..19df4572 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -75,6 +75,19 @@ To add a new model, drop a file under `benchmarks/models/`, expose a
 `build_<name>(size)`, and call `register(ModelSpec(...))`. Import it from
 `benchmarks/models/__init__.py` so the registration fires.
 
+### Worked walkthrough
+
+[`notebooks/registry_usage.py`](notebooks/registry_usage.py) is a jupytext
+percent-format notebook that runs through every pattern above end-to-end.
+Three ways to consume it:
+
+- **Script:** `python benchmarks/notebooks/registry_usage.py` — every cell
+  executes top-to-bottom; CI runs it this way on every PR so the examples
+  can't silently rot.
+- **Notebook:** open the `.py` file in JupyterLab or VSCode with the jupytext
+  extension installed and it renders as a notebook.
+- **Read:** the `# %% [markdown]` blocks render fine on GitHub directly.
+
 ## Setup
 
 ```bash
diff --git a/benchmarks/notebooks/registry_usage.py b/benchmarks/notebooks/registry_usage.py
new file mode 100644
index 00000000..e999970b
--- /dev/null
+++ b/benchmarks/notebooks/registry_usage.py
@@ -0,0 +1,204 @@
+# ---
+# jupyter:
+#   jupytext:
+#     formats: py:percent
+#     text_representation:
+#       extension: .py
+#       format_name: percent
+#       format_version: '1.3'
+#       jupytext_version: 1.16.0
+#   kernelspec:
+#     display_name: Python 3
+#     language: python
+#     name: python3
+# ---
+
+# %% [markdown]
+# # Benchmark model registry — usage guide
+#
+# This file is the canonical walkthrough for the benchmark **model registry**.
+# It's authored in [jupytext](https://jupytext.readthedocs.io/) percent format,
+# which means:
+#
+# - **Run as a script:** `python benchmarks/notebooks/registry_usage.py` — every
+#   pattern below executes end-to-end. CI runs it this way on every PR, so the
+#   examples can't silently rot.
+# - **Open as a notebook:** in JupyterLab or VSCode with the jupytext extension,
+#   this file appears as a notebook with markdown + code cells.
+# - **Lint:** `ruff check` works because it's plain Python.
+#
+# The registry lives in `benchmarks/registry.py`. Each model file under
+# `benchmarks/models/` self-registers a `ModelSpec` on import, so just touching
+# the `benchmarks` package populates `REGISTRY`.
+
+# %% [markdown]
+# ## 1. Import the registry
+#
+# Single entry point: `from benchmarks import REGISTRY` plus the feature / phase
+# constants you need for filtering.
+
+# %%
+# Put the repo root on sys.path so the file runs from anywhere
+# (e.g. ``python benchmarks/notebooks/registry_usage.py``).
+import sys
+from pathlib import Path
+
+_REPO_ROOT = Path(__file__).resolve().parents[2]
+if str(_REPO_ROOT) not in sys.path:
+    sys.path.insert(0, str(_REPO_ROOT))
+
+from benchmarks import (  # noqa: E402
+    INTEGER,
+    QUADRATIC,
+    REGISTRY,
+    TO_GUROBIPY,
+    filter_by,
+    get,
+)
+
+print(f"{len(REGISTRY)} models registered: {sorted(REGISTRY)}")
+
+# %% [markdown]
+# ## 2. Look up one model by name
+#
+# `REGISTRY[name]` returns a `ModelSpec` (frozen dataclass). `.build(size)`
+# constructs and returns a `linopy.Model`.
+
+# %%
+spec = REGISTRY["basic"]
+print(f"name:            {spec.name}")
+print(f"sizes:           {spec.sizes}")
+print(f"features:        {sorted(spec.features)}")
+print(f"quick_threshold: {spec.quick_threshold}")
+print(f"long_threshold:  {spec.long_threshold}")
+
+m = spec.build(50)
+print(
+    f"\nbuilt at n=50: {len(m.variables)} variable arrays, "
+    f"{len(m.constraints)} constraint arrays"
+)
+
+# %% [markdown]
+# `get("name")` is an equivalent functional accessor — handy when you don't
+# want to import `REGISTRY` directly.
+
+# %%
+assert get("basic") is REGISTRY["basic"]
+
+# %% [markdown]
+# ## 3. Iterate the whole registry
+#
+# Useful when you want to sweep your own test or profiling logic across every
+# model — e.g. checking that a refactor didn't break any spec.
+
+# %%
+print(f"{'name':<25} {'features':<35} {'sizes':<20}")
+print("-" * 80)
+for name, spec in REGISTRY.items():
+    feats = ",".join(sorted(spec.features))
+    sizes = f"{spec.sizes[0]}..{spec.sizes[-1]}"
+    print(f"{name:<25} {feats:<35} {sizes:<20}")
+
+# %% [markdown]
+# ## 4. Filter by feature
+#
+# `filter_by(has_feature=...)` returns specs that advertise that feature. The
+# feature tag constants (`CONTINUOUS`, `BINARY`, `INTEGER`, `QUADRATIC`, `SOS`,
+# `PIECEWISE`, `MASKED`) are exported from `benchmarks`.
+
+# %%
+qp_specs = filter_by(has_feature=QUADRATIC)
+print("Quadratic models:", [s.name for s in qp_specs])
+
+mip_specs = filter_by(has_feature=INTEGER)
+print("Integer models:  ", [s.name for s in mip_specs])
+
+# %% [markdown]
+# ## 5. Filter by phase
+#
+# Each spec declares which **phases** apply — `BUILD`, `MATRICES`, `LP_WRITE`,
+# `NETCDF`, `SOLVER_BUILD`, plus per-solver `TO_HIGHSPY` / `TO_GUROBIPY` /
+# `TO_MOSEK` / `TO_XPRESS`. Use `has_phase=` to narrow to solver-compatible
+# models, e.g. when writing a Gurobi-specific regression test.
+
+# %%
+gurobi_specs = filter_by(has_phase=TO_GUROBIPY)
+print(f"{len(gurobi_specs)} models declare TO_GUROBIPY:")
+for s in gurobi_specs:
+    print(f"  - {s.name}")
+
+# %% [markdown]
+# ## 6. Reuse pattern — parametrize your own pytest
+#
+# The pattern the suite itself uses (see `benchmarks/test_build.py` etc.) —
+# `iter_params(phase)` returns `(spec, size)` pairs for the given phase, and
+# `param_ids(...)` builds stable test IDs for `pytest.mark.parametrize`:
+#
+# ```python
+# import pytest
+# from benchmarks import BUILD, iter_params, param_ids
+#
+# _PARAMS = iter_params(BUILD)
+#
+# @pytest.mark.parametrize("spec,size", _PARAMS, ids=param_ids(_PARAMS))
+# def test_my_invariant(spec, size):
+#     m = spec.build(size)
+#     # ... assertion that should hold for every model
+# ```
+
+# %% [markdown]
+# ## 7. Reuse pattern — one-off profiling
+#
+# Grab a single model at a chosen size, measure something, throw it away.
+# `tracemalloc` works well for in-process peak-RSS spot checks (use
+# `benchmarks/memory.py` + pytest-memray for the real metric).
+
+# %%
+import tracemalloc  # noqa: E402
+
+tracemalloc.start()
+m = REGISTRY["sparse_network"].build(100)
+_current, peak = tracemalloc.get_traced_memory()
+tracemalloc.stop()
+
+print(f"sparse_network n=100: built, peak allocation ≈ {peak / 1e6:.1f} MB")
+print(
+    f"  {m.variables.nvars} scalar variables, {m.constraints.ncons} scalar constraints"
+)
+
+# %% [markdown]
+# ## 8. Running the benchmark suite
+#
+# Three size tiers, configured per-spec via `quick_threshold` and
+# `long_threshold`:
+#
+# | Flag        | Sizes included            | Use case                              |
+# | ----------- | ------------------------- | ------------------------------------- |
+# | `--quick`   | `size <= quick_threshold` | CI smoke (~18s, one size per model)   |
+# | _(none)_    | `size <= long_threshold`  | Local regression run (~45s)           |
+# | `--long`    | all sizes                 | Full sweep (~2 min, slow stuff)       |
+#
+# ```bash
+# # Quickest smoke
+# pytest benchmarks/ --quick --benchmark-disable
+#
+# # Default timing
+# pytest benchmarks/ --benchmark-only
+#
+# # Full sweep with the slow sizes
+# pytest benchmarks/ --benchmark-only --long
+#
+# # Pick a single (phase, model) pair
+# pytest benchmarks/test_lp_write.py -k "knapsack and n=1000"
+# ```
+
+# %% [markdown]
+# ## 9. Adding a new model
+#
+# 1. Drop `benchmarks/models/<name>.py` with a `build_<name>(size) -> Model`.
+# 2. Build a `ModelSpec` and call `register(...)` at module scope. Declare
+#    realistic `quick_threshold` / `long_threshold` so the smoke stays fast.
+# 3. Add an import in `benchmarks/models/__init__.py` so registration fires.
+#
+# That's it — every phase test picks the spec up automatically through
+# `iter_params(phase)`.

From 99483f8543d43ad4d762a332d4ef41ad574d5a56 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 11:53:51 +0200
Subject: [PATCH 06/68] benchmarks: switch walkthrough to .ipynb, add reprs to
 ModelSpec
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Replace jupytext-style ``registry_usage.py`` with a proper
  ``registry_usage.ipynb`` — matches the repo convention (examples/*.ipynb,
  nbsphinx, nbstripout). CI executes it via ``jupyter nbconvert --execute``.
- Add ``__repr__`` (one-line summary) and ``_repr_html_`` (attribute table)
  to ModelSpec. Visible in pytest -v output, in interactive Python, and as
  rich HTML in Jupyter cells.
- Notebook simplified to lean on the new reprs: explicit-attribute prints
  in sections 2-5 replaced by bare expression evaluations.
- README points at the .ipynb and notes the "launch jupyter from repo root"
  convention.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/benchmark-smoke.yml     |   5 +-
 benchmarks/README.md                      |  20 +-
 benchmarks/notebooks/registry_usage.ipynb | 329 ++++++++++++++++++++++
 benchmarks/notebooks/registry_usage.py    | 204 --------------
 benchmarks/registry.py                    |  32 ++-
 5 files changed, 374 insertions(+), 216 deletions(-)
 create mode 100644 benchmarks/notebooks/registry_usage.ipynb
 delete mode 100644 benchmarks/notebooks/registry_usage.py

diff --git a/.github/workflows/benchmark-smoke.yml b/.github/workflows/benchmark-smoke.yml
index eeb6d2c5..8cffb97d 100644
--- a/.github/workflows/benchmark-smoke.yml
+++ b/.github/workflows/benchmark-smoke.yml
@@ -44,4 +44,7 @@ jobs:
       # Catches doc rot — the notebook is the canonical "how to use the
       # registry" walkthrough and must stay runnable end-to-end.
       run: |
-        python benchmarks/notebooks/registry_usage.py
+        jupyter nbconvert --to notebook --execute \
+          --ExecutePreprocessor.timeout=300 \
+          --output executed.ipynb \
+          benchmarks/notebooks/registry_usage.ipynb
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 19df4572..39a22d46 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -77,16 +77,16 @@ To add a new model, drop a file under `benchmarks/models/`, expose a
 
 ### Worked walkthrough
 
-[`notebooks/registry_usage.py`](notebooks/registry_usage.py) is a jupytext
-percent-format notebook that runs through every pattern above end-to-end.
-Three ways to consume it:
-
-- **Script:** `python benchmarks/notebooks/registry_usage.py` — every cell
-  executes top-to-bottom; CI runs it this way on every PR so the examples
-  can't silently rot.
-- **Notebook:** open the `.py` file in JupyterLab or VSCode with the jupytext
-  extension installed and it renders as a notebook.
-- **Read:** the `# %% [markdown]` blocks render fine on GitHub directly.
+[`notebooks/registry_usage.ipynb`](notebooks/registry_usage.ipynb) is the
+canonical walkthrough — it runs through every pattern above end-to-end.
+GitHub renders it inline. CI executes it on every PR via `jupyter nbconvert
+--execute`, so the examples can't silently rot.
+
+Open it locally with JupyterLab launched from the repo root:
+
+```bash
+jupyter lab benchmarks/notebooks/registry_usage.ipynb
+```
 
 ## Setup
 
diff --git a/benchmarks/notebooks/registry_usage.ipynb b/benchmarks/notebooks/registry_usage.ipynb
new file mode 100644
index 00000000..17511e5d
--- /dev/null
+++ b/benchmarks/notebooks/registry_usage.ipynb
@@ -0,0 +1,329 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "0",
+   "metadata": {},
+   "source": [
+    "# Benchmark model registry — usage guide\n",
+    "\n",
+    "This notebook is the canonical walkthrough for the benchmark\n",
+    "**model registry**. CI executes it end-to-end on every PR\n",
+    "(`jupyter nbconvert --execute`), so the examples can't silently rot.\n",
+    "\n",
+    "Launching jupyter from the repo root is the convention (as with `examples/*.ipynb`);\n",
+    "the first code cell also finds the repo root when started from a subdirectory.\n",
+    "\n",
+    "The registry lives in `benchmarks/registry.py`. Each model file under\n",
+    "`benchmarks/models/` self-registers a `ModelSpec` on import, so just touching\n",
+    "the `benchmarks` package populates `REGISTRY`."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1",
+   "metadata": {},
+   "source": [
+    "## 1. Import the registry\n",
+    "\n",
+    "Single entry point: `from benchmarks import REGISTRY` plus the feature / phase\n",
+    "constants you need for filtering."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# The benchmark suite isn't shipped in the linopy wheel — it lives in-tree.\n",
+    "# Find the repo root by walking up from cwd and put it on sys.path so the\n",
+    "# import resolves whether jupyter was launched from the repo root, the\n",
+    "# notebooks directory, or anywhere in between.\n",
+    "import sys\n",
+    "from pathlib import Path\n",
+    "\n",
+    "_p = Path.cwd()\n",
+    "while _p != _p.parent:\n",
+    "    if (_p / \"benchmarks\" / \"registry.py\").exists():\n",
+    "        if str(_p) not in sys.path:\n",
+    "            sys.path.insert(0, str(_p))\n",
+    "        break\n",
+    "    _p = _p.parent\n",
+    "\n",
+    "from benchmarks import (  # noqa: E402\n",
+    "    INTEGER,\n",
+    "    QUADRATIC,\n",
+    "    REGISTRY,\n",
+    "    TO_GUROBIPY,\n",
+    "    filter_by,\n",
+    "    get,\n",
+    ")\n",
+    "\n",
+    "print(f\"{len(REGISTRY)} models registered: {sorted(REGISTRY)}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3",
+   "metadata": {},
+   "source": [
+    "## 2. Look up one model by name\n",
+    "\n",
+    "`REGISTRY[name]` returns a `ModelSpec` (frozen dataclass). Evaluating it\n",
+    "renders a full attribute table in Jupyter; `__repr__` gives a one-line\n",
+    "summary in scripts or `pytest -v` output."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spec = REGISTRY[\"basic\"]\n",
+    "spec"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5",
+   "metadata": {},
+   "source": [
+    "`.build(size)` constructs and returns a `linopy.Model`:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "spec.build(50)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7",
+   "metadata": {},
+   "source": [
+    "`get(\"name\")` is an equivalent functional accessor — handy when you don't\n",
+    "want to import `REGISTRY` directly."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "assert get(\"basic\") is REGISTRY[\"basic\"]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9",
+   "metadata": {},
+   "source": [
+    "## 3. Iterate the whole registry\n",
+    "\n",
+    "Useful when you want to sweep your own test or profiling logic across every\n",
+    "model — e.g. checking that a refactor didn't break any spec. Each spec's\n",
+    "`__repr__` carries enough info for an at-a-glance overview."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "10",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "list(REGISTRY.values())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "11",
+   "metadata": {},
+   "source": [
+    "## 4. Filter by feature\n",
+    "\n",
+    "`filter_by(has_feature=...)` returns specs that advertise that feature. The\n",
+    "feature tag constants (`CONTINUOUS`, `BINARY`, `INTEGER`, `QUADRATIC`, `SOS`,\n",
+    "`PIECEWISE`, `MASKED`) are exported from `benchmarks`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "12",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "filter_by(has_feature=QUADRATIC)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "13",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "filter_by(has_feature=INTEGER)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "14",
+   "metadata": {},
+   "source": [
+    "## 5. Filter by phase\n",
+    "\n",
+    "Each spec declares which **phases** apply — `BUILD`, `MATRICES`, `LP_WRITE`,\n",
+    "`NETCDF`, `SOLVER_BUILD`, plus per-solver `TO_HIGHSPY` / `TO_GUROBIPY` /\n",
+    "`TO_MOSEK` / `TO_XPRESS`. Use `has_phase=` to narrow to solver-compatible\n",
+    "models, e.g. when writing a Gurobi-specific regression test."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "15",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "filter_by(has_phase=TO_GUROBIPY)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "16",
+   "metadata": {},
+   "source": [
+    "## 6. Reuse pattern — parametrize your own pytest\n",
+    "\n",
+    "The pattern the suite itself uses (see `benchmarks/test_build.py` etc.) —\n",
+    "`iter_params(phase)` returns `(spec, size)` pairs for the given phase, and\n",
+    "`param_ids(...)` builds stable test IDs for `pytest.mark.parametrize`:\n",
+    "\n",
+    "```python\n",
+    "import pytest\n",
+    "from benchmarks import BUILD, iter_params, param_ids\n",
+    "\n",
+    "_PARAMS = iter_params(BUILD)\n",
+    "\n",
+    "@pytest.mark.parametrize(\"spec,size\", _PARAMS, ids=param_ids(_PARAMS))\n",
+    "def test_my_invariant(spec, size):\n",
+    "    m = spec.build(size)\n",
+    "    # ... assertion that should hold for every model\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "17",
+   "metadata": {},
+   "source": [
+    "## 7. Reuse pattern — one-off profiling\n",
+    "\n",
+    "Grab a single model at a chosen size, measure something, throw it away.\n",
+    "`tracemalloc` works well for in-process peak-allocation spot checks (use\n",
+    "`benchmarks/memory.py` + pytest-memray for the real metric)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "18",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tracemalloc  # noqa: E402\n",
+    "\n",
+    "tracemalloc.start()\n",
+    "m = REGISTRY[\"sparse_network\"].build(100)\n",
+    "_current, peak = tracemalloc.get_traced_memory()\n",
+    "tracemalloc.stop()\n",
+    "\n",
+    "print(f\"sparse_network n=100: built, peak allocation ≈ {peak / 1e6:.1f} MB\")\n",
+    "print(\n",
+    "    f\"  {m.variables.nvars} scalar variables, {m.constraints.ncons} scalar constraints\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "19",
+   "metadata": {},
+   "source": [
+    "## 8. Running the benchmark suite\n",
+    "\n",
+    "Three size tiers, configured per-spec via `quick_threshold` and\n",
+    "`long_threshold`:\n",
+    "\n",
+    "| Flag        | Sizes included            | Use case                              |\n",
+    "| ----------- | ------------------------- | ------------------------------------- |\n",
+    "| `--quick`   | `size <= quick_threshold` | CI smoke (~18s, one size per model)   |\n",
+    "| _(none)_    | `size <= long_threshold`  | Local regression run (~45s)           |\n",
+    "| `--long`    | all sizes                 | Full sweep (~2 min, slow stuff)       |\n",
+    "\n",
+    "```bash\n",
+    "# Quickest smoke\n",
+    "pytest benchmarks/ --quick --benchmark-disable\n",
+    "\n",
+    "# Default timing\n",
+    "pytest benchmarks/ --benchmark-only\n",
+    "\n",
+    "# Full sweep with the slow sizes\n",
+    "pytest benchmarks/ --benchmark-only --long\n",
+    "\n",
+    "# Pick a single (phase, model) pair\n",
+    "pytest benchmarks/test_lp_write.py -k \"knapsack and n=1000\"\n",
+    "```"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "20",
+   "metadata": {},
+   "source": [
+    "## 9. Adding a new model\n",
+    "\n",
+    "1. Drop `benchmarks/models/<name>.py` with a `build_<name>(size) -> Model`.\n",
+    "2. Build a `ModelSpec` and call `register(...)` at module scope. Declare\n",
+    "   realistic `quick_threshold` / `long_threshold` so the smoke stays fast.\n",
+    "3. Add an import in `benchmarks/models/__init__.py` so registration fires.\n",
+    "\n",
+    "That's it — every phase test picks the spec up automatically through\n",
+    "`iter_params(phase)`."
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/benchmarks/notebooks/registry_usage.py b/benchmarks/notebooks/registry_usage.py
deleted file mode 100644
index e999970b..00000000
--- a/benchmarks/notebooks/registry_usage.py
+++ /dev/null
@@ -1,204 +0,0 @@
-# ---
-# jupyter:
-#   jupytext:
-#     formats: py:percent
-#     text_representation:
-#       extension: .py
-#       format_name: percent
-#       format_version: '1.3'
-#       jupytext_version: 1.16.0
-#   kernelspec:
-#     display_name: Python 3
-#     language: python
-#     name: python3
-# ---
-
-# %% [markdown]
-# # Benchmark model registry — usage guide
-#
-# This file is the canonical walkthrough for the benchmark **model registry**.
-# It's authored in [jupytext](https://jupytext.readthedocs.io/) percent format,
-# which means:
-#
-# - **Run as a script:** `python benchmarks/notebooks/registry_usage.py` — every
-#   pattern below executes end-to-end. CI runs it this way on every PR, so the
-#   examples can't silently rot.
-# - **Open as a notebook:** in JupyterLab or VSCode with the jupytext extension,
-#   this file appears as a notebook with markdown + code cells.
-# - **Lint:** `ruff check` works because it's plain Python.
-#
-# The registry lives in `benchmarks/registry.py`. Each model file under
-# `benchmarks/models/` self-registers a `ModelSpec` on import, so just touching
-# the `benchmarks` package populates `REGISTRY`.
-
-# %% [markdown]
-# ## 1. Import the registry
-#
-# Single entry point: `from benchmarks import REGISTRY` plus the feature / phase
-# constants you need for filtering.
-
-# %%
-# Put the repo root on sys.path so the file runs from anywhere
-# (e.g. ``python benchmarks/notebooks/registry_usage.py``).
-import sys
-from pathlib import Path
-
-_REPO_ROOT = Path(__file__).resolve().parents[2]
-if str(_REPO_ROOT) not in sys.path:
-    sys.path.insert(0, str(_REPO_ROOT))
-
-from benchmarks import (  # noqa: E402
-    INTEGER,
-    QUADRATIC,
-    REGISTRY,
-    TO_GUROBIPY,
-    filter_by,
-    get,
-)
-
-print(f"{len(REGISTRY)} models registered: {sorted(REGISTRY)}")
-
-# %% [markdown]
-# ## 2. Look up one model by name
-#
-# `REGISTRY[name]` returns a `ModelSpec` (frozen dataclass). `.build(size)`
-# constructs and returns a `linopy.Model`.
-
-# %%
-spec = REGISTRY["basic"]
-print(f"name:            {spec.name}")
-print(f"sizes:           {spec.sizes}")
-print(f"features:        {sorted(spec.features)}")
-print(f"quick_threshold: {spec.quick_threshold}")
-print(f"long_threshold:  {spec.long_threshold}")
-
-m = spec.build(50)
-print(
-    f"\nbuilt at n=50: {len(m.variables)} variable arrays, "
-    f"{len(m.constraints)} constraint arrays"
-)
-
-# %% [markdown]
-# `get("name")` is an equivalent functional accessor — handy when you don't
-# want to import `REGISTRY` directly.
-
-# %%
-assert get("basic") is REGISTRY["basic"]
-
-# %% [markdown]
-# ## 3. Iterate the whole registry
-#
-# Useful when you want to sweep your own test or profiling logic across every
-# model — e.g. checking that a refactor didn't break any spec.
-
-# %%
-print(f"{'name':<25} {'features':<35} {'sizes':<20}")
-print("-" * 80)
-for name, spec in REGISTRY.items():
-    feats = ",".join(sorted(spec.features))
-    sizes = f"{spec.sizes[0]}..{spec.sizes[-1]}"
-    print(f"{name:<25} {feats:<35} {sizes:<20}")
-
-# %% [markdown]
-# ## 4. Filter by feature
-#
-# `filter_by(has_feature=...)` returns specs that advertise that feature. The
-# feature tag constants (`CONTINUOUS`, `BINARY`, `INTEGER`, `QUADRATIC`, `SOS`,
-# `PIECEWISE`, `MASKED`) are exported from `benchmarks`.
-
-# %%
-qp_specs = filter_by(has_feature=QUADRATIC)
-print("Quadratic models:", [s.name for s in qp_specs])
-
-mip_specs = filter_by(has_feature=INTEGER)
-print("Integer models:  ", [s.name for s in mip_specs])
-
-# %% [markdown]
-# ## 5. Filter by phase
-#
-# Each spec declares which **phases** apply — `BUILD`, `MATRICES`, `LP_WRITE`,
-# `NETCDF`, `SOLVER_BUILD`, plus per-solver `TO_HIGHSPY` / `TO_GUROBIPY` /
-# `TO_MOSEK` / `TO_XPRESS`. Use `has_phase=` to narrow to solver-compatible
-# models, e.g. when writing a Gurobi-specific regression test.
-
-# %%
-gurobi_specs = filter_by(has_phase=TO_GUROBIPY)
-print(f"{len(gurobi_specs)} models declare TO_GUROBIPY:")
-for s in gurobi_specs:
-    print(f"  - {s.name}")
-
-# %% [markdown]
-# ## 6. Reuse pattern — parametrize your own pytest
-#
-# The pattern the suite itself uses (see `benchmarks/test_build.py` etc.) —
-# `iter_params(phase)` returns `(spec, size)` pairs for the given phase, and
-# `param_ids(...)` builds stable test IDs for `pytest.mark.parametrize`:
-#
-# ```python
-# import pytest
-# from benchmarks import BUILD, iter_params, param_ids
-#
-# _PARAMS = iter_params(BUILD)
-#
-# @pytest.mark.parametrize("spec,size", _PARAMS, ids=param_ids(_PARAMS))
-# def test_my_invariant(spec, size):
-#     m = spec.build(size)
-#     # ... assertion that should hold for every model
-# ```
-
-# %% [markdown]
-# ## 7. Reuse pattern — one-off profiling
-#
-# Grab a single model at a chosen size, measure something, throw it away.
-# `tracemalloc` works well for in-process peak-RSS spot checks (use
-# `benchmarks/memory.py` + pytest-memray for the real metric).
-
-# %%
-import tracemalloc  # noqa: E402
-
-tracemalloc.start()
-m = REGISTRY["sparse_network"].build(100)
-_current, peak = tracemalloc.get_traced_memory()
-tracemalloc.stop()
-
-print(f"sparse_network n=100: built, peak allocation ≈ {peak / 1e6:.1f} MB")
-print(
-    f"  {m.variables.nvars} scalar variables, {m.constraints.ncons} scalar constraints"
-)
-
-# %% [markdown]
-# ## 8. Running the benchmark suite
-#
-# Three size tiers, configured per-spec via `quick_threshold` and
-# `long_threshold`:
-#
-# | Flag        | Sizes included            | Use case                              |
-# | ----------- | ------------------------- | ------------------------------------- |
-# | `--quick`   | `size <= quick_threshold` | CI smoke (~18s, one size per model)   |
-# | _(none)_    | `size <= long_threshold`  | Local regression run (~45s)           |
-# | `--long`    | all sizes                 | Full sweep (~2 min, slow stuff)       |
-#
-# ```bash
-# # Quickest smoke
-# pytest benchmarks/ --quick --benchmark-disable
-#
-# # Default timing
-# pytest benchmarks/ --benchmark-only
-#
-# # Full sweep with the slow sizes
-# pytest benchmarks/ --benchmark-only --long
-#
-# # Pick a single (phase, model) pair
-# pytest benchmarks/test_lp_write.py -k "knapsack and n=1000"
-# ```
-
-# %% [markdown]
-# ## 9. Adding a new model
-#
-# 1. Drop `benchmarks/models/<name>.py` with a `build_<name>(size) -> Model`.
-# 2. Build a `ModelSpec` and call `register(...)` at module scope. Declare
-#    realistic `quick_threshold` / `long_threshold` so the smoke stays fast.
-# 3. Add an import in `benchmarks/models/__init__.py` so registration fires.
-#
-# That's it — every phase test picks the spec up automatically through
-# `iter_params(phase)`.
diff --git a/benchmarks/registry.py b/benchmarks/registry.py
index 228152b8..dbee2281 100644
--- a/benchmarks/registry.py
+++ b/benchmarks/registry.py
@@ -86,7 +86,7 @@
 )
 
 
-@dataclass(frozen=True)
+@dataclass(frozen=True, repr=False)
 class ModelSpec:
     """
     Declarative description of one benchmark model.
@@ -115,6 +115,36 @@ def applies_to(self, phase: str) -> bool:
     def has_feature(self, feature: str) -> bool:
         return feature in self.features
 
+    def __repr__(self) -> str:
+        feats = ",".join(sorted(self.features))
+        size_range = (
+            f"{self.sizes[0]}..{self.sizes[-1]}"
+            if len(self.sizes) > 1
+            else str(self.sizes[0])
+        )
+        return f"ModelSpec({self.name!r}, features={{{feats}}}, sizes={size_range})"
+
+    def _repr_html_(self) -> str:
+        # Rich rendering for Jupyter — a compact two-column table.
+        rows = [
+            ("name", self.name),
+            ("features", ", ".join(sorted(self.features))),
+            ("sizes", ", ".join(str(s) for s in self.sizes)),
+            ("phases", ", ".join(sorted(self.phases))),
+            ("quick_threshold", self.quick_threshold),
+            ("long_threshold", self.long_threshold),
+            ("requires", ", ".join(self.requires) or "—"),
+        ]
+        body = "".join(
+            f"<tr><th style='text-align:left;padding-right:1em'>{k}</th>"
+            f"<td>{v}</td></tr>"
+            for k, v in rows
+        )
+        return (
+            f"<b>ModelSpec</b> <code>{self.name}</code>"
+            f"<table style='font-size:90%'>{body}</table>"
+        )
+
 
 REGISTRY: dict[str, ModelSpec] = {}
 

From 751aa78e15546a32b9737126eb64c774eb6d6fff Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 12:48:21 +0200
Subject: [PATCH 07/68] benchmarks: typer-based CLI as the single entry point
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds ``python -m benchmarks <command>`` with typer subcommands:

- list / show / filter — introspect the registry
- smoke               — pytest --quick --benchmark-disable (CI)
- run [--long --phase --model --filter --json]
                      — pytest --benchmark-only with knobs
- notebook            — execute the registry-usage notebook
- memory save/compare — replaces the argparse main in memory.py

Modern typer style throughout: Annotated[...] for every parameter,
Literal[...] for the --phase choice, function docstrings for command
help. ``--help`` is auto-generated and is the source of truth — README
and the notebook just point at it instead of duplicating the menu.

CI smoke now calls ``python -m benchmarks smoke`` and
``python -m benchmarks notebook``. memory.py keeps its save/compare
functions but loses the argparse layer. typer added to the [benchmarks]
extra.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/benchmark-smoke.yml     |   7 +-
 benchmarks/README.md                      |  47 ++--
 benchmarks/__main__.py                    |   5 +
 benchmarks/cli.py                         | 319 ++++++++++++++++++++++
 benchmarks/memory.py                      |  54 +---
 benchmarks/notebooks/registry_usage.ipynb |  39 +--
 pyproject.toml                            |   1 +
 7 files changed, 366 insertions(+), 106 deletions(-)
 create mode 100644 benchmarks/__main__.py
 create mode 100644 benchmarks/cli.py

diff --git a/.github/workflows/benchmark-smoke.yml b/.github/workflows/benchmark-smoke.yml
index 8cffb97d..25fee396 100644
--- a/.github/workflows/benchmark-smoke.yml
+++ b/.github/workflows/benchmark-smoke.yml
@@ -38,13 +38,10 @@ jobs:
 
     - name: Run benchmark smoke
       run: |
-        pytest benchmarks/ --quick --benchmark-disable -q
+        python -m benchmarks smoke
 
     - name: Execute registry-usage notebook
       # Catches doc rot — the notebook is the canonical "how to use the
       # registry" walkthrough and must stay runnable end-to-end.
       run: |
-        jupyter nbconvert --to notebook --execute \
-          --ExecutePreprocessor.timeout=300 \
-          --output executed.ipynb \
-          benchmarks/notebooks/registry_usage.ipynb
+        python -m benchmarks notebook
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 39a22d46..8fdced0f 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -91,39 +91,44 @@ jupyter lab benchmarks/notebooks/registry_usage.ipynb
 ## Setup
 
 ```bash
-uv sync --extra dev --extra solvers
+uv sync --extra dev --extra benchmarks
 source .venv/bin/activate
 ```
 
 ## Run benchmarks
 
-The suite has three size tiers, each spec declaring its own `quick_threshold`
-and `long_threshold`:
-
-| Flag       | Sizes included                        | Typical use                              |
-| ---------- | ------------------------------------- | ---------------------------------------- |
-| `--quick`  | `size <= quick_threshold`             | CI smoke, fast local sanity check        |
-| _(none)_   | `size <= long_threshold`              | Default: medium-cost regression timing   |
-| `--long`   | all sizes                             | Full sweep (the slow stuff — many min)   |
+Everything is exposed through a single typer-based CLI. The CLI's
+`--help` is the source of truth — run it for the full menu:
 
 ```bash
-# Quick smoke run (small sizes only, no timing)
-pytest benchmarks/ --quick --benchmark-disable
+python -m benchmarks --help
+python -m benchmarks <command> --help
+```
+
+Pytest still works directly for power users (`pytest benchmarks/ ...`).
 
-# Default timing run (skips the super-long sizes)
-pytest benchmarks/ --benchmark-only
+### Size tiers
 
-# Full sweep — every size on every model
-pytest benchmarks/ --benchmark-only --long
+Each spec declares its own `quick_threshold` and `long_threshold`:
 
-# A single phase
-pytest benchmarks/test_build.py
+| Mode              | Sizes included            | Typical use                            |
+| ----------------- | ------------------------- | -------------------------------------- |
+| `smoke`           | `size <= quick_threshold` | CI smoke, fast local sanity check      |
+| `run`             | `size <= long_threshold`  | Default: medium-cost regression timing |
+| `run --long`      | all sizes                 | Full sweep (the slow stuff — many min) |
+
+### Quick reference
+
+```bash
+# Fastest sanity check (~18s, what CI runs)
+python -m benchmarks smoke
 
-# A single model across all phases
-pytest benchmarks/ -k basic
+# Default timing run
+python -m benchmarks run
 
-# A single (phase, model) pair
-pytest benchmarks/test_lp_write.py -k "knapsack and n=1000"
+# Save / compare memory snapshots
+python -m benchmarks memory save "$(git rev-parse --short HEAD)"
+python -m benchmarks memory compare master my-feature
 ```
 
 ## Metrics
diff --git a/benchmarks/__main__.py b/benchmarks/__main__.py
new file mode 100644
index 00000000..34a28439
--- /dev/null
+++ b/benchmarks/__main__.py
@@ -0,0 +1,5 @@
+"""Allow ``python -m benchmarks <command>``."""
+
+from benchmarks.cli import app
+
+app()
diff --git a/benchmarks/cli.py b/benchmarks/cli.py
new file mode 100644
index 00000000..255b4caa
--- /dev/null
+++ b/benchmarks/cli.py
@@ -0,0 +1,319 @@
+"""
+linopy benchmark CLI — one entry point for the suite.
+
+Run with::
+
+    python -m benchmarks <command> [options]
+
+The CLI is a thin layer over pytest for the timing / smoke commands, plus
+direct dispatch for registry introspection and memory snapshots.
+"""
+
+from __future__ import annotations
+
+import subprocess
+import sys
+import tempfile
+from pathlib import Path
+from typing import Annotated, Literal
+
+import typer
+
+from benchmarks import (
+    REGISTRY,
+    filter_by,
+    get,
+)
+from benchmarks.memory import compare as memory_compare
+from benchmarks.memory import save as memory_save
+
+app = typer.Typer(
+    help=(
+        "Linopy internal benchmark suite — a thin layer over pytest plus "
+        "registry introspection and memory snapshots."
+    ),
+    no_args_is_help=True,
+    rich_markup_mode="rich",
+)
+
+memory_app = typer.Typer(
+    help="Peak-RSS memory snapshots (pytest-memray under the hood).",
+    no_args_is_help=True,
+)
+app.add_typer(memory_app, name="memory")
+
+
+PhaseName = Literal["build", "matrices", "lp_write", "netcdf", "solver_handoff"]
+
+_PHASE_TEST_FILE: dict[PhaseName, str] = {
+    "build": "benchmarks/test_build.py",
+    "matrices": "benchmarks/test_matrices.py",
+    "lp_write": "benchmarks/test_lp_write.py",
+    "netcdf": "benchmarks/test_netcdf.py",
+    "solver_handoff": "benchmarks/test_solver_handoff.py",
+}
+
+
+# --- Introspection commands ------------------------------------------------
+
+
+@app.command("list")
+def list_(
+    details: Annotated[
+        bool,
+        typer.Option("--details", "-d", help="Show features and size range."),
+    ] = False,
+) -> None:
+    """
+    List the registered model specs.
+
+    By default emits one name per line — suitable for piping into other
+    tools. Pass ``--details`` for a small table that also shows the
+    feature tags and the size range.
+    """
+    if not details:
+        for name in sorted(REGISTRY):
+            typer.echo(name)
+        return
+
+    rows = [
+        (
+            spec.name,
+            ",".join(sorted(spec.features)),
+            f"{spec.sizes[0]}..{spec.sizes[-1]}",
+        )
+        for spec in REGISTRY.values()
+    ]
+    name_w = max(len(r[0]) for r in rows)
+    feat_w = max(len(r[1]) for r in rows)
+    typer.echo(f"{'name':<{name_w}}  {'features':<{feat_w}}  sizes")
+    typer.echo("-" * (name_w + feat_w + 20))
+    for name, feats, sizes in rows:
+        typer.echo(f"{name:<{name_w}}  {feats:<{feat_w}}  {sizes}")
+
+
+@app.command()
+def show(
+    name: Annotated[str, typer.Argument(help="Spec name (see ``list``).")],
+) -> None:
+    """
+    Print full attributes of one model spec.
+
+    Output includes sizes, feature tags, applicable phases, the quick /
+    long size thresholds, and any optional ``requires=`` dependencies the
+    spec advertises.
+    """
+    try:
+        spec = get(name)
+    except KeyError as exc:
+        typer.secho(f"unknown model: {name!r}", fg=typer.colors.RED, err=True)
+        typer.echo(f"available: {', '.join(sorted(REGISTRY))}", err=True)
+        raise typer.Exit(code=2) from exc
+    typer.echo(repr(spec))
+    typer.echo(f"  sizes:           {spec.sizes}")
+    typer.echo(f"  features:        {sorted(spec.features)}")
+    typer.echo(f"  phases:          {sorted(spec.phases)}")
+    typer.echo(f"  quick_threshold: {spec.quick_threshold}")
+    typer.echo(f"  long_threshold:  {spec.long_threshold}")
+    if spec.requires:
+        typer.echo(f"  requires:        {list(spec.requires)}")
+
+
+@app.command("filter")
+def filter_(
+    feature: Annotated[
+        str | None,
+        typer.Option(help="Feature tag, e.g. 'quadratic', 'integer', 'sos'."),
+    ] = None,
+    phase: Annotated[
+        str | None,
+        typer.Option(help="Phase tag, e.g. 'to_gurobipy', 'lp_write'."),
+    ] = None,
+) -> None:
+    """
+    Filter specs by feature or phase tag.
+
+    Both filters can be combined; the result is the intersection.
+    At least one of ``--feature`` / ``--phase`` must be supplied.
+    """
+    if feature is None and phase is None:
+        typer.secho("pass --feature and/or --phase", fg=typer.colors.RED, err=True)
+        raise typer.Exit(code=2)
+    matches = filter_by(has_feature=feature, has_phase=phase)
+    for spec in matches:
+        typer.echo(repr(spec))
+
+
+# --- Execution commands ----------------------------------------------------
+
+
+def _run_pytest(args: list[str]) -> None:
+    """Invoke pytest as a subprocess and propagate its exit code."""
+    cmd = [sys.executable, "-m", "pytest", *args]
+    typer.secho(f"$ {' '.join(cmd)}", fg=typer.colors.BRIGHT_BLACK)
+    result = subprocess.run(cmd, check=False)
+    if result.returncode != 0:
+        raise typer.Exit(code=result.returncode)
+
+
+@app.command()
+def smoke(
+    extra: Annotated[
+        list[str] | None,
+        typer.Argument(help="Extra args forwarded to pytest verbatim."),
+    ] = None,
+) -> None:
+    """
+    Quick smoke run — what CI uses on every PR.
+
+    Equivalent to ``pytest benchmarks/ --quick --benchmark-disable -q``.
+    Every model builds at one size and every phase fires once, no timings
+    recorded. Typical wall-clock: ~20s.
+    """
+    args = ["benchmarks/", "--quick", "--benchmark-disable", "-q"]
+    if extra:
+        args.extend(extra)
+    _run_pytest(args)
+
+
+@app.command()
+def run(
+    long: Annotated[
+        bool,
+        typer.Option(
+            "--long",
+            help="Include the slowest sizes (above each spec's long_threshold).",
+        ),
+    ] = False,
+    phase: Annotated[
+        PhaseName | None,
+        typer.Option(help="Restrict to one phase's test file."),
+    ] = None,
+    model: Annotated[
+        str | None,
+        typer.Option(help="Restrict to one model (passed as pytest ``-k``)."),
+    ] = None,
+    filter_expr: Annotated[
+        str | None,
+        typer.Option(
+            "--filter",
+            "-k",
+            help="Arbitrary pytest ``-k`` expression (AND-ed with ``--model``).",
+        ),
+    ] = None,
+    json_out: Annotated[
+        Path | None,
+        typer.Option("--json", help="Save pytest-benchmark JSON to this path."),
+    ] = None,
+    extra: Annotated[
+        list[str] | None,
+        typer.Argument(help="Extra args forwarded to pytest verbatim."),
+    ] = None,
+) -> None:
+    """
+    Default timing run. Records timings with pytest-benchmark.
+
+    Without ``--long``, sizes above each spec's ``long_threshold`` are
+    skipped — keeps the wall-clock around 45s instead of several minutes.
+    Add ``--long`` for the full sweep including the heaviest sizes
+    (knapsack at 1M, basic at 1600, pypsa_scigrid at >50).
+
+    To skip timing entirely (e.g. just verifying everything runs at a
+    bigger size), use ``smoke`` instead, or pass ``--benchmark-disable``
+    through ``extra``.
+    """
+    args: list[str] = []
+    args.append(_PHASE_TEST_FILE[phase] if phase is not None else "benchmarks/")
+    if long:
+        args.append("--long")
+    args.append("--benchmark-only")
+    if json_out is not None:
+        args.extend(["--benchmark-json", str(json_out)])
+
+    k_parts = [p for p in (model, filter_expr) if p]
+    if k_parts:
+        args.extend(["-k", " and ".join(k_parts)])
+
+    if extra:
+        args.extend(extra)
+    _run_pytest(args)
+
+
+@app.command()
+def notebook() -> None:
+    """
+    Execute the registry-usage notebook end-to-end.
+
+    Used by CI to catch doc rot — if any cell raises, the workflow fails.
+    The executed copy is written to a tempdir and discarded, so the
+    in-tree notebook stays output-free (nbstripout doesn't have to chase
+    a populated file).
+    """
+    nb = Path("benchmarks/notebooks/registry_usage.ipynb")
+    if not nb.exists():
+        typer.secho(f"notebook not found: {nb}", fg=typer.colors.RED, err=True)
+        raise typer.Exit(code=1)
+    with tempfile.TemporaryDirectory() as tmp:
+        cmd = [
+            sys.executable,
+            "-m",
+            "jupyter",
+            "nbconvert",
+            "--to",
+            "notebook",
+            "--execute",
+            "--ExecutePreprocessor.timeout=300",
+            "--output-dir",
+            tmp,
+            "--output",
+            "executed.ipynb",
+            str(nb),
+        ]
+        typer.secho(f"$ {' '.join(cmd)}", fg=typer.colors.BRIGHT_BLACK)
+        result = subprocess.run(cmd, check=False)
+    if result.returncode != 0:
+        raise typer.Exit(code=result.returncode)
+
+
+# --- Memory subcommands ----------------------------------------------------
+
+
+@memory_app.command("save")
+def memory_save_cmd(
+    label: Annotated[
+        str, typer.Argument(help="Label to attach to this snapshot, e.g. a git sha.")
+    ],
+    quick: Annotated[
+        bool, typer.Option("--quick", help="Use smaller problem sizes.")
+    ] = False,
+    test_path: Annotated[
+        list[str] | None,
+        typer.Option("--test-path", help="Test file(s) to run; defaults to build."),
+    ] = None,
+) -> None:
+    """
+    Run the build phase under pytest-memray and save peak RSS to JSON.
+
+    Results land in ``.benchmarks/memory/<label>.json``. Use ``compare``
+    afterwards to diff two snapshots.
+    """
+    memory_save(label, quick=quick, test_paths=test_path)
+
+
+@memory_app.command("compare")
+def memory_compare_cmd(
+    label_a: Annotated[str, typer.Argument(help="Baseline label (typically master).")],
+    label_b: Annotated[str, typer.Argument(help="Candidate label (your branch).")],
+) -> None:
+    """
+    Compare two saved memory snapshots side-by-side.
+
+    Prints a per-test table of label_a vs label_b peak RSS and a percent
+    change. Tests present in only one snapshot are shown with ``—`` for
+    the missing column.
+    """
+    memory_compare(label_a, label_b)
+
+
+if __name__ == "__main__":  # pragma: no cover
+    app()
diff --git a/benchmarks/memory.py b/benchmarks/memory.py
index 20af4b8a..b48a0be1 100644
--- a/benchmarks/memory.py
+++ b/benchmarks/memory.py
@@ -1,26 +1,17 @@
-#!/usr/bin/env python
 """
 Measure and compare peak memory using pytest-memray.
 
-Usage:
-    # Save a baseline (on master)
-    python benchmarks/memory.py save master
+This module exposes ``save(label, ...)`` and ``compare(label_a, label_b)`` as
+plain functions; user-facing invocation goes through the typer CLI::
 
-    # Save current branch
-    python benchmarks/memory.py save my-feature
+    python -m benchmarks memory save <label>
+    python -m benchmarks memory compare <a> <b>
 
-    # Compare two saved runs
-    python benchmarks/memory.py compare master my-feature
-
-    # Quick mode (smaller sizes)
-    python benchmarks/memory.py save master --quick
-
-Results are stored in .benchmarks/memory/.
+Results are stored in ``.benchmarks/memory/``.
 """
 
 from __future__ import annotations
 
-import argparse
 import json
 import platform
 import re
@@ -162,38 +153,3 @@ def compare(label_a: str, label_b: str) -> None:
         print(f"{short:<60} {a_str:>10} {b_str:>10} {change:>10}")
 
     print()
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
-    )
-    sub = parser.add_subparsers(dest="cmd", required=True)
-
-    p_save = sub.add_parser("save", help="Run benchmarks and save memory results")
-    p_save.add_argument(
-        "label", help="Label for this run (e.g. 'master', 'my-feature')"
-    )
-    p_save.add_argument(
-        "--quick", action="store_true", help="Use smaller problem sizes"
-    )
-    p_save.add_argument(
-        "--test-path",
-        nargs="+",
-        default=None,
-        help="Test file(s) to run (default: all phases)",
-    )
-
-    p_cmp = sub.add_parser("compare", help="Compare two saved runs")
-    p_cmp.add_argument("label_a", help="First run label (baseline)")
-    p_cmp.add_argument("label_b", help="Second run label")
-
-    args = parser.parse_args()
-    if args.cmd == "save":
-        save(args.label, quick=args.quick, test_paths=args.test_path)
-    elif args.cmd == "compare":
-        compare(args.label_a, args.label_b)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmarks/notebooks/registry_usage.ipynb b/benchmarks/notebooks/registry_usage.ipynb
index 17511e5d..ba8c8c46 100644
--- a/benchmarks/notebooks/registry_usage.ipynb
+++ b/benchmarks/notebooks/registry_usage.ipynb
@@ -264,28 +264,17 @@
    "source": [
     "## 8. Running the benchmark suite\n",
     "\n",
-    "Three size tiers, configured per-spec via `quick_threshold` and\n",
-    "`long_threshold`:\n",
-    "\n",
-    "| Flag        | Sizes included            | Use case                              |\n",
-    "| ----------- | ------------------------- | ------------------------------------- |\n",
-    "| `--quick`   | `size <= quick_threshold` | CI smoke (~18s, one size per model)   |\n",
-    "| _(none)_    | `size <= long_threshold`  | Local regression run (~45s)           |\n",
-    "| `--long`    | all sizes                 | Full sweep (~2 min, slow stuff)       |\n",
+    "Everything is exposed through a typer CLI; its auto-generated help is the\n",
+    "source of truth:\n",
     "\n",
     "```bash\n",
-    "# Quickest smoke\n",
-    "pytest benchmarks/ --quick --benchmark-disable\n",
-    "\n",
-    "# Default timing\n",
-    "pytest benchmarks/ --benchmark-only\n",
-    "\n",
-    "# Full sweep with the slow sizes\n",
-    "pytest benchmarks/ --benchmark-only --long\n",
+    "python -m benchmarks --help            # top-level menu\n",
+    "python -m benchmarks run --help        # per-command flags\n",
+    "```\n",
     "\n",
-    "# Pick a single (phase, model) pair\n",
-    "pytest benchmarks/test_lp_write.py -k \"knapsack and n=1000\"\n",
-    "```"
+    "Three size tiers configured per-spec via `quick_threshold` /\n",
+    "`long_threshold`: `smoke` (≤ quick), `run` (≤ long), `run --long` (all\n",
+    "sizes). Pytest still works directly for power users."
    ]
   },
   {
@@ -310,18 +299,6 @@
    "display_name": "Python 3",
    "language": "python",
    "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.11"
   }
  },
  "nbformat": 4,
diff --git a/pyproject.toml b/pyproject.toml
index 19d0abb3..88aafb3b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -87,6 +87,7 @@ benchmarks = [
     "pypsa",
     "highspy>=1.7.1",
     "pytest-memray",
+    "typer>=0.12",
 ]
 solvers = [
     "gurobipy",

From 8b124e29de065e0e93249133e9f8579240506cd8 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 14:07:17 +0200
Subject: [PATCH 08/68] benchmarks: pin typer==0.26.2, use ctx.args for pytest
 pass-through

Two cleanups after checking typer's docs:

- Pin typer to the latest release (==0.26.2) in the [benchmarks] extra,
  so the CLI's behaviour is reproducible across dev / CI / contributor
  machines.

- Switch ``smoke`` and ``run`` from the ``extra: list[str]`` argument
  to the idiomatic ``typer.Context`` + ``context_settings`` pattern
  (allow_extra_args, ignore_unknown_options). With the old style, any
  trailing ``--flag`` would be parsed as an unknown option and rejected;
  with ctx.args, ``python -m benchmarks run --long -- --tb=short -x``
  actually works.

Other patterns already match typer's recommended style: Annotated[...],
Literal for choice params, docstrings for command help, sub-apps via
add_typer.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli.py | 37 +++++++++++++++++++------------------
 pyproject.toml    |  2 +-
 2 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index 255b4caa..b38c86ab 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -156,28 +156,30 @@ def _run_pytest(args: list[str]) -> None:
         raise typer.Exit(code=result.returncode)
 
 
-@app.command()
-def smoke(
-    extra: Annotated[
-        list[str] | None,
-        typer.Argument(help="Extra args forwarded to pytest verbatim."),
-    ] = None,
-) -> None:
+@app.command(
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def smoke(ctx: typer.Context) -> None:
     """
     Quick smoke run — what CI uses on every PR.
 
     Equivalent to ``pytest benchmarks/ --quick --benchmark-disable -q``.
     Every model builds at one size and every phase fires once, no timings
     recorded. Typical wall-clock: ~20s.
+
+    Any trailing arguments are forwarded to pytest verbatim, e.g.::
+
+        python -m benchmarks smoke -k basic --tb=short
     """
-    args = ["benchmarks/", "--quick", "--benchmark-disable", "-q"]
-    if extra:
-        args.extend(extra)
+    args = ["benchmarks/", "--quick", "--benchmark-disable", "-q", *ctx.args]
     _run_pytest(args)
 
 
-@app.command()
+@app.command(
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
 def run(
+    ctx: typer.Context,
     long: Annotated[
         bool,
         typer.Option(
@@ -205,10 +207,6 @@ def run(
         Path | None,
         typer.Option("--json", help="Save pytest-benchmark JSON to this path."),
     ] = None,
-    extra: Annotated[
-        list[str] | None,
-        typer.Argument(help="Extra args forwarded to pytest verbatim."),
-    ] = None,
 ) -> None:
     """
     Default timing run. Records timings with pytest-benchmark.
@@ -218,9 +216,13 @@ def run(
     Add ``--long`` for the full sweep including the heaviest sizes
     (knapsack at 1M, basic at 1600, pypsa_scigrid at >50).
 
+    Any trailing arguments are forwarded to pytest verbatim, e.g.::
+
+        python -m benchmarks run --long -- --tb=short -x
+
     To skip timing entirely (e.g. just verifying everything runs at a
     bigger size), use ``smoke`` instead, or pass ``--benchmark-disable``
-    through ``extra``.
+    as a trailing arg.
     """
     args: list[str] = []
     args.append(_PHASE_TEST_FILE[phase] if phase is not None else "benchmarks/")
@@ -234,8 +236,7 @@ def run(
     if k_parts:
         args.extend(["-k", " and ".join(k_parts)])
 
-    if extra:
-        args.extend(extra)
+    args.extend(ctx.args)
     _run_pytest(args)
 
 
diff --git a/pyproject.toml b/pyproject.toml
index 88aafb3b..8be431d1 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -87,7 +87,7 @@ benchmarks = [
     "pypsa",
     "highspy>=1.7.1",
     "pytest-memray",
-    "typer>=0.12",
+    "typer==0.26.2",
 ]
 solvers = [
     "gurobipy",

From 86fd03656f1cfe1a2e37733ea6b7307a30637cdb Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 14:32:11 +0200
Subject: [PATCH 09/68] benchmarks: pin test infra + add transitive lockfile
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two layers of pinning for stable measurement:

- ``[benchmarks]`` extra in pyproject pins the test infra exactly
  (pytest, pytest-benchmark, pytest-memray, pypsa, highspy, netcdf4,
  nbconvert, typer). The numeric stack (numpy / scipy / pandas / xarray)
  is deliberately left unpinned here, so the extra stays loose enough
  for the sweep workflow to install varying linopy versions on top.

- ``benchmarks/requirements.lock`` is the full transitive resolution
  (numpy, scipy, pandas, xarray, plus everything else). Generated via
  ``uv pip compile --no-emit-package linopy`` so the lockfile pins the
  *environment around linopy* without pinning linopy itself — that lets
  the same lockfile work for both current-tip regression runs and
  cross-version sweeps.

README clarifies that the lockfile gives consistency over time on the
same machine, not absolute reproducibility across machines (CPU / cache
/ memory bandwidth still matter).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/README.md         |  24 ++
 benchmarks/requirements.lock | 601 +++++++++++++++++++++++++++++++++++
 pyproject.toml               |  24 +-
 3 files changed, 645 insertions(+), 4 deletions(-)
 create mode 100644 benchmarks/requirements.lock

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 8fdced0f..5047ad3d 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -90,9 +90,33 @@ jupyter lab benchmarks/notebooks/registry_usage.ipynb
 
 ## Setup
 
+Two install paths, depending on what you're doing:
+
 ```bash
+# Development / casual benchmark runs — loose constraints from pyproject
 uv sync --extra dev --extra benchmarks
 source .venv/bin/activate
+
+# Stable measurement environment — fully resolved lockfile (linopy itself
+# is excluded, so you install whichever linopy version you want on top)
+uv pip install -r benchmarks/requirements.lock
+uv pip install -e .            # current linopy
+# — or —
+uv pip install linopy==0.5.0   # for a cross-version sweep
+```
+
+The lockfile pins every transitive (numpy / scipy / pandas / xarray / ...)
+so the *environment around linopy* stays stable. Absolute numbers are still
+machine-dependent (CPU, cache, memory bandwidth) — what the lockfile gives
+you is consistency over time on the same machine, so when you run the suite
+at two points the delta reflects linopy changes, not a numpy upgrade.
+
+Regenerate after bumping the ``[benchmarks]`` pins in ``pyproject.toml``:
+
+```bash
+uv pip compile pyproject.toml --extra benchmarks --extra dev --extra solvers \
+  --no-emit-package linopy \
+  -o benchmarks/requirements.lock
 ```
 
 ## Run benchmarks
diff --git a/benchmarks/requirements.lock b/benchmarks/requirements.lock
new file mode 100644
index 00000000..da416b5d
--- /dev/null
+++ b/benchmarks/requirements.lock
@@ -0,0 +1,601 @@
+# This file was autogenerated by uv via the following command:
+#    uv pip compile pyproject.toml --extra benchmarks --extra dev --extra solvers --no-emit-package linopy -o benchmarks/requirements.lock
+annotated-doc==0.0.4
+    # via typer
+anyio==4.13.0
+    # via
+    #   httpx
+    #   jupyter-server
+appnope==0.1.4
+    # via ipykernel
+argon2-cffi==25.1.0
+    # via jupyter-server
+argon2-cffi-bindings==25.1.0
+    # via argon2-cffi
+arrow==1.4.0
+    # via isoduration
+ast-serialize==0.5.0
+    # via mypy
+asttokens==3.0.1
+    # via stack-data
+async-lru==2.3.0
+    # via jupyterlab
+attrs==26.1.0
+    # via
+    #   jsonschema
+    #   referencing
+babel==2.18.0
+    # via jupyterlab-server
+bcrypt==5.0.0
+    # via paramiko
+beautifulsoup4==4.14.3
+    # via nbconvert
+bleach==6.3.0
+    # via nbconvert
+bottleneck==1.6.0
+    # via
+    #   linopy (pyproject.toml)
+    #   linopy
+certifi==2026.5.20
+    # via
+    #   httpcore
+    #   httpx
+    #   netcdf4
+    #   pyogrio
+    #   pyproj
+    #   requests
+cffi==2.0.0
+    # via
+    #   argon2-cffi-bindings
+    #   cryptography
+    #   pynacl
+cfgv==3.5.0
+    # via pre-commit
+cftime==1.6.5
+    # via netcdf4
+charset-normalizer==3.4.7
+    # via requests
+click==8.4.1
+    # via dask
+cloudpickle==3.1.2
+    # via dask
+comm==0.2.3
+    # via
+    #   ipykernel
+    #   ipywidgets
+contourpy==1.3.3
+    # via matplotlib
+coptpy==8.0.4
+    # via linopy (pyproject.toml)
+coverage==7.14.1
+    # via pytest-cov
+cryptography==48.0.0
+    # via
+    #   paramiko
+    #   types-paramiko
+cycler==0.12.1
+    # via matplotlib
+dask==2026.3.0
+    # via
+    #   linopy (pyproject.toml)
+    #   linopy
+debugpy==1.8.20
+    # via ipykernel
+decorator==5.3.1
+    # via ipython
+defusedxml==0.7.1
+    # via nbconvert
+deprecation==2.1.0
+    # via
+    #   linopy (pyproject.toml)
+    #   linopy
+    #   pypsa
+distlib==0.4.0
+    # via virtualenv
+executing==2.2.1
+    # via stack-data
+fastjsonschema==2.21.2
+    # via nbformat
+filelock==3.29.0
+    # via
+    #   python-discovery
+    #   virtualenv
+fonttools==4.63.0
+    # via matplotlib
+fqdn==1.5.1
+    # via jsonschema
+fsspec==2026.4.0
+    # via dask
+geopandas==1.1.3
+    # via pypsa
+gurobipy==13.0.2
+    # via linopy (pyproject.toml)
+h11==0.16.0
+    # via httpcore
+highspy==1.13.1
+    # via
+    #   linopy (pyproject.toml)
+    #   pypsa
+httpcore==1.0.9
+    # via httpx
+httpx==0.28.1
+    # via jupyterlab
+identify==2.6.19
+    # via pre-commit
+idna==3.16
+    # via
+    #   anyio
+    #   httpx
+    #   jsonschema
+    #   requests
+importlib-metadata==9.0.0
+    # via dask
+iniconfig==2.3.0
+    # via pytest
+invoke==3.0.3
+    # via paramiko
+ipykernel==7.2.0
+    # via
+    #   jupyter
+    #   jupyter-console
+    #   jupyterlab
+ipython==9.13.0
+    # via
+    #   ipykernel
+    #   ipywidgets
+    #   jupyter-console
+ipython-pygments-lexers==1.1.1
+    # via ipython
+ipywidgets==8.1.8
+    # via jupyter
+isoduration==20.11.0
+    # via jsonschema
+jedi==0.20.0
+    # via ipython
+jinja2==3.1.6
+    # via
+    #   jupyter-server
+    #   jupyterlab
+    #   jupyterlab-server
+    #   memray
+    #   nbconvert
+    #   pydeck
+json5==0.14.0
+    # via jupyterlab-server
+jsonpointer==3.1.1
+    # via jsonschema
+jsonschema==4.26.0
+    # via
+    #   jupyter-events
+    #   jupyterlab-server
+    #   nbformat
+jsonschema-specifications==2025.9.1
+    # via jsonschema
+jupyter==1.1.1
+    # via linopy (pyproject.toml)
+jupyter-client==8.8.0
+    # via
+    #   ipykernel
+    #   jupyter-console
+    #   jupyter-server
+    #   nbclient
+jupyter-console==6.6.3
+    # via jupyter
+jupyter-core==5.9.1
+    # via
+    #   ipykernel
+    #   jupyter-client
+    #   jupyter-console
+    #   jupyter-server
+    #   jupyterlab
+    #   nbclient
+    #   nbconvert
+    #   nbformat
+jupyter-events==0.12.1
+    # via jupyter-server
+jupyter-lsp==2.3.1
+    # via jupyterlab
+jupyter-server==2.18.2
+    # via
+    #   jupyter-lsp
+    #   jupyterlab
+    #   jupyterlab-server
+    #   notebook
+    #   notebook-shim
+jupyter-server-terminals==0.5.4
+    # via jupyter-server
+jupyterlab==4.5.7
+    # via
+    #   jupyter
+    #   notebook
+jupyterlab-pygments==0.3.0
+    # via nbconvert
+jupyterlab-server==2.28.0
+    # via
+    #   jupyterlab
+    #   notebook
+jupyterlab-widgets==3.0.16
+    # via ipywidgets
+kiwisolver==1.5.0
+    # via matplotlib
+knitro==15.1.0
+    # via linopy (pyproject.toml)
+lark==1.3.1
+    # via rfc3987-syntax
+levenshtein==0.27.3
+    # via pypsa
+librt==0.11.0
+    # via mypy
+linkify-it-py==2.1.0
+    # via markdown-it-py
+locket==1.0.0
+    # via partd
+markdown-it-py==4.2.0
+    # via
+    #   mdit-py-plugins
+    #   rich
+    #   textual
+markupsafe==3.0.3
+    # via
+    #   jinja2
+    #   nbconvert
+matplotlib==3.10.9
+    # via
+    #   pypsa
+    #   seaborn
+matplotlib-inline==0.2.2
+    # via
+    #   ipykernel
+    #   ipython
+mdit-py-plugins==0.6.1
+    # via textual
+mdurl==0.1.2
+    # via markdown-it-py
+memray==1.19.3
+    # via pytest-memray
+mindoptpy==2.3.0
+    # via linopy (pyproject.toml)
+mistune==3.2.1
+    # via nbconvert
+mosek==11.2.0
+    # via linopy (pyproject.toml)
+mypy==2.1.0
+    # via linopy (pyproject.toml)
+mypy-extensions==1.1.0
+    # via mypy
+narwhals==2.21.2
+    # via plotly
+nbclient==0.10.4
+    # via nbconvert
+nbconvert==7.17.1
+    # via
+    #   linopy (pyproject.toml)
+    #   jupyter
+    #   jupyter-server
+nbformat==5.10.4
+    # via
+    #   jupyter-server
+    #   nbclient
+    #   nbconvert
+nest-asyncio==1.6.0
+    # via ipykernel
+netcdf4==1.7.4
+    # via
+    #   linopy (pyproject.toml)
+    #   pypsa
+networkx==3.6.1
+    # via pypsa
+nodeenv==1.10.0
+    # via pre-commit
+notebook==7.5.6
+    # via jupyter
+notebook-shim==0.2.4
+    # via
+    #   jupyterlab
+    #   notebook
+numexpr==2.14.1
+    # via
+    #   linopy (pyproject.toml)
+    #   linopy
+numpy==2.4.6
+    # via
+    #   linopy (pyproject.toml)
+    #   bottleneck
+    #   cftime
+    #   contourpy
+    #   geopandas
+    #   highspy
+    #   linopy
+    #   matplotlib
+    #   mindoptpy
+    #   mosek
+    #   netcdf4
+    #   numexpr
+    #   pandas
+    #   pydeck
+    #   pyogrio
+    #   pypsa
+    #   scipy
+    #   seaborn
+    #   shapely
+    #   xarray
+overrides==7.7.0
+    # via jupyter-server
+packaging==26.2
+    # via
+    #   linopy (pyproject.toml)
+    #   dask
+    #   deprecation
+    #   geopandas
+    #   ipykernel
+    #   jupyter-events
+    #   jupyter-server
+    #   jupyterlab
+    #   jupyterlab-server
+    #   linopy
+    #   matplotlib
+    #   nbconvert
+    #   plotly
+    #   pyogrio
+    #   pytest
+    #   xarray
+pandas==3.0.3
+    # via
+    #   geopandas
+    #   pypsa
+    #   seaborn
+    #   xarray
+pandocfilters==1.5.1
+    # via nbconvert
+paramiko==5.0.0
+    # via linopy (pyproject.toml)
+parso==0.8.7
+    # via jedi
+partd==1.4.2
+    # via dask
+pathspec==1.1.1
+    # via mypy
+pexpect==4.9.0
+    # via ipython
+pillow==12.2.0
+    # via matplotlib
+platformdirs==4.10.0
+    # via
+    #   jupyter-core
+    #   pypsa
+    #   python-discovery
+    #   textual
+    #   virtualenv
+plotly==6.7.0
+    # via pypsa
+pluggy==1.6.0
+    # via
+    #   pytest
+    #   pytest-cov
+polars==1.41.1
+    # via
+    #   linopy (pyproject.toml)
+    #   linopy
+polars-runtime-32==1.41.1
+    # via polars
+pre-commit==4.6.0
+    # via linopy (pyproject.toml)
+prometheus-client==0.25.0
+    # via jupyter-server
+prompt-toolkit==3.0.52
+    # via
+    #   ipython
+    #   jupyter-console
+psutil==7.2.2
+    # via
+    #   ipykernel
+    #   ipython
+ptyprocess==0.7.0
+    # via
+    #   pexpect
+    #   terminado
+pure-eval==0.2.3
+    # via stack-data
+py-cpuinfo==9.0.0
+    # via pytest-benchmark
+pycparser==3.0
+    # via cffi
+pydeck==0.9.2
+    # via pypsa
+pygments==2.20.0
+    # via
+    #   ipython
+    #   ipython-pygments-lexers
+    #   jupyter-console
+    #   nbconvert
+    #   pytest
+    #   rich
+    #   textual
+pynacl==1.6.2
+    # via paramiko
+pyogrio==0.12.1
+    # via geopandas
+pyparsing==3.3.2
+    # via matplotlib
+pyproj==3.7.2
+    # via geopandas
+pypsa==1.2.2
+    # via linopy (pyproject.toml)
+pytest==9.0.3
+    # via
+    #   linopy (pyproject.toml)
+    #   pytest-benchmark
+    #   pytest-cov
+    #   pytest-memray
+pytest-benchmark==5.2.3
+    # via linopy (pyproject.toml)
+pytest-cov==7.1.0
+    # via linopy (pyproject.toml)
+pytest-memray==1.8.0
+    # via linopy (pyproject.toml)
+python-dateutil==2.9.0.post0
+    # via
+    #   arrow
+    #   jupyter-client
+    #   matplotlib
+    #   pandas
+python-discovery==1.4.0
+    # via virtualenv
+python-json-logger==4.1.0
+    # via jupyter-events
+pyyaml==6.0.3
+    # via
+    #   dask
+    #   jupyter-events
+    #   pre-commit
+pyzmq==27.1.0
+    # via
+    #   ipykernel
+    #   jupyter-client
+    #   jupyter-console
+    #   jupyter-server
+rapidfuzz==3.14.5
+    # via levenshtein
+referencing==0.37.0
+    # via
+    #   jsonschema
+    #   jsonschema-specifications
+    #   jupyter-events
+requests==2.34.2
+    # via jupyterlab-server
+rfc3339-validator==0.1.4
+    # via
+    #   jsonschema
+    #   jupyter-events
+rfc3986-validator==0.1.1
+    # via
+    #   jsonschema
+    #   jupyter-events
+rfc3987-syntax==1.1.0
+    # via jsonschema
+rich==15.0.0
+    # via
+    #   memray
+    #   textual
+    #   typer
+rpds-py==2026.5.1
+    # via
+    #   jsonschema
+    #   referencing
+scipy==1.17.1
+    # via
+    #   linopy (pyproject.toml)
+    #   linopy
+    #   mindoptpy
+    #   pypsa
+seaborn==0.13.2
+    # via pypsa
+send2trash==2.1.0
+    # via jupyter-server
+setuptools==82.0.1
+    # via jupyterlab
+shapely==2.1.2
+    # via
+    #   geopandas
+    #   pypsa
+shellingham==1.5.4
+    # via typer
+six==1.17.0
+    # via
+    #   python-dateutil
+    #   rfc3339-validator
+soupsieve==2.8.4
+    # via beautifulsoup4
+stack-data==0.6.3
+    # via ipython
+terminado==0.18.1
+    # via
+    #   jupyter-server
+    #   jupyter-server-terminals
+textual==8.2.7
+    # via memray
+tinycss2==1.4.0
+    # via bleach
+toolz==1.1.0
+    # via
+    #   linopy (pyproject.toml)
+    #   dask
+    #   linopy
+    #   partd
+tornado==6.5.6
+    # via
+    #   ipykernel
+    #   jupyter-client
+    #   jupyter-server
+    #   jupyterlab
+    #   notebook
+    #   terminado
+tqdm==4.67.3
+    # via
+    #   linopy (pyproject.toml)
+    #   linopy
+traitlets==5.15.0
+    # via
+    #   ipykernel
+    #   ipython
+    #   ipywidgets
+    #   jupyter-client
+    #   jupyter-console
+    #   jupyter-core
+    #   jupyter-events
+    #   jupyter-server
+    #   jupyterlab
+    #   matplotlib-inline
+    #   nbclient
+    #   nbconvert
+    #   nbformat
+typer==0.26.2
+    # via linopy (pyproject.toml)
+types-paramiko==4.0.0.20260518
+    # via linopy (pyproject.toml)
+types-requests==2.33.0.20260518
+    # via linopy (pyproject.toml)
+typing-extensions==4.15.0
+    # via
+    #   anyio
+    #   beautifulsoup4
+    #   ipython
+    #   mypy
+    #   referencing
+    #   textual
+tzdata==2026.2
+    # via arrow
+uc-micro-py==2.0.0
+    # via linkify-it-py
+uri-template==1.3.0
+    # via jsonschema
+urllib3==2.7.0
+    # via
+    #   requests
+    #   types-requests
+validators==0.35.0
+    # via pypsa
+virtualenv==21.4.1
+    # via pre-commit
+wcwidth==0.7.0
+    # via prompt-toolkit
+webcolors==25.10.0
+    # via jsonschema
+webencodings==0.5.1
+    # via
+    #   bleach
+    #   tinycss2
+websocket-client==1.9.0
+    # via jupyter-server
+widgetsnbextension==4.0.15
+    # via ipywidgets
+xarray==2026.4.0
+    # via
+    #   linopy (pyproject.toml)
+    #   linopy
+    #   pypsa
+zipp==4.1.0
+    # via importlib-metadata
+
+# The following packages were excluded from the output:
+# linopy
diff --git a/pyproject.toml b/pyproject.toml
index 8be431d1..cae6831c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -82,11 +82,27 @@ dev = [
     "highspy",
     "jupyter",
 ]
+# Test infrastructure pinned exactly so the measurement environment stays
+# stable over time on the same machine — deltas between two runs then
+# reflect linopy changes, not a pytest or pandas upgrade. Absolute numbers
+# are still machine-dependent (CPU / cache / memory bandwidth).
+#
+# Not pinned here: numpy / scipy / pandas / xarray. They also affect
+# measurements, but the full transitive set lives in
+# ``benchmarks/requirements.lock`` (regen via ``uv pip compile``). The
+# lockfile excludes linopy itself so ``sweep`` can install any linopy
+# version on top of a stable environment.
+#
+# ``highspy`` follows the project-wide ``!=1.14.0`` exclusion (see the
+# ``solvers`` extra).
 benchmarks = [
-    "pytest-benchmark",
-    "pypsa",
-    "highspy>=1.7.1",
-    "pytest-memray",
+    "pytest==9.0.3",
+    "pytest-benchmark==5.2.3",
+    "pytest-memray==1.8.0",
+    "pypsa==1.2.2",
+    "highspy==1.13.1",
+    "netcdf4==1.7.4",
+    "nbconvert==7.17.1",
     "typer==0.26.2",
 ]
 solvers = [

From 9be18e1ad1874627732baf7377a71edbd6282377 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 14:43:54 +0200
Subject: [PATCH 10/68] benchmarks: add ``sweep`` subcommand for cross-version
 perf runs

``python -m benchmarks sweep 0.5.0 0.6.0 0.7.0`` builds a fresh venv
per version with uv, installs the benchmark infra (lockfile by default,
or the [benchmarks] pinned subset with --no-use-lock) plus the target
linopy in a single resolution pass, and runs the suite. Snapshots land
in ``<output-dir>/linopy-<version>.json``.

Useful for bootstrapping a perf history against published linopy
releases. The current benchmark code runs against each linopy version
(constant measurement layer); the ``_API_AVAILABLE`` gates on sos /
piecewise specs make older linopy versions skip those phases gracefully.

Verified locally: ``sweep 0.7.0 --quick --no-use-lock`` runs end-to-end
in ~2 min (uv installs 57 packages in 200ms; the rest is the benchmark
run). The ``_linopy_install_spec`` helper expands plain releases (0.4.0)
to ``linopy==0.4.0`` and passes pip specs (git+https://...) through to
``uv pip install`` unchanged.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/README.md |   5 ++
 benchmarks/cli.py    | 179 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 184 insertions(+)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 5047ad3d..acf53316 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -153,6 +153,11 @@ python -m benchmarks run
 # Save / compare memory snapshots
 python -m benchmarks memory save "$(git rev-parse --short HEAD)"
 python -m benchmarks memory compare master my-feature
+
+# Sweep across several linopy versions (bootstraps perf history).
+# Builds a fresh uv venv per version, installs lockfile (or pinned
+# subset) + that linopy, runs the suite, saves JSON. Uv keeps it fast.
+python -m benchmarks sweep 0.5.0 0.6.0 0.7.0
 ```
 
 ## Metrics
diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index b38c86ab..85506c32 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -11,6 +11,9 @@
 
 from __future__ import annotations
 
+import os
+import re
+import shutil
 import subprocess
 import sys
 import tempfile
@@ -276,6 +279,182 @@ def notebook() -> None:
         raise typer.Exit(code=result.returncode)
 
 
+# --- Sweep across linopy versions ------------------------------------------
+
+
+_PLAIN_VERSION_RE = re.compile(r"^\d+(\.\d+)*([a-z]+\d*)?$")
+
+
+def _linopy_install_spec(version: str) -> str:
+    """Turn ``0.4.0`` → ``linopy==0.4.0``, leave anything URL-y untouched."""
+    if _PLAIN_VERSION_RE.match(version):
+        return f"linopy=={version}"
+    return version
+
+
+def _venv_python(venv: Path) -> Path:
+    return (
+        venv / "Scripts" / "python.exe" if os.name == "nt" else venv / "bin" / "python"
+    )
+
+
+@app.command()
+def sweep(
+    versions: Annotated[
+        list[str],
+        typer.Argument(help="linopy versions, e.g. 0.4.0 0.5.0 (or any pip spec)."),
+    ],
+    output_dir: Annotated[
+        Path,
+        typer.Option("--output-dir", "-o", help="Where to save snapshot JSONs."),
+    ] = Path(".benchmarks/sweep"),
+    long: Annotated[
+        bool, typer.Option("--long", help="Include the slowest sizes.")
+    ] = False,
+    quick: Annotated[
+        bool,
+        typer.Option("--quick", help="Use only the smallest sizes (faster sweep)."),
+    ] = False,
+    use_lock: Annotated[
+        bool,
+        typer.Option(
+            "--use-lock/--no-use-lock",
+            help="Install ``benchmarks/requirements.lock`` in each venv.",
+        ),
+    ] = True,
+) -> None:
+    """
+    Run the benchmark suite against several linopy versions.
+
+    Uses ``uv`` to build a fresh venv per version (near-instant) and to
+    install the benchmark infra + target linopy in a single resolution
+    pass. The pytest-benchmark JSON snapshot lands in
+    ``<output-dir>/linopy-<version>.json``.
+
+    Versions are accepted in two forms:
+
+    - Plain releases: ``0.4.0``, ``0.5.0a1`` — expanded to ``linopy==X``.
+    - Pip specs verbatim: ``git+https://github.com/PyPSA/linopy.git@<sha>``
+      or ``linopy @ file:///path/to/checkout``.
+
+    The current (repo-tip) benchmark code runs against each linopy
+    version, so the measurement layer is constant. ``_API_AVAILABLE``
+    gates in the ``sos`` / ``piecewise`` specs let older linopy versions
+    skip those phases gracefully.
+
+    Wall-clock: roughly 1-2 minutes per version (venv + install +
+    benchmarks). uv's wheel cache makes repeated runs much faster.
+    """
+    if quick and long:
+        typer.secho(
+            "--quick and --long are mutually exclusive",
+            fg=typer.colors.RED,
+            err=True,
+        )
+        raise typer.Exit(code=2)
+
+    if shutil.which("uv") is None:
+        typer.secho(
+            "uv not found on PATH — install via https://docs.astral.sh/uv/",
+            fg=typer.colors.RED,
+            err=True,
+        )
+        raise typer.Exit(code=2)
+
+    repo_root = Path.cwd()
+    lockfile = repo_root / "benchmarks" / "requirements.lock"
+    if use_lock and not lockfile.exists():
+        typer.secho(
+            f"--use-lock set but {lockfile} is missing — "
+            "regenerate it via ``uv pip compile`` or pass ``--no-use-lock``.",
+            fg=typer.colors.RED,
+            err=True,
+        )
+        raise typer.Exit(code=2)
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    failed: list[str] = []
+    for version in versions:
+        typer.secho(f"\n=== linopy {version} ===", fg=typer.colors.CYAN, bold=True)
+        with tempfile.TemporaryDirectory(prefix="linopy-bench-") as tmp:
+            venv = Path(tmp) / "venv"
+
+            # 1. uv venv — same interpreter that's driving the CLI.
+            r = subprocess.run(
+                ["uv", "venv", "--python", sys.executable, str(venv)],
+                check=False,
+            )
+            if r.returncode != 0:
+                typer.secho(
+                    f"venv creation failed: {version}",
+                    fg=typer.colors.RED,
+                    err=True,
+                )
+                failed.append(version)
+                continue
+
+            vpy = _venv_python(venv)
+            spec = _linopy_install_spec(version)
+
+            # 2. Single install pass: infra (lockfile or pinned subset) + linopy.
+            install_args = ["uv", "pip", "install", "--python", str(vpy)]
+            if use_lock:
+                install_args += ["-r", str(lockfile)]
+            else:
+                install_args += [
+                    "pytest==9.0.3",
+                    "pytest-benchmark==5.2.3",
+                    "pypsa==1.2.2",
+                    "highspy==1.13.1",
+                    "netcdf4==1.7.4",
+                ]
+            install_args.append(spec)
+            r = subprocess.run(install_args, check=False)
+            if r.returncode != 0:
+                typer.secho(f"install failed: {version}", fg=typer.colors.RED, err=True)
+                failed.append(version)
+                continue
+
+            # 3. Run the benchmarks. PYTHONPATH makes ``import benchmarks``
+            #    resolve against the local checkout — the venv only needs to
+            #    provide linopy + the test infra.
+            snapshot = (output_dir / f"linopy-{version}.json").resolve()
+            env = os.environ.copy()
+            env["PYTHONPATH"] = str(repo_root)
+
+            pytest_cmd = [
+                str(vpy),
+                "-m",
+                "pytest",
+                "benchmarks/",
+                "--benchmark-only",
+                "--benchmark-json",
+                str(snapshot),
+            ]
+            if quick:
+                pytest_cmd.append("--quick")
+            elif long:
+                pytest_cmd.append("--long")
+
+            typer.secho(f"$ {' '.join(pytest_cmd)}", fg=typer.colors.BRIGHT_BLACK)
+            subprocess.run(pytest_cmd, env=env, check=False)
+
+            if snapshot.exists():
+                typer.secho(f"saved {snapshot}", fg=typer.colors.GREEN)
+            else:
+                typer.secho(
+                    f"no snapshot produced for {version}",
+                    fg=typer.colors.RED,
+                    err=True,
+                )
+                failed.append(version)
+
+    if failed:
+        typer.secho(f"\nFailed versions: {failed}", fg=typer.colors.RED, err=True)
+        raise typer.Exit(code=1)
+
+
 # --- Memory subcommands ----------------------------------------------------
 
 

From 51f418dcb04ec23954ef6ca068e9760303430c0e Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 14:45:35 +0200
Subject: [PATCH 11/68] benchmarks: collapse README to a pointer, kill
 duplication
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The README previously duplicated content from three sources:
- the notebook (models table, registry-usage code blocks)
- ``--help`` (quick-reference command list)
- a stale memory.py invocation (since replaced by ``memory save/compare``)

After the consolidation each surface has a clear single job:

- README: one-paragraph summary of what the suite measures, setup
  (uv sync / lockfile), size-tier table (architectural), pointers to the
  notebook + ``--help``, metrics blurb.
- ``notebooks/registry_usage.ipynb``: the walkthrough — registry import,
  lookup / iterate / filter, parametrize your own pytest, profiling.
- ``python -m benchmarks --help``: command reference, autogenerated by
  typer from docstrings / Annotated[..., Option(...)] declarations.

Drops ~140 lines from the README; nothing actually disappears — it just
lives in the one place that owns it.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/README.md | 179 +++++++++----------------------------------
 1 file changed, 38 insertions(+), 141 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index acf53316..8b3db2a8 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -1,117 +1,35 @@
 # Internal Performance Benchmarks
 
-This suite benchmarks the **linopy part end-to-end** across three phases:
-
-1. **Build** — construct the linopy model.
-2. **Solver handoff** — convert a built model into solver-consumable form
-   (in-memory matrices, LP file, native solver instance, netCDF).
-3. **Persistence round-trip** — `to_netcdf` / `read_netcdf`.
+End-to-end timing and memory for the linopy half of an optimization run:
+build a model, hand it off to a solver, round-trip via netCDF. Solver
+algorithm runtime is intentionally out of scope.
 
 > **Note:** `benchmark/` (singular) is for external framework comparisons.
-> `benchmarks/` is only for internal linopy performance tracking.
-
-## What is covered
-
-| Phase                 | Test file                       | Notes                                              |
-| --------------------- | ------------------------------- | -------------------------------------------------- |
-| Build                 | `test_build.py`                 | variables / expressions / constraints / objective  |
-| Matrices              | `test_matrices.py`              | `A`, `b`, `c`, bounds, labels, `Q` for QP          |
-| LP write              | `test_lp_write.py`              | `model.to_file(...)`                               |
-| netCDF write/read     | `test_netcdf.py`                | `to_netcdf` / `read_netcdf`                        |
-| Solver handoff        | `test_solver_handoff.py`        | `lp.io.to_highspy / to_gurobipy / to_mosek / to_xpress` — skipped per-solver when not installed |
-| PyPSA carbon handoff  | `test_pypsa_carbon_management.py` | `set_names=True/False`, `freeze_constraints=True/False` |
-
-What we *don't* cover: solver algorithm performance (`Solver.solve()`
-runtime), cross-solver ranking, nonlinear / general-quadratic constraint
-suites.
-
-## Models
-
-The suite is driven by a **reusable model registry**. Each model file under
-`benchmarks/models/` exposes a `build_<name>(size) -> linopy.Model` callable
-and a module-level `SPEC` describing features, applicable phases, default
-sizes, and optional dependencies.
-
-| Name                    | Features            | Typical use                                         |
-| ----------------------- | ------------------- | --------------------------------------------------- |
-| `basic`                 | continuous          | dense LP scaling                                    |
-| `knapsack`              | binary              | MIP binary-section path                             |
-| `expression_arithmetic` | continuous          | stresses `+`, `*`, `sum`, broadcasting              |
-| `sparse_network`        | continuous          | mismatched-coordinate / sparse coefficient handling |
-| `milp`                  | integer             | general-integer (non-binary) MIP path               |
-| `qp`                    | quadratic           | continuous QP / `matrices.Q` path                   |
-| `sos` *(linopy ≥ recent)* | sos              | `Model.add_sos_constraints` + LP SOS section        |
-| `piecewise` *(linopy ≥ recent)* | piecewise  | `Model.add_piecewise_formulation`                    |
-| `masked`                | masked              | `mask=` on `add_variables` / `add_constraints`      |
-| `pypsa_scigrid` *(optional)* | continuous     | real PyPSA model                                    |
-
-The `sos` and `piecewise` specs are skipped automatically if the underlying
-APIs aren't present in the installed linopy.
-
-### Reusing the registry outside the suite
-
-The registry is a plain importable object — use it from any test, script,
-or profiling session:
-
-```python
-from benchmarks import REGISTRY
-
-# Look up by name
-model = REGISTRY["basic"].build(100)
-
-# Iterate (e.g. parametrize your own test)
-for spec in REGISTRY.values():
-    m = spec.build(spec.sizes[0])
-    ...
-
-# Filter by feature or phase
-from benchmarks import filter_by, QUADRATIC, TO_GUROBIPY
-
-qp_specs = filter_by(has_feature=QUADRATIC)
-gurobi_specs = filter_by(has_phase=TO_GUROBIPY)
-```
-
-To add a new model, drop a file under `benchmarks/models/`, expose a
-`build_<name>(size)`, and call `register(ModelSpec(...))`. Import it from
-`benchmarks/models/__init__.py` so the registration fires.
-
-### Worked walkthrough
-
-[`notebooks/registry_usage.ipynb`](notebooks/registry_usage.ipynb) is the
-canonical walkthrough — it runs through every pattern above end-to-end.
-GitHub renders it inline. CI executes it on every PR via `jupyter nbconvert
---execute`, so the examples can't silently rot.
-
-Open it locally with JupyterLab launched from the repo root:
-
-```bash
-jupyter lab benchmarks/notebooks/registry_usage.ipynb
-```
+> `benchmarks/` (plural) is only for internal linopy performance tracking.
 
 ## Setup
 
-Two install paths, depending on what you're doing:
+Two install paths:
 
 ```bash
 # Development / casual benchmark runs — loose constraints from pyproject
 uv sync --extra dev --extra benchmarks
 source .venv/bin/activate
 
-# Stable measurement environment — fully resolved lockfile (linopy itself
-# is excluded, so you install whichever linopy version you want on top)
+# Stable measurement environment — fully resolved lockfile
 uv pip install -r benchmarks/requirements.lock
 uv pip install -e .            # current linopy
 # — or —
-uv pip install linopy==0.5.0   # for a cross-version sweep
+uv pip install linopy==0.5.0   # cross-version sweep target
 ```
 
-The lockfile pins every transitive (numpy / scipy / pandas / xarray / ...)
-so the *environment around linopy* stays stable. Absolute numbers are still
-machine-dependent (CPU, cache, memory bandwidth) — what the lockfile gives
-you is consistency over time on the same machine, so when you run the suite
-at two points the delta reflects linopy changes, not a numpy upgrade.
+The lockfile excludes linopy itself so the same lockfile works for both
+current-tip regression runs and `sweep` against older releases. Absolute
+benchmark numbers are still machine-dependent (CPU, cache, memory
+bandwidth) — what the lockfile gives you is consistency over time on the
+same machine, so deltas reflect linopy changes, not a numpy upgrade.
 
-Regenerate after bumping the ``[benchmarks]`` pins in ``pyproject.toml``:
+Regenerate after bumping the `[benchmarks]` pins in `pyproject.toml`:
 
 ```bash
 uv pip compile pyproject.toml --extra benchmarks --extra dev --extra solvers \
@@ -119,66 +37,45 @@ uv pip compile pyproject.toml --extra benchmarks --extra dev --extra solvers \
   -o benchmarks/requirements.lock
 ```
 
-## Run benchmarks
+## Run
 
-Everything is exposed through a single typer-based CLI. The CLI's
-`--help` is the source of truth — run it for the full menu:
+Everything is exposed through one typer CLI. Its `--help` is the source of
+truth — no command menu duplicated here:
 
 ```bash
 python -m benchmarks --help
 python -m benchmarks <command> --help
 ```
 
-Pytest still works directly for power users (`pytest benchmarks/ ...`).
-
-### Size tiers
+Three size tiers, configured per spec via `quick_threshold` / `long_threshold`:
 
-Each spec declares its own `quick_threshold` and `long_threshold`:
+| Mode         | Sizes included            | Typical use                              |
+| ------------ | ------------------------- | ---------------------------------------- |
+| `smoke`      | `size <= quick_threshold` | CI smoke (~18 s), fast local sanity      |
+| `run`        | `size <= long_threshold`  | Default regression timing (~45 s)        |
+| `run --long` | all sizes                 | Every size — the slow stuff (~2 min)     |
 
-| Mode              | Sizes included            | Typical use                            |
-| ----------------- | ------------------------- | -------------------------------------- |
-| `smoke`           | `size <= quick_threshold` | CI smoke, fast local sanity check      |
-| `run`             | `size <= long_threshold`  | Default: medium-cost regression timing |
-| `run --long`      | all sizes                 | Full sweep (the slow stuff — many min) |
-
-### Quick reference
+Pytest still works directly for power users (`pytest benchmarks/ ...`).
 
-```bash
-# Fastest sanity check (~18s, what CI runs)
-python -m benchmarks smoke
+## Walkthrough
 
-# Default timing run
-python -m benchmarks run
+[`notebooks/registry_usage.ipynb`](notebooks/registry_usage.ipynb) is the
+canonical walkthrough: import the registry, look up / iterate / filter
+specs, build a model, parametrize your own pytest test off the registry,
+spot-profile memory. GitHub renders it inline; CI executes it on every PR
+(`python -m benchmarks notebook`) so the examples can't silently rot.
 
-# Save / compare memory snapshots
-python -m benchmarks memory save "$(git rev-parse --short HEAD)"
-python -m benchmarks memory compare master my-feature
+Open it locally with JupyterLab launched from the repo root:
 
-# Sweep across several linopy versions (bootstraps perf history).
-# Builds a fresh uv venv per version, installs lockfile (or pinned
-# subset) + that linopy, runs the suite, saves JSON. Uv keeps it fast.
-python -m benchmarks sweep 0.5.0 0.6.0 0.7.0
+```bash
+jupyter lab benchmarks/notebooks/registry_usage.ipynb
 ```
 
 ## Metrics
 
-- **Time** — pytest-benchmark median runtime (IQR for stability).
-- **Memory** — pytest-memray peak RSS (MiB), tracked for Build only because
-  later phases include build allocations and make attribution unreliable.
-
-## Results and history
-
-Raw outputs live in `.benchmarks/` (gitignored). Store comparison snapshots
-as JSON and compare to a rolling `master` baseline:
-
-```bash
-# Timing snapshot
-pytest benchmarks/ \
-  --benchmark-json ".benchmarks/timing-$(date +%Y%m%d-%H%M%S).json"
-
-# Memory snapshot (Build by default)
-python benchmarks/memory.py save "$(git rev-parse --short HEAD)"
-
-# Compare memory snapshots
-python benchmarks/memory.py compare <baseline-label> <candidate-label>
-```
+- **Time** — pytest-benchmark median runtime (IQR for stability). Snapshots
+  are JSON; pass `--json <path>` to `run` to save one, then diff against a
+  baseline.
+- **Memory** — pytest-memray peak RSS (MiB), tracked for the build phase
+  only because later phases include build allocations and attribution
+  becomes unreliable. Use `memory save` / `memory compare`.

From c0f3fee23945e54d231de726d3273703059993dc Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 14:56:50 +0200
Subject: [PATCH 12/68] benchmarks: make pypsa optional, expand notebook into
 proper guide
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

pypsa removed from the [benchmarks] pinned set, from the sweep
``--no-use-lock`` install list, and from the lockfile. The
``test_pypsa_carbon_management.py`` module uses ``pytest.importorskip``
so collection no longer fails without pypsa; ``pypsa_scigrid`` already
had ``requires=("pypsa",)`` so its phase tests skip gracefully.
Install pypsa separately when you want those benchmarks.

Notebook (registry_usage.ipynb) rewritten as a proper operator guide:

- Architecture overview + per-phase measurement table up front.
- Registry walkthrough (lookup / iterate / filter) kept as the spine.
- Reuse patterns (parametrize-your-own-pytest, tracemalloc spot check).
- ``Running`` section now embeds ``--help`` output live via a
  ``show_help()`` helper that shells out to ``python -m benchmarks ...
  --help``. The doc stays in sync with the typer implementation
  automatically — change a flag in cli.py, re-run the notebook,
  documentation updates.
- New sections cover timing snapshots, memory snapshots, the
  cross-version sweep, and lockfile regeneration.

README gains an explicit "pypsa is optional" note in setup.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/README.md                       |   8 +
 benchmarks/cli.py                          |   1 -
 benchmarks/notebooks/registry_usage.ipynb  | 386 +++++++++++++++++----
 benchmarks/requirements.lock               | 113 +-----
 benchmarks/test_pypsa_carbon_management.py |   5 +-
 pyproject.toml                             |   1 -
 6 files changed, 337 insertions(+), 177 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index 8b3db2a8..df649ecc 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -23,6 +23,14 @@ uv pip install -e .            # current linopy
 uv pip install linopy==0.5.0   # cross-version sweep target
 ```
 
+`pypsa` is an **optional** benchmark dep — the `pypsa_scigrid` registry
+spec and `test_pypsa_carbon_management.py` skip gracefully without it.
+Install separately when you want them:
+
+```bash
+uv pip install pypsa
+```
+
 The lockfile excludes linopy itself so the same lockfile works for both
 current-tip regression runs and `sweep` against older releases. Absolute
 benchmark numbers are still machine-dependent (CPU, cache, memory
diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index 85506c32..8ef6349d 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -405,7 +405,6 @@ def sweep(
                 install_args += [
                     "pytest==9.0.3",
                     "pytest-benchmark==5.2.3",
-                    "pypsa==1.2.2",
                     "highspy==1.13.1",
                     "netcdf4==1.7.4",
                 ]
diff --git a/benchmarks/notebooks/registry_usage.ipynb b/benchmarks/notebooks/registry_usage.ipynb
index ba8c8c46..79801dff 100644
--- a/benchmarks/notebooks/registry_usage.ipynb
+++ b/benchmarks/notebooks/registry_usage.ipynb
@@ -5,18 +5,22 @@
    "id": "0",
    "metadata": {},
    "source": [
-    "# Benchmark model registry — usage guide\n",
+    "# Linopy benchmark suite — guide\n",
     "\n",
-    "This notebook is the canonical walkthrough for the benchmark\n",
-    "**model registry**. CI executes it end-to-end on every PR\n",
-    "(`jupyter nbconvert --execute`), so the examples can't silently rot.\n",
+    "This notebook is the canonical documentation for the internal benchmark\n",
+    "suite under `benchmarks/`. CI executes it end-to-end on every PR via\n",
+    "`python -m benchmarks notebook`, so anything written here stays runnable.\n",
     "\n",
-    "Launch jupyter from the repo root so `from benchmarks import ...` resolves\n",
-    "(same convention as `examples/*.ipynb`).\n",
+    "Two complementary surfaces:\n",
     "\n",
-    "The registry lives in `benchmarks/registry.py`. Each model file under\n",
-    "`benchmarks/models/` self-registers a `ModelSpec` on import, so just touching\n",
-    "the `benchmarks` package populates `REGISTRY`."
+    "- **README** (`benchmarks/README.md`) — install / size-tier reference / metrics.\n",
+    "- **CLI help** — `python -m benchmarks --help` is the source of truth for\n",
+    "  command flags. This notebook embeds that output live (see the *Running*\n",
+    "  section) so it stays in sync with the actual implementation.\n",
+    "\n",
+    "What this notebook covers: the suite's architecture (registry × phases),\n",
+    "how to use the registry from your own code, how to run / interpret\n",
+    "benchmarks, and how to extend the suite."
    ]
   },
   {
@@ -24,23 +28,75 @@
    "id": "1",
    "metadata": {},
    "source": [
-    "## 1. Import the registry\n",
+    "## Architecture\n",
+    "\n",
+    "Two halves:\n",
+    "\n",
+    "1. **The model registry** (`benchmarks/registry.py`) — every benchmark\n",
+    "   model is a `ModelSpec` declaring how to build it, its sizes, the\n",
+    "   features it exercises, the phases it can drive, and the size tiers.\n",
+    "   Models self-register at import.\n",
+    "\n",
+    "2. **The phase tests** (`test_build.py`, `test_matrices.py`, …) — one\n",
+    "   pytest file per phase. Each iterates the registry via\n",
+    "   `iter_params(phase)` so adding a model to the registry automatically\n",
+    "   extends every applicable phase test.\n",
+    "\n",
+    "A typer CLI (`benchmarks/cli.py`) wraps pytest plus introspection and\n",
+    "memory snapshots."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2",
+   "metadata": {},
+   "source": [
+    "## What each phase measures\n",
+    "\n",
+    "| Phase             | Test file                         | Measures                                                     |\n",
+    "| ----------------- | --------------------------------- | ------------------------------------------------------------ |\n",
+    "| `build`           | `test_build.py`                   | constructing variables / expressions / constraints / objective |\n",
+    "| `matrices`        | `test_matrices.py`                | `A`, `b`, `c`, bounds, labels, `Q` for QP                    |\n",
+    "| `lp_write`        | `test_lp_write.py`                | `model.to_file(...)` — LP / MPS serialization                |\n",
+    "| `netcdf`          | `test_netcdf.py`                  | `to_netcdf` and `read_netcdf` round-trip                     |\n",
+    "| `solver_handoff`  | `test_solver_handoff.py`          | `lp.io.to_highspy` / `to_gurobipy` / `to_mosek` / `to_xpress` |\n",
+    "| (PyPSA scenario)  | `test_pypsa_carbon_management.py` | `set_names` / `freeze_constraints` variants — *optional, needs `pypsa`* |\n",
+    "\n",
+    "Out of scope: solver algorithm runtime (i.e. `Solver.solve()`),\n",
+    "cross-solver ranking, nonlinear suites.\n",
+    "\n",
+    "`pypsa` is an optional benchmark dependency — install it (`pip install pypsa`)\n",
+    "if you want the `pypsa_scigrid` registry spec and the carbon-management\n",
+    "scenario to run; otherwise both skip gracefully."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3",
+   "metadata": {},
+   "source": [
+    "## The registry\n",
+    "\n",
+    "### Import\n",
     "\n",
-    "Single entry point: `from benchmarks import REGISTRY` plus the feature / phase\n",
-    "constants you need for filtering."
+    "Single entry point: `from benchmarks import REGISTRY` plus whichever\n",
+    "feature / phase tags you need for filtering. The cell below also defines\n",
+    "a `show_help(...)` helper used later to embed live `--help` output."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "2",
+   "id": "4",
    "metadata": {},
    "outputs": [],
    "source": [
     "# The benchmark suite isn't shipped in the linopy wheel — it lives in-tree.\n",
-    "# Find the repo root by walking up from cwd and put it on sys.path so the\n",
+    "# Walk up from cwd to find the repo root and put it on sys.path so the\n",
     "# import resolves whether jupyter was launched from the repo root, the\n",
     "# notebooks directory, or anywhere in between.\n",
+    "import os\n",
+    "import subprocess\n",
     "import sys\n",
     "from pathlib import Path\n",
     "\n",
@@ -61,25 +117,46 @@
     "    get,\n",
     ")\n",
     "\n",
+    "\n",
+    "def show_help(*subcommand: str) -> None:\n",
+    "    \"\"\"\n",
+    "    Shell out to ``python -m benchmarks ... --help`` and print the output.\n",
+    "\n",
+    "    Subprocesses don't inherit ``sys.path`` so we forward the repo root via\n",
+    "    PYTHONPATH. ``NO_COLOR=1`` makes rich emit plain text suitable for the\n",
+    "    notebook's text-output mechanism.\n",
+    "    \"\"\"\n",
+    "    cmd = [sys.executable, \"-m\", \"benchmarks\", *subcommand, \"--help\"]\n",
+    "    env = {**os.environ, \"PYTHONPATH\": str(_p), \"NO_COLOR\": \"1\"}\n",
+    "    result = subprocess.run(\n",
+    "        cmd,\n",
+    "        capture_output=True,\n",
+    "        text=True,\n",
+    "        env=env,\n",
+    "        check=True,\n",
+    "    )\n",
+    "    print(result.stdout)\n",
+    "\n",
+    "\n",
     "print(f\"{len(REGISTRY)} models registered: {sorted(REGISTRY)}\")"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "3",
+   "id": "5",
    "metadata": {},
    "source": [
-    "## 2. Look up one model by name\n",
+    "### Look up by name\n",
     "\n",
     "`REGISTRY[name]` returns a `ModelSpec` (frozen dataclass). Evaluating it\n",
-    "renders a full attribute table in Jupyter; `__repr__` gives a one-line\n",
-    "summary in scripts or `pytest -v` output."
+    "in Jupyter renders an attribute table via `_repr_html_`; `__repr__` gives\n",
+    "a one-line summary in scripts and `pytest -v` output."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "4",
+   "id": "6",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -89,16 +166,17 @@
   },
   {
    "cell_type": "markdown",
-   "id": "5",
+   "id": "7",
    "metadata": {},
    "source": [
-    "`.build(size)` constructs and returns a `linopy.Model`:"
+    "`.build(size)` constructs and returns a `linopy.Model`. Models pick up\n",
+    "their own repr from linopy:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "6",
+   "id": "8",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -107,17 +185,17 @@
   },
   {
    "cell_type": "markdown",
-   "id": "7",
+   "id": "9",
    "metadata": {},
    "source": [
-    "`get(\"name\")` is an equivalent functional accessor — handy when you don't\n",
-    "want to import `REGISTRY` directly."
+    "`get(\"name\")` is a functional equivalent to `REGISTRY[name]` — handy when\n",
+    "you don't want to import `REGISTRY` directly."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "8",
+   "id": "10",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -126,20 +204,19 @@
   },
   {
    "cell_type": "markdown",
-   "id": "9",
+   "id": "11",
    "metadata": {},
    "source": [
-    "## 3. Iterate the whole registry\n",
+    "### Iterate\n",
     "\n",
-    "Useful when you want to sweep your own test or profiling logic across every\n",
-    "model — e.g. checking that a refactor didn't break any spec. Each spec's\n",
-    "`__repr__` carries enough info for an at-a-glance overview."
+    "`REGISTRY.values()` yields every spec. Useful for sweeping your own\n",
+    "regression checks, or any invariant that should hold, across every model."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "10",
+   "id": "12",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -148,20 +225,22 @@
   },
   {
    "cell_type": "markdown",
-   "id": "11",
+   "id": "13",
    "metadata": {},
    "source": [
-    "## 4. Filter by feature\n",
+    "### Filter\n",
     "\n",
-    "`filter_by(has_feature=...)` returns specs that advertise that feature. The\n",
-    "feature tag constants (`CONTINUOUS`, `BINARY`, `INTEGER`, `QUADRATIC`, `SOS`,\n",
-    "`PIECEWISE`, `MASKED`) are exported from `benchmarks`."
+    "`filter_by(has_feature=...)` and `filter_by(has_phase=...)` narrow to\n",
+    "specs that declare a given feature or phase. Tags exported from\n",
+    "`benchmarks`: `CONTINUOUS`, `BINARY`, `INTEGER`, `QUADRATIC`, `SOS`,\n",
+    "`PIECEWISE`, `MASKED`, plus `BUILD`, `MATRICES`, `LP_WRITE`, `NETCDF`,\n",
+    "`SOLVER_BUILD`, `TO_HIGHSPY`, `TO_GUROBIPY`, `TO_MOSEK`, `TO_XPRESS`."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "12",
+   "id": "14",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -171,7 +250,7 @@
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "13",
+   "id": "15",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -180,21 +259,17 @@
   },
   {
    "cell_type": "markdown",
-   "id": "14",
+   "id": "16",
    "metadata": {},
    "source": [
-    "## 5. Filter by phase\n",
-    "\n",
-    "Each spec declares which **phases** apply — `BUILD`, `MATRICES`, `LP_WRITE`,\n",
-    "`NETCDF`, `SOLVER_BUILD`, plus per-solver `TO_HIGHSPY` / `TO_GUROBIPY` /\n",
-    "`TO_MOSEK` / `TO_XPRESS`. Use `has_phase=` to narrow to solver-compatible\n",
-    "models, e.g. when writing a Gurobi-specific regression test."
+    "Useful for solver-specific tests — find every spec that declares the\n",
+    "Gurobi handoff phase (i.e. claims Gurobi can ingest it):"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "15",
+   "id": "17",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -203,14 +278,16 @@
   },
   {
    "cell_type": "markdown",
-   "id": "16",
+   "id": "18",
    "metadata": {},
    "source": [
-    "## 6. Reuse pattern — parametrize your own pytest\n",
+    "## Reuse patterns\n",
     "\n",
-    "The pattern the suite itself uses (see `benchmarks/test_build.py` etc.) —\n",
-    "`iter_params(phase)` returns `(spec, size)` pairs for the given phase, and\n",
-    "`param_ids(...)` builds stable test IDs for `pytest.mark.parametrize`:\n",
+    "### Parametrize your own pytest\n",
+    "\n",
+    "The same pattern the suite uses internally — `iter_params(phase)`\n",
+    "flattens the registry into `(spec, size)` pairs for one phase, and\n",
+    "`param_ids(...)` builds stable pytest IDs:\n",
     "\n",
     "```python\n",
     "import pytest\n",
@@ -227,20 +304,21 @@
   },
   {
    "cell_type": "markdown",
-   "id": "17",
+   "id": "19",
    "metadata": {},
    "source": [
-    "## 7. Reuse pattern — one-off profiling\n",
+    "### Spot-profile memory\n",
     "\n",
-    "Grab a single model at a chosen size, measure something, throw it away.\n",
-    "`tracemalloc` works well for in-process peak-RSS spot checks (use\n",
-    "`benchmarks/memory.py` + pytest-memray for the real metric)."
+    "`tracemalloc` is a fast, in-process spot check. For real measurement\n",
+    "(peak RSS, separate process per benchmark) use\n",
+    "`python -m benchmarks memory save / compare`, which routes through\n",
+    "pytest-memray."
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "18",
+   "id": "20",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -251,7 +329,7 @@
     "_current, peak = tracemalloc.get_traced_memory()\n",
     "tracemalloc.stop()\n",
     "\n",
-    "print(f\"sparse_network n=100: built, peak allocation ≈ {peak / 1e6:.1f} MB\")\n",
+    "print(f\"sparse_network n=100: peak allocation ≈ {peak / 1e6:.1f} MB\")\n",
     "print(\n",
     "    f\"  {m.variables.nvars} scalar variables, {m.constraints.ncons} scalar constraints\"\n",
     ")"
@@ -259,39 +337,193 @@
   },
   {
    "cell_type": "markdown",
-   "id": "19",
+   "id": "21",
+   "metadata": {},
+   "source": [
+    "## Running\n",
+    "\n",
+    "All commands are subcommands of `python -m benchmarks`. The CLI is\n",
+    "self-documenting; the cells below embed its `--help` output live. If you\n",
+    "change a flag in `benchmarks/cli.py`, re-running this notebook updates\n",
+    "the documentation automatically.\n",
+    "\n",
+    "Three size tiers gate cost. Each spec declares its own thresholds:\n",
+    "\n",
+    "| Mode         | Sizes included            | Wall-clock |\n",
+    "| ------------ | ------------------------- | ---------- |\n",
+    "| `smoke`      | `size <= quick_threshold` | ~18 s      |\n",
+    "| `run`        | `size <= long_threshold`  | ~45 s      |\n",
+    "| `run --long` | all sizes                 | ~2 min     |\n",
+    "\n",
+    "Top-level command menu:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "22",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "show_help()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "23",
    "metadata": {},
    "source": [
-    "## 8. Running the benchmark suite\n",
+    "### Timing snapshots\n",
     "\n",
-    "Everything is exposed through a typer CLI; its auto-generated help is the\n",
-    "source of truth:\n",
+    "`run` is the main timing entry point. Its flags:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "24",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "show_help(\"run\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "25",
+   "metadata": {},
+   "source": [
+    "Use `--json` to save pytest-benchmark's output for later diffing — the\n",
+    "JSON includes per-test min / median / IQR over multiple iterations:\n",
     "\n",
     "```bash\n",
-    "python -m benchmarks --help            # top-level menu\n",
-    "python -m benchmarks run --help        # per-command flags\n",
+    "# baseline (e.g. on master)\n",
+    "python -m benchmarks run --json .benchmarks/master.json\n",
+    "\n",
+    "# candidate (e.g. on your branch)\n",
+    "python -m benchmarks run --json .benchmarks/my-feature.json\n",
+    "\n",
+    "# pytest-benchmark ships its own diff tool:\n",
+    "pytest-benchmark compare .benchmarks/master.json .benchmarks/my-feature.json\n",
     "```\n",
     "\n",
-    "Three size tiers configured per-spec via `quick_threshold` /\n",
-    "`long_threshold`: `smoke` (≤ quick), `run` (≤ long), `run --long` (all\n",
-    "sizes). Pytest still works directly for power users."
+    "Compare medians on short runs and read IQR as the noise gauge — both stay\n",
+    "stable across outliers in a way that min / mean can't."
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "20",
+   "id": "26",
+   "metadata": {},
+   "source": [
+    "### Memory snapshots\n",
+    "\n",
+    "`memory save` runs the build phase under pytest-memray in a separate\n",
+    "process per benchmark (so peak RSS doesn't accumulate across tests) and\n",
+    "writes JSON. `memory compare` diffs two snapshots:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "27",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "show_help(\"memory\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "28",
+   "metadata": {},
+   "source": [
+    "```bash\n",
+    "python -m benchmarks memory save master\n",
+    "python -m benchmarks memory save \"$(git rev-parse --short HEAD)\"\n",
+    "python -m benchmarks memory compare master \"$(git rev-parse --short HEAD)\"\n",
+    "```\n",
+    "\n",
+    "Memory is build-only because later phases include build allocations and\n",
+    "attribution becomes unreliable."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "29",
+   "metadata": {},
+   "source": [
+    "### Cross-version sweep\n",
+    "\n",
+    "`sweep` bootstraps perf history against published linopy releases. For\n",
+    "each version it builds a fresh uv venv, installs the lockfile + that\n",
+    "linopy, runs the suite, and saves a JSON snapshot."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "30",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "show_help(\"sweep\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "31",
+   "metadata": {},
+   "source": [
+    "```bash\n",
+    "python -m benchmarks sweep 0.5.0 0.6.0 0.7.0 \\\n",
+    "    --output-dir .benchmarks/sweep\n",
+    "```\n",
+    "\n",
+    "The current (repo-tip) benchmark code runs against each linopy version,\n",
+    "so the measurement layer is constant. Specs whose APIs aren't present in\n",
+    "older linopy (currently `sos` and `piecewise`) skip themselves gracefully\n",
+    "via the `_API_AVAILABLE` gate at registration time."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "32",
    "metadata": {},
    "source": [
-    "## 9. Adding a new model\n",
+    "## Extending\n",
+    "\n",
+    "### Adding a new model\n",
     "\n",
     "1. Drop `benchmarks/models/<name>.py` with a `build_<name>(size) -> Model`.\n",
     "2. Build a `ModelSpec` and call `register(...)` at module scope. Declare\n",
     "   realistic `quick_threshold` / `long_threshold` so the smoke stays fast.\n",
     "3. Add an import in `benchmarks/models/__init__.py` so registration fires.\n",
     "\n",
-    "That's it — every phase test picks the spec up automatically through\n",
+    "Every phase test picks the spec up automatically through\n",
     "`iter_params(phase)`."
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "33",
+   "metadata": {},
+   "source": [
+    "### Regenerating the lockfile\n",
+    "\n",
+    "After bumping pins in `pyproject.toml`'s `[benchmarks]` extra, regenerate\n",
+    "`benchmarks/requirements.lock`:\n",
+    "\n",
+    "```bash\n",
+    "uv pip compile pyproject.toml \\\n",
+    "    --extra benchmarks --extra dev --extra solvers \\\n",
+    "    --no-emit-package linopy \\\n",
+    "    -o benchmarks/requirements.lock\n",
+    "```\n",
+    "\n",
+    "The `--no-emit-package linopy` exclusion is critical — without it, the\n",
+    "lockfile pins linopy itself and `sweep` can't vary it."
+   ]
   }
  ],
  "metadata": {
@@ -299,6 +531,18 @@
    "display_name": "Python 3",
    "language": "python",
    "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
   }
  },
  "nbformat": 4,
diff --git a/benchmarks/requirements.lock b/benchmarks/requirements.lock
index da416b5d..40a78e20 100644
--- a/benchmarks/requirements.lock
+++ b/benchmarks/requirements.lock
@@ -33,16 +33,12 @@ beautifulsoup4==4.14.3
 bleach==6.3.0
     # via nbconvert
 bottleneck==1.6.0
-    # via
-    #   linopy (pyproject.toml)
-    #   linopy
+    # via linopy (pyproject.toml)
 certifi==2026.5.20
     # via
     #   httpcore
     #   httpx
     #   netcdf4
-    #   pyogrio
-    #   pyproj
     #   requests
 cffi==2.0.0
     # via
@@ -63,8 +59,6 @@ comm==0.2.3
     # via
     #   ipykernel
     #   ipywidgets
-contourpy==1.3.3
-    # via matplotlib
 coptpy==8.0.4
     # via linopy (pyproject.toml)
 coverage==7.14.1
@@ -73,12 +67,8 @@ cryptography==48.0.0
     # via
     #   paramiko
     #   types-paramiko
-cycler==0.12.1
-    # via matplotlib
 dask==2026.3.0
-    # via
-    #   linopy (pyproject.toml)
-    #   linopy
+    # via linopy (pyproject.toml)
 debugpy==1.8.20
     # via ipykernel
 decorator==5.3.1
@@ -86,10 +76,7 @@ decorator==5.3.1
 defusedxml==0.7.1
     # via nbconvert
 deprecation==2.1.0
-    # via
-    #   linopy (pyproject.toml)
-    #   linopy
-    #   pypsa
+    # via linopy (pyproject.toml)
 distlib==0.4.0
     # via virtualenv
 executing==2.2.1
@@ -100,22 +87,16 @@ filelock==3.29.0
     # via
     #   python-discovery
     #   virtualenv
-fonttools==4.63.0
-    # via matplotlib
 fqdn==1.5.1
     # via jsonschema
 fsspec==2026.4.0
     # via dask
-geopandas==1.1.3
-    # via pypsa
 gurobipy==13.0.2
     # via linopy (pyproject.toml)
 h11==0.16.0
     # via httpcore
 highspy==1.13.1
-    # via
-    #   linopy (pyproject.toml)
-    #   pypsa
+    # via linopy (pyproject.toml)
 httpcore==1.0.9
     # via httpx
 httpx==0.28.1
@@ -159,7 +140,6 @@ jinja2==3.1.6
     #   jupyterlab-server
     #   memray
     #   nbconvert
-    #   pydeck
 json5==0.14.0
     # via jupyterlab-server
 jsonpointer==3.1.1
@@ -216,14 +196,10 @@ jupyterlab-server==2.28.0
     #   notebook
 jupyterlab-widgets==3.0.16
     # via ipywidgets
-kiwisolver==1.5.0
-    # via matplotlib
 knitro==15.1.0
     # via linopy (pyproject.toml)
 lark==1.3.1
     # via rfc3987-syntax
-levenshtein==0.27.3
-    # via pypsa
 librt==0.11.0
     # via mypy
 linkify-it-py==2.1.0
@@ -239,10 +215,6 @@ markupsafe==3.0.3
     # via
     #   jinja2
     #   nbconvert
-matplotlib==3.10.9
-    # via
-    #   pypsa
-    #   seaborn
 matplotlib-inline==0.2.2
     # via
     #   ipykernel
@@ -263,8 +235,6 @@ mypy==2.1.0
     # via linopy (pyproject.toml)
 mypy-extensions==1.1.0
     # via mypy
-narwhals==2.21.2
-    # via plotly
 nbclient==0.10.4
     # via nbconvert
 nbconvert==7.17.1
@@ -280,11 +250,7 @@ nbformat==5.10.4
 nest-asyncio==1.6.0
     # via ipykernel
 netcdf4==1.7.4
-    # via
-    #   linopy (pyproject.toml)
-    #   pypsa
-networkx==3.6.1
-    # via pypsa
+    # via linopy (pyproject.toml)
 nodeenv==1.10.0
     # via pre-commit
 notebook==7.5.6
@@ -294,30 +260,19 @@ notebook-shim==0.2.4
     #   jupyterlab
     #   notebook
 numexpr==2.14.1
-    # via
-    #   linopy (pyproject.toml)
-    #   linopy
+    # via linopy (pyproject.toml)
 numpy==2.4.6
     # via
     #   linopy (pyproject.toml)
     #   bottleneck
     #   cftime
-    #   contourpy
-    #   geopandas
     #   highspy
-    #   linopy
-    #   matplotlib
     #   mindoptpy
     #   mosek
     #   netcdf4
     #   numexpr
     #   pandas
-    #   pydeck
-    #   pyogrio
-    #   pypsa
     #   scipy
-    #   seaborn
-    #   shapely
     #   xarray
 overrides==7.7.0
     # via jupyter-server
@@ -326,25 +281,16 @@ packaging==26.2
     #   linopy (pyproject.toml)
     #   dask
     #   deprecation
-    #   geopandas
     #   ipykernel
     #   jupyter-events
     #   jupyter-server
     #   jupyterlab
     #   jupyterlab-server
-    #   linopy
-    #   matplotlib
     #   nbconvert
-    #   plotly
-    #   pyogrio
     #   pytest
     #   xarray
 pandas==3.0.3
-    # via
-    #   geopandas
-    #   pypsa
-    #   seaborn
-    #   xarray
+    # via xarray
 pandocfilters==1.5.1
     # via nbconvert
 paramiko==5.0.0
@@ -357,25 +303,18 @@ pathspec==1.1.1
     # via mypy
 pexpect==4.9.0
     # via ipython
-pillow==12.2.0
-    # via matplotlib
 platformdirs==4.10.0
     # via
     #   jupyter-core
-    #   pypsa
     #   python-discovery
     #   textual
     #   virtualenv
-plotly==6.7.0
-    # via pypsa
 pluggy==1.6.0
     # via
     #   pytest
     #   pytest-cov
 polars==1.41.1
-    # via
-    #   linopy (pyproject.toml)
-    #   linopy
+    # via linopy (pyproject.toml)
 polars-runtime-32==1.41.1
     # via polars
 pre-commit==4.6.0
@@ -400,8 +339,6 @@ py-cpuinfo==9.0.0
     # via pytest-benchmark
 pycparser==3.0
     # via cffi
-pydeck==0.9.2
-    # via pypsa
 pygments==2.20.0
     # via
     #   ipython
@@ -413,14 +350,6 @@ pygments==2.20.0
     #   textual
 pynacl==1.6.2
     # via paramiko
-pyogrio==0.12.1
-    # via geopandas
-pyparsing==3.3.2
-    # via matplotlib
-pyproj==3.7.2
-    # via geopandas
-pypsa==1.2.2
-    # via linopy (pyproject.toml)
 pytest==9.0.3
     # via
     #   linopy (pyproject.toml)
@@ -437,7 +366,6 @@ python-dateutil==2.9.0.post0
     # via
     #   arrow
     #   jupyter-client
-    #   matplotlib
     #   pandas
 python-discovery==1.4.0
     # via virtualenv
@@ -454,8 +382,6 @@ pyzmq==27.1.0
     #   jupyter-client
     #   jupyter-console
     #   jupyter-server
-rapidfuzz==3.14.5
-    # via levenshtein
 referencing==0.37.0
     # via
     #   jsonschema
@@ -485,19 +411,11 @@ rpds-py==2026.5.1
 scipy==1.17.1
     # via
     #   linopy (pyproject.toml)
-    #   linopy
     #   mindoptpy
-    #   pypsa
-seaborn==0.13.2
-    # via pypsa
 send2trash==2.1.0
     # via jupyter-server
 setuptools==82.0.1
     # via jupyterlab
-shapely==2.1.2
-    # via
-    #   geopandas
-    #   pypsa
 shellingham==1.5.4
     # via typer
 six==1.17.0
@@ -520,7 +438,6 @@ toolz==1.1.0
     # via
     #   linopy (pyproject.toml)
     #   dask
-    #   linopy
     #   partd
 tornado==6.5.6
     # via
@@ -531,9 +448,7 @@ tornado==6.5.6
     #   notebook
     #   terminado
 tqdm==4.67.3
-    # via
-    #   linopy (pyproject.toml)
-    #   linopy
+    # via linopy (pyproject.toml)
 traitlets==5.15.0
     # via
     #   ipykernel
@@ -573,8 +488,6 @@ urllib3==2.7.0
     # via
     #   requests
     #   types-requests
-validators==0.35.0
-    # via pypsa
 virtualenv==21.4.1
     # via pre-commit
 wcwidth==0.7.0
@@ -590,12 +503,6 @@ websocket-client==1.9.0
 widgetsnbextension==4.0.15
     # via ipywidgets
 xarray==2026.4.0
-    # via
-    #   linopy (pyproject.toml)
-    #   linopy
-    #   pypsa
+    # via linopy (pyproject.toml)
 zipp==4.1.0
     # via importlib-metadata
-
-# The following packages were excluded from the output:
-# linopy
diff --git a/benchmarks/test_pypsa_carbon_management.py b/benchmarks/test_pypsa_carbon_management.py
index 7f29a52e..7583de5a 100644
--- a/benchmarks/test_pypsa_carbon_management.py
+++ b/benchmarks/test_pypsa_carbon_management.py
@@ -1,8 +1,11 @@
-import pypsa
 import pytest
 
 import linopy as lp
 
+# pypsa is an optional benchmark dep. Skip the whole module if it's missing
+# so the rest of the suite stays collectable without it.
+pypsa = pytest.importorskip("pypsa")
+
 
 @pytest.fixture(scope="module")
 def network():
diff --git a/pyproject.toml b/pyproject.toml
index cae6831c..5b3bc1cb 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -99,7 +99,6 @@ benchmarks = [
     "pytest==9.0.3",
     "pytest-benchmark==5.2.3",
     "pytest-memray==1.8.0",
-    "pypsa==1.2.2",
     "highspy==1.13.1",
     "netcdf4==1.7.4",
     "nbconvert==7.17.1",

From 0522a752623c78a9bf4348891208780452e70bbe Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 14:58:15 +0200
Subject: [PATCH 13/68] benchmarks: sweep gains --phase / --model / --filter +
 pytest pass-through

Mirrors ``run``'s filter knobs and applies them to every version's
pytest invocation. Also switches to the ``typer.Context`` +
``context_settings`` pattern so trailing args after ``--`` are
forwarded to pytest verbatim (same shape ``smoke`` / ``run`` use).

    python -m benchmarks sweep 0.6.7 --phase build --model basic
    python -m benchmarks sweep 0.6.7 -- --tb=short -x

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli.py | 39 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 37 insertions(+), 2 deletions(-)

diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index 8ef6349d..6157889f 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -298,8 +298,11 @@ def _venv_python(venv: Path) -> Path:
     )
 
 
-@app.command()
+@app.command(
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
 def sweep(
+    ctx: typer.Context,
     versions: Annotated[
         list[str],
         typer.Argument(help="linopy versions, e.g. 0.4.0 0.5.0 (or any pip spec)."),
@@ -315,6 +318,22 @@ def sweep(
         bool,
         typer.Option("--quick", help="Use only the smallest sizes (faster sweep)."),
     ] = False,
+    phase: Annotated[
+        PhaseName | None,
+        typer.Option(help="Restrict each version's run to one phase's test file."),
+    ] = None,
+    model: Annotated[
+        str | None,
+        typer.Option(help="Restrict to one model (passed as pytest ``-k``)."),
+    ] = None,
+    filter_expr: Annotated[
+        str | None,
+        typer.Option(
+            "--filter",
+            "-k",
+            help="Arbitrary pytest ``-k`` expression (AND-ed with ``--model``).",
+        ),
+    ] = None,
     use_lock: Annotated[
         bool,
         typer.Option(
@@ -342,6 +361,13 @@ def sweep(
     gates in the ``sos`` / ``piecewise`` specs let older linopy versions
     skip those phases gracefully.
 
+    Filter knobs (``--phase``, ``--model``, ``--filter``) mirror ``run``
+    and apply to every version's pytest invocation. Trailing arguments
+    after ``--`` are forwarded to pytest verbatim:
+
+        python -m benchmarks sweep 0.6.7 --phase build --model basic
+        python -m benchmarks sweep 0.6.7 -- --tb=short -x
+
     Wall-clock: roughly 1-2 minutes per version (venv + install +
     benchmarks). uv's wheel cache makes repeated runs much faster.
     """
@@ -422,11 +448,14 @@ def sweep(
             env = os.environ.copy()
             env["PYTHONPATH"] = str(repo_root)
 
+            test_target = (
+                _PHASE_TEST_FILE[phase] if phase is not None else "benchmarks/"
+            )
             pytest_cmd = [
                 str(vpy),
                 "-m",
                 "pytest",
-                "benchmarks/",
+                test_target,
                 "--benchmark-only",
                 "--benchmark-json",
                 str(snapshot),
@@ -436,6 +465,12 @@ def sweep(
             elif long:
                 pytest_cmd.append("--long")
 
+            k_parts = [p for p in (model, filter_expr) if p]
+            if k_parts:
+                pytest_cmd.extend(["-k", " and ".join(k_parts)])
+
+            pytest_cmd.extend(ctx.args)
+
             typer.secho(f"$ {' '.join(pytest_cmd)}", fg=typer.colors.BRIGHT_BLACK)
             subprocess.run(pytest_cmd, env=env, check=False)
 

From 7bb464e7aed7c07fee79618639bf6c2fcae57fed Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 15:01:55 +0200
Subject: [PATCH 14/68] benchmarks: add ``compare`` subcommand wrapping
 pytest-benchmark compare
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

``python -m benchmarks compare a.json b.json [-- --columns=...]``
shells out to ``pytest-benchmark compare`` so the whole suite stays
under one entry point. Accepts any number of snapshots; first is the
baseline.

When called with no arguments — or with paths that don't exist — it
prints a copy-paste-ready list of snapshots found under
``.benchmarks/`` (including ``.benchmarks/sweep/`` for cross-version
runs). If nothing's saved yet, it points at the ``run --json`` flow.

For memory snapshots use ``memory compare`` instead — different
format, different tool.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli.py | 82 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 82 insertions(+)

diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index 6157889f..4b3897e5 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -489,6 +489,88 @@ def sweep(
         raise typer.Exit(code=1)
 
 
+# --- Compare timing snapshots ---------------------------------------------
+
+
+def _discover_snapshots() -> list[Path]:
+    """Return JSON snapshot files under the canonical .benchmarks/ tree."""
+    root = Path.cwd() / ".benchmarks"
+    if not root.exists():
+        return []
+    return sorted(root.rglob("*.json"))
+
+
+def _suggest_snapshots(reason: str) -> None:
+    """Print an error + a hint listing whatever snapshots we can find."""
+    typer.secho(reason, fg=typer.colors.RED, err=True)
+    found = _discover_snapshots()
+    if found:
+        typer.echo("\nAvailable snapshots under .benchmarks/:", err=True)
+        for p in found:
+            typer.echo(f"  {p}", err=True)
+    else:
+        typer.echo(
+            "\nNo snapshots found under .benchmarks/. Generate one with:\n"
+            "  python -m benchmarks run --json .benchmarks/<label>.json",
+            err=True,
+        )
+
+
+@app.command(
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def compare(
+    ctx: typer.Context,
+    snapshots: Annotated[
+        list[Path] | None,
+        typer.Argument(
+            help="Two or more pytest-benchmark JSON files saved via ``run --json``."
+        ),
+    ] = None,
+) -> None:
+    """
+    Compare timing snapshots side-by-side via ``pytest-benchmark compare``.
+
+    Thin wrapper around the upstream tool so the whole suite stays under
+    one entry point. Pass any number of JSONs — the first is treated as
+    the baseline. Trailing arguments after ``--`` are forwarded to
+    ``pytest-benchmark compare`` verbatim:
+
+        python -m benchmarks compare .benchmarks/master.json .benchmarks/branch.json
+        python -m benchmarks compare a.json b.json -- --columns=median,iqr --sort=name
+
+    With no arguments (or missing paths), prints what snapshots exist
+    under ``.benchmarks/`` so you can copy-paste the path you want.
+
+    For memory snapshots use ``memory compare`` instead — different format,
+    different tool.
+    """
+    snapshots = snapshots or []
+    if len(snapshots) < 2:
+        _suggest_snapshots(
+            f"compare needs at least two snapshot paths (got {len(snapshots)})."
+        )
+        raise typer.Exit(code=2)
+
+    missing = [p for p in snapshots if not p.exists()]
+    if missing:
+        _suggest_snapshots(f"missing snapshots: {[str(p) for p in missing]}")
+        raise typer.Exit(code=2)
+
+    cmd = [
+        sys.executable,
+        "-m",
+        "pytest_benchmark",
+        "compare",
+        *[str(p) for p in snapshots],
+        *ctx.args,
+    ]
+    typer.secho(f"$ {' '.join(cmd)}", fg=typer.colors.BRIGHT_BLACK)
+    result = subprocess.run(cmd, check=False)
+    if result.returncode != 0:
+        raise typer.Exit(code=result.returncode)
+
+
 # --- Memory subcommands ----------------------------------------------------
 
 

From 83bdedacd8eebcf2b23b6eb8f5584da9b643a432 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 15:02:58 +0200
Subject: [PATCH 15/68] benchmarks: compare lists snapshots as relative paths
 (easier to copy-paste)

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index 4b3897e5..48ceafa9 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -493,8 +493,13 @@ def sweep(
 
 
 def _discover_snapshots() -> list[Path]:
-    """Return JSON snapshot files under the canonical .benchmarks/ tree."""
-    root = Path.cwd() / ".benchmarks"
+    """
+    Return JSON snapshot files under the canonical .benchmarks/ tree.
+
+    Paths are relative to cwd so they're easier to copy-paste back into
+    the CLI than the absolute form would be.
+    """
+    root = Path(".benchmarks")
     if not root.exists():
         return []
     return sorted(root.rglob("*.json"))

From 8e378b58e6bfb2cf6c37e2dcd6cfd2e3012d3a27 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 15:06:10 +0200
Subject: [PATCH 16/68] benchmarks: tighter defaults for ``compare``
 (median/iqr, sorted by name)

pytest-benchmark's own default emits 10 columns side-by-side, which is
unreadable for any non-trivial comparison. Wrapper now prepends
``--columns=median,iqr --sort=name`` so the table is two stats wide
and the (baseline, candidate) pair of each test sits together
alphabetically.

Defaults are only applied when the user hasn't already set the flag,
so trailing pass-through overrides still work.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index 48ceafa9..a9bdad77 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -562,13 +562,22 @@ def compare(
         _suggest_snapshots(f"missing snapshots: {[str(p) for p in missing]}")
         raise typer.Exit(code=2)
 
+    # Sensible defaults — pytest-benchmark's own default is 10 columns wide,
+    # which is unreadable. Only apply each default if the user hasn't already
+    # set it via a trailing arg.
+    extra = list(ctx.args)
+    if not any(a.startswith("--columns") for a in extra):
+        extra.insert(0, "--columns=median,iqr")
+    if not any(a.startswith("--sort") for a in extra):
+        extra.insert(0, "--sort=name")
+
     cmd = [
         sys.executable,
         "-m",
         "pytest_benchmark",
         "compare",
         *[str(p) for p in snapshots],
-        *ctx.args,
+        *extra,
     ]
     typer.secho(f"$ {' '.join(cmd)}", fg=typer.colors.BRIGHT_BLACK)
     result = subprocess.run(cmd, check=False)

From f67721b8d61d2ba3d3c077009b7e36bbc06d4c6b Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 15:09:44 +0200
Subject: [PATCH 17/68] benchmarks: compare gains ``--group-by=fullname``
 default + ctx.args arg split

Two fixes for the ``compare`` UX surfaced by the cross-version sweep:

- Default to ``--group-by=fullname`` so each test gets its own mini
  table showing (baseline, candidate) side-by-side with the
  parenthesized auto-ratio per column. Easy to scan ``(>1.10)`` for
  regressions in the median column. Combined with the existing
  ``--columns=median,iqr --sort=name`` defaults, the output goes from
  10-columns-wide-on-one-line to a focused two-column per-test view.

- Switch ``compare`` away from a positional ``list[Path]`` argument and
  parse ``ctx.args`` by hand: typer's positional list was greedily
  grabbing trailing ``--group-by=fullname`` etc. (and the ``--``
  separator didn't escape it either). Now arg-splitting is explicit:
  anything starting with ``-`` is pytest-benchmark pass-through,
  everything else is a snapshot path.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli.py | 45 ++++++++++++++++++++++++++-------------------
 1 file changed, 26 insertions(+), 19 deletions(-)

diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index a9bdad77..de82aa23 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -524,25 +524,18 @@ def _suggest_snapshots(reason: str) -> None:
 @app.command(
     context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
 )
-def compare(
-    ctx: typer.Context,
-    snapshots: Annotated[
-        list[Path] | None,
-        typer.Argument(
-            help="Two or more pytest-benchmark JSON files saved via ``run --json``."
-        ),
-    ] = None,
-) -> None:
+def compare(ctx: typer.Context) -> None:
     """
     Compare timing snapshots side-by-side via ``pytest-benchmark compare``.
 
     Thin wrapper around the upstream tool so the whole suite stays under
-    one entry point. Pass any number of JSONs — the first is treated as
-    the baseline. Trailing arguments after ``--`` are forwarded to
-    ``pytest-benchmark compare`` verbatim:
+    one entry point. Positional args that look like paths are treated as
+    snapshots (first = baseline); everything else is forwarded to
+    ``pytest-benchmark compare`` verbatim::
 
-        python -m benchmarks compare .benchmarks/master.json .benchmarks/branch.json
-        python -m benchmarks compare a.json b.json -- --columns=median,iqr --sort=name
+        python -m benchmarks compare .benchmarks/a.json .benchmarks/b.json
+        python -m benchmarks compare a.json b.json --group-by=fullname
+        python -m benchmarks compare a.json b.json --columns=median,iqr,outliers
 
     With no arguments (or missing paths), prints what snapshots exist
     under ``.benchmarks/`` so you can copy-paste the path you want.
@@ -550,7 +543,18 @@ def compare(
     For memory snapshots use ``memory compare`` instead — different format,
     different tool.
     """
-    snapshots = snapshots or []
+    # Split args into snapshot paths and flag-like pass-through. We do this
+    # by hand because typer's positional list[Path] greedily captures
+    # everything (including ``--unknown-flag``) even with
+    # ``ignore_unknown_options=True``.
+    snapshots: list[Path] = []
+    extra: list[str] = []
+    for arg in ctx.args:
+        if arg.startswith("-"):
+            extra.append(arg)
+        else:
+            snapshots.append(Path(arg))
+
     if len(snapshots) < 2:
         _suggest_snapshots(
             f"compare needs at least two snapshot paths (got {len(snapshots)})."
@@ -562,14 +566,17 @@ def compare(
         _suggest_snapshots(f"missing snapshots: {[str(p) for p in missing]}")
         raise typer.Exit(code=2)
 
-    # Sensible defaults — pytest-benchmark's own default is 10 columns wide,
-    # which is unreadable. Only apply each default if the user hasn't already
-    # set it via a trailing arg.
-    extra = list(ctx.args)
+    # Sensible defaults — pytest-benchmark's defaults emit 10 columns wide,
+    # grouped by parametrize group, which is unreadable for two-snapshot diffs.
+    # ``--group-by=fullname`` puts each test's (baseline, candidate) rows in
+    # their own mini-table; ``--columns=median,iqr`` keeps it narrow.
+    # Each default is only applied if the user didn't override it.
     if not any(a.startswith("--columns") for a in extra):
         extra.insert(0, "--columns=median,iqr")
     if not any(a.startswith("--sort") for a in extra):
         extra.insert(0, "--sort=name")
+    if not any(a.startswith("--group-by") for a in extra):
+        extra.insert(0, "--group-by=fullname")
 
     cmd = [
         sys.executable,

From 3ac333b728e940f68c5efb1c5a3349f00efbff9b Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 15:13:55 +0200
Subject: [PATCH 18/68] benchmarks: revert compare to manual arg-split +
 acknowledge typer wart
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tried switching to the canonical typer pattern (``--`` separator for
pass-through) but typer's positional ``list[Path]`` + ``allow_extra_args``
still greedily ate the trailing options. There's no clean typer/click
idiom for "list-typed positional + pass-through" — workarounds are
manual splitting, bounding the positional count, or named flags.

Manual splitting is the most pragmatic: snapshots come first, once we
see any flag-like token the rest is forwarded to pytest-benchmark.
That preserves things like ``--histogram=/tmp/hist/cmp`` (built-in
SVG-per-test plotting), ``--csv=out.csv``, ``--group-by=fullname``,
and value-taking flags whose value is a separate token that doesn't
start with ``-`` (the previous split misread such values as snapshot
paths).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli.py | 26 ++++++++++++++++----------
 1 file changed, 16 insertions(+), 10 deletions(-)

diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index de82aa23..d0f48832 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -529,28 +529,34 @@ def compare(ctx: typer.Context) -> None:
     Compare timing snapshots side-by-side via ``pytest-benchmark compare``.
 
     Thin wrapper around the upstream tool so the whole suite stays under
-    one entry point. Positional args that look like paths are treated as
-    snapshots (first = baseline); everything else is forwarded to
-    ``pytest-benchmark compare`` verbatim::
+    one entry point. Pass the snapshot paths first, then any pytest-benchmark
+    flags::
 
-        python -m benchmarks compare .benchmarks/a.json .benchmarks/b.json
-        python -m benchmarks compare a.json b.json --group-by=fullname
-        python -m benchmarks compare a.json b.json --columns=median,iqr,outliers
+        python -m benchmarks compare a.json b.json
+        python -m benchmarks compare a.json b.json --group-by=name
+        python -m benchmarks compare a.json b.json --histogram=plots/cmp
 
     With no arguments (or missing paths), prints what snapshots exist
     under ``.benchmarks/`` so you can copy-paste the path you want.
 
     For memory snapshots use ``memory compare`` instead — different format,
     different tool.
+
+    Implementation note: typer/click don't have a clean idiom for "list-typed
+    positional + pass-through", so this command parses ``ctx.args`` by hand
+    — anything before the first flag is a snapshot path, everything after
+    is forwarded.
     """
-    # Split args into snapshot paths and flag-like pass-through. We do this
-    # by hand because typer's positional list[Path] greedily captures
-    # everything (including ``--unknown-flag``) even with
-    # ``ignore_unknown_options=True``.
+    # Snapshots come first; once we see a flag (``-x`` / ``--foo``) every
+    # subsequent token is forwarded to pytest-benchmark. That way the value
+    # of a flag like ``-k "build and basic"`` doesn't get mistaken for a path.
     snapshots: list[Path] = []
     extra: list[str] = []
+    seen_flag = False
     for arg in ctx.args:
         if arg.startswith("-"):
+            seen_flag = True
+        if seen_flag:
             extra.append(arg)
         else:
             snapshots.append(Path(arg))

From 919e06149b69276ca1a3cf0658412039f5aef1e4 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 15:21:27 +0200
Subject: [PATCH 19/68] benchmarks: add ``plot`` subcommand (compare / sweep /
 scaling views)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three opinionated interactive HTML views over pytest-benchmark JSONs,
auto-picked from snapshot count or set explicitly via ``--view``:

- **compare** (2 snapshots) — horizontal bar chart of per-test median
  delta, sorted by magnitude, green→red colormap. The "did this PR
  regress anything?" picture in one glance, vs pytest-benchmark's
  60-individual-SVGs which are useless for that workflow.
- **sweep** (3+ snapshots) — heatmap of median ratio relative to the
  first snapshot, rows = tests, columns = labels. Pairs with the
  ``sweep`` subcommand.
- **scaling** (1 snapshot) — log-log median vs ``n`` for
  size-parametrized tests (e.g. ``[basic-n=10..1600]``), faceted by
  phase. Shows whether linopy's complexity scales as expected.

plotly==6.7.0 pinned in [benchmarks]; lockfile regenerated. plotly is
lazy-imported inside ``plot`` so the rest of the suite stays usable
without it (with a clear error if a user tries ``plot`` and it's
missing).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli.py            | 236 +++++++++++++++++++++++++++++++++++
 benchmarks/requirements.lock |   5 +
 pyproject.toml               |   1 +
 3 files changed, 242 insertions(+)

diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index d0f48832..11864c7c 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -598,6 +598,242 @@ def compare(ctx: typer.Context) -> None:
         raise typer.Exit(code=result.returncode)
 
 
+# --- Plotting --------------------------------------------------------------
+
+
+PlotView = Literal["compare", "sweep", "scaling"]
+
+
+def _load_snapshot(path: Path) -> tuple[str, dict[str, float]]:
+    """Return (label, {fullname: median_seconds}) for a pytest-benchmark JSON."""
+    import json
+
+    data = json.loads(path.read_text())
+    medians = {bm["fullname"]: bm["stats"]["median"] for bm in data["benchmarks"]}
+    return path.stem, medians
+
+
+def _plot_compare(snapshots: list[Path], output: Path) -> int:
+    """Bar chart of relative median delta per test, sorted by magnitude."""
+    import pandas as pd
+    import plotly.express as px
+
+    (a_label, a_med), (b_label, b_med) = (
+        _load_snapshot(snapshots[0]),
+        _load_snapshot(snapshots[1]),
+    )
+    common = sorted(set(a_med) & set(b_med))
+    if not common:
+        typer.secho(
+            "no tests in common between the two snapshots",
+            fg=typer.colors.RED,
+            err=True,
+        )
+        raise typer.Exit(code=1)
+
+    rows = [
+        {
+            "test": name,
+            "delta_pct": (b_med[name] - a_med[name]) / a_med[name] * 100.0,
+        }
+        for name in common
+    ]
+    df = pd.DataFrame(rows)
+    df = df.reindex(df["delta_pct"].abs().sort_values(ascending=True).index)
+
+    fig = px.bar(
+        df,
+        x="delta_pct",
+        y="test",
+        orientation="h",
+        color="delta_pct",
+        color_continuous_scale=["green", "white", "red"],
+        color_continuous_midpoint=0,
+        title=f"Median delta: {a_label} → {b_label} (positive = slower)",
+        labels={"delta_pct": "median delta %", "test": ""},
+    )
+    fig.update_layout(height=max(400, len(df) * 14), showlegend=False)
+    fig.write_html(output)
+    return len(df)
+
+
+def _plot_sweep(snapshots: list[Path], output: Path) -> int:
+    """Heatmap of per-test median ratio relative to the first snapshot."""
+    import pandas as pd
+    import plotly.express as px
+
+    loaded = [_load_snapshot(p) for p in snapshots]
+    versions = [label for label, _ in loaded]
+    baseline = loaded[0][1]
+    all_tests = sorted(set().union(*[set(med) for _, med in loaded]))
+
+    matrix: dict[str, list[float | None]] = {}
+    for test in all_tests:
+        base = baseline.get(test)
+        if not base:
+            continue
+        row = []
+        for _, med in loaded:
+            t = med.get(test)
+            row.append(t / base if t else None)
+        matrix[test] = row
+
+    if not matrix:
+        typer.secho(
+            f"no overlap with baseline snapshot {versions[0]}",
+            fg=typer.colors.RED,
+            err=True,
+        )
+        raise typer.Exit(code=1)
+
+    df = pd.DataFrame(matrix, index=versions).T  # rows = tests, cols = versions
+    fig = px.imshow(
+        df,
+        color_continuous_scale=["green", "white", "red"],
+        color_continuous_midpoint=1.0,
+        aspect="auto",
+        title=f"Median ratio relative to baseline ({versions[0]})",
+        labels={"x": "version", "y": "test", "color": "ratio"},
+    )
+    fig.update_layout(height=max(400, len(df) * 14))
+    fig.write_html(output)
+    return len(df)
+
+
+_SIZE_RE = re.compile(r"(.*)\[([^\[\]]+?)-n=(\d+)\]")
+
+
+def _plot_scaling(snapshots: list[Path], output: Path) -> int:
+    """Log-log median vs N for size-parametrized tests, faceted by phase."""
+    import pandas as pd
+    import plotly.express as px
+
+    _, med = _load_snapshot(snapshots[0])
+    rows = []
+    for name, t in med.items():
+        m = _SIZE_RE.match(name)
+        if not m:
+            continue
+        phase_path, model, n = m.groups()
+        phase = phase_path.split("::")[-1]
+        rows.append({"phase": phase, "model": model, "n": int(n), "median": t})
+
+    if not rows:
+        typer.secho(
+            "no size-parametrized tests found (expected ``...[<model>-n=<N>]``)",
+            fg=typer.colors.RED,
+            err=True,
+        )
+        raise typer.Exit(code=1)
+
+    df = pd.DataFrame(rows).sort_values(["phase", "model", "n"])
+    fig = px.line(
+        df,
+        x="n",
+        y="median",
+        color="model",
+        facet_col="phase",
+        facet_col_wrap=3,
+        log_x=True,
+        log_y=True,
+        markers=True,
+        title=f"Scaling: median time vs problem size ({snapshots[0].stem})",
+    )
+    fig.update_layout(height=max(400, ((df["phase"].nunique() + 2) // 3) * 350))
+    fig.write_html(output)
+    return len(df)
+
+
+@app.command()
+def plot(
+    snapshots: Annotated[
+        list[Path],
+        typer.Argument(help="pytest-benchmark JSON snapshot(s)."),
+    ],
+    view: Annotated[
+        PlotView | None,
+        typer.Option(
+            help=(
+                "Which plot to produce. Default: ``scaling`` for 1 input, "
+                "``compare`` for 2, ``sweep`` for 3+."
+            )
+        ),
+    ] = None,
+    output: Annotated[
+        Path,
+        typer.Option("--output", "-o", help="Where to write the HTML."),
+    ] = Path("benchmark-plot.html"),
+    open_browser: Annotated[
+        bool,
+        typer.Option("--open/--no-open", help="Open the result in a browser."),
+    ] = False,
+) -> None:
+    """
+    Render an interactive HTML plot from one or more snapshots.
+
+    Three views, picked automatically from the snapshot count or set
+    explicitly via ``--view``:
+
+    - **compare** (2 snapshots) — horizontal bar chart of per-test median
+      delta, sorted by magnitude, green→red colormap. The "did this PR
+      regress anything?" picture in one glance.
+    - **sweep** (3+ snapshots) — heatmap of median ratio relative to the
+      first snapshot, rows = tests, columns = snapshot labels. Useful
+      for cross-version sweeps from ``sweep``.
+    - **scaling** (1 snapshot) — log-log median vs ``n`` for
+      size-parametrized tests, faceted by phase. Shows whether linopy's
+      complexity scales as expected.
+
+    Output is an interactive Plotly HTML file. Open it in any browser
+    (or pass ``--open``).
+    """
+    missing = [p for p in snapshots if not p.exists()]
+    if missing:
+        _suggest_snapshots(f"missing snapshots: {[str(p) for p in missing]}")
+        raise typer.Exit(code=2)
+
+    chosen = view or (
+        "scaling"
+        if len(snapshots) == 1
+        else "compare"
+        if len(snapshots) == 2
+        else "sweep"
+    )
+    if chosen == "compare" and len(snapshots) != 2:
+        typer.secho(
+            "compare view needs exactly 2 snapshots", fg=typer.colors.RED, err=True
+        )
+        raise typer.Exit(code=2)
+    if chosen == "scaling" and len(snapshots) != 1:
+        typer.secho(
+            "scaling view takes exactly 1 snapshot", fg=typer.colors.RED, err=True
+        )
+        raise typer.Exit(code=2)
+
+    try:
+        import plotly.express  # noqa: F401
+    except ImportError as exc:
+        typer.secho(
+            "plotly is required for ``plot`` — ``pip install plotly``",
+            fg=typer.colors.RED,
+            err=True,
+        )
+        raise typer.Exit(code=2) from exc
+
+    output.parent.mkdir(parents=True, exist_ok=True)
+    rendered = {
+        "compare": _plot_compare,
+        "sweep": _plot_sweep,
+        "scaling": _plot_scaling,
+    }[chosen](snapshots, output)
+
+    typer.secho(f"{chosen} view: {rendered} tests → {output}", fg=typer.colors.GREEN)
+    if open_browser:
+        import webbrowser
+
+        webbrowser.open(output.resolve().as_uri())
+
+
 # --- Memory subcommands ----------------------------------------------------
 
 
diff --git a/benchmarks/requirements.lock b/benchmarks/requirements.lock
index 40a78e20..72fbea79 100644
--- a/benchmarks/requirements.lock
+++ b/benchmarks/requirements.lock
@@ -235,6 +235,8 @@ mypy==2.1.0
     # via linopy (pyproject.toml)
 mypy-extensions==1.1.0
     # via mypy
+narwhals==2.21.2
+    # via plotly
 nbclient==0.10.4
     # via nbconvert
 nbconvert==7.17.1
@@ -287,6 +289,7 @@ packaging==26.2
     #   jupyterlab
     #   jupyterlab-server
     #   nbconvert
+    #   plotly
     #   pytest
     #   xarray
 pandas==3.0.3
@@ -309,6 +312,8 @@ platformdirs==4.10.0
     #   python-discovery
     #   textual
     #   virtualenv
+plotly==6.7.0
+    # via linopy (pyproject.toml)
 pluggy==1.6.0
     # via
     #   pytest
diff --git a/pyproject.toml b/pyproject.toml
index 5b3bc1cb..ff114e10 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -103,6 +103,7 @@ benchmarks = [
     "netcdf4==1.7.4",
     "nbconvert==7.17.1",
     "typer==0.26.2",
+    "plotly==6.7.0",
 ]
 solvers = [
     "gurobipy",

From c921b780ed9fa9ad10c97354c2169b83f93d8835 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 15:37:32 +0200
Subject: [PATCH 20/68] benchmarks: move plotting to benchmarks/plotting.py +
 text_auto + hover absolutes

- New ``benchmarks/plotting.py`` module owns the three views
  (``plot_compare`` / ``plot_sweep`` / ``plot_scaling``) plus a
  ``RENDERERS`` dispatch dict. cli.py drops ~140 lines and just imports
  ``PlotView`` + ``RENDERERS``; plotly is still lazy-loaded inside the
  view functions so importing the module without plotly works.

- ``compare`` bar chart and ``sweep`` heatmap now use ``text_auto``
  so values render inside each bar / cell.

- Hover info upgraded:
  - compare hover shows the per-test median of *both* snapshots
    (formatted to 4 significant figures) in addition to the delta %.
  - sweep hover shows the absolute median (s) alongside the ratio, via
    a customdata + hovertemplate plumbed through ``update_traces``.

scaling view already shows the absolute median on hover by virtue of
being a line chart.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli.py      | 156 ++---------------------------------
 benchmarks/plotting.py | 179 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 186 insertions(+), 149 deletions(-)
 create mode 100644 benchmarks/plotting.py

diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index 11864c7c..2fe92416 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -29,6 +29,7 @@
 )
 from benchmarks.memory import compare as memory_compare
 from benchmarks.memory import save as memory_save
+from benchmarks.plotting import PlotView
 
 app = typer.Typer(
     help=(
@@ -601,149 +602,6 @@ def compare(ctx: typer.Context) -> None:
 # --- Plotting --------------------------------------------------------------
 
 
-PlotView = Literal["compare", "sweep", "scaling"]
-
-
-def _load_snapshot(path: Path) -> tuple[str, dict[str, float]]:
-    """Return (label, {fullname: median_seconds}) for a pytest-benchmark JSON."""
-    import json
-
-    data = json.loads(path.read_text())
-    medians = {bm["fullname"]: bm["stats"]["median"] for bm in data["benchmarks"]}
-    return path.stem, medians
-
-
-def _plot_compare(snapshots: list[Path], output: Path) -> int:
-    """Bar chart of relative median delta per test, sorted by magnitude."""
-    import pandas as pd
-    import plotly.express as px
-
-    (a_label, a_med), (b_label, b_med) = (
-        _load_snapshot(snapshots[0]),
-        _load_snapshot(snapshots[1]),
-    )
-    common = sorted(set(a_med) & set(b_med))
-    if not common:
-        typer.secho(
-            "no tests in common between the two snapshots",
-            fg=typer.colors.RED,
-            err=True,
-        )
-        raise typer.Exit(code=1)
-
-    rows = [
-        {
-            "test": name,
-            "delta_pct": (b_med[name] - a_med[name]) / a_med[name] * 100.0,
-        }
-        for name in common
-    ]
-    df = pd.DataFrame(rows)
-    df = df.reindex(df["delta_pct"].abs().sort_values(ascending=True).index)
-
-    fig = px.bar(
-        df,
-        x="delta_pct",
-        y="test",
-        orientation="h",
-        color="delta_pct",
-        color_continuous_scale=["green", "white", "red"],
-        color_continuous_midpoint=0,
-        title=f"Median delta: {a_label} → {b_label} (positive = slower)",
-        labels={"delta_pct": "median delta %", "test": ""},
-    )
-    fig.update_layout(height=max(400, len(df) * 14), showlegend=False)
-    fig.write_html(output)
-    return len(df)
-
-
-def _plot_sweep(snapshots: list[Path], output: Path) -> int:
-    """Heatmap of per-test median ratio relative to the first snapshot."""
-    import pandas as pd
-    import plotly.express as px
-
-    loaded = [_load_snapshot(p) for p in snapshots]
-    versions = [label for label, _ in loaded]
-    baseline = loaded[0][1]
-    all_tests = sorted(set().union(*[set(med) for _, med in loaded]))
-
-    matrix: dict[str, list[float | None]] = {}
-    for test in all_tests:
-        base = baseline.get(test)
-        if not base:
-            continue
-        row = []
-        for _, med in loaded:
-            t = med.get(test)
-            row.append(t / base if t else None)
-        matrix[test] = row
-
-    if not matrix:
-        typer.secho(
-            f"no overlap with baseline snapshot {versions[0]}",
-            fg=typer.colors.RED,
-            err=True,
-        )
-        raise typer.Exit(code=1)
-
-    df = pd.DataFrame(matrix, index=versions).T  # rows = tests, cols = versions
-    fig = px.imshow(
-        df,
-        color_continuous_scale=["green", "white", "red"],
-        color_continuous_midpoint=1.0,
-        aspect="auto",
-        title=f"Median ratio relative to baseline ({versions[0]})",
-        labels={"x": "version", "y": "test", "color": "ratio"},
-    )
-    fig.update_layout(height=max(400, len(df) * 14))
-    fig.write_html(output)
-    return len(df)
-
-
-_SIZE_RE = re.compile(r"(.*)\[([^\[\]]+?)-n=(\d+)\]")
-
-
-def _plot_scaling(snapshots: list[Path], output: Path) -> int:
-    """Log-log median vs N for size-parametrized tests, faceted by phase."""
-    import pandas as pd
-    import plotly.express as px
-
-    _, med = _load_snapshot(snapshots[0])
-    rows = []
-    for name, t in med.items():
-        m = _SIZE_RE.match(name)
-        if not m:
-            continue
-        phase_path, model, n = m.groups()
-        phase = phase_path.split("::")[-1]
-        rows.append({"phase": phase, "model": model, "n": int(n), "median": t})
-
-    if not rows:
-        typer.secho(
-            "no size-parametrized tests found (expected ``...[<model>-n=<N>]``)",
-            fg=typer.colors.RED,
-            err=True,
-        )
-        raise typer.Exit(code=1)
-
-    df = pd.DataFrame(rows).sort_values(["phase", "model", "n"])
-    fig = px.line(
-        df,
-        x="n",
-        y="median",
-        color="model",
-        facet_col="phase",
-        facet_col_wrap=3,
-        log_x=True,
-        log_y=True,
-        markers=True,
-        title=f"Scaling: median time vs problem size ({snapshots[0].stem})",
-    )
-    fig.update_layout(height=max(400, ((df["phase"].nunique() + 2) // 3) * 350))
-    fig.write_html(output)
-    return len(df)
-
-
 @app.command()
 def plot(
     snapshots: Annotated[
@@ -811,7 +669,7 @@ def plot(
         raise typer.Exit(code=2)
 
     try:
-        import plotly.express  # noqa: F401
+        from benchmarks.plotting import RENDERERS
     except ImportError as exc:
         typer.secho(
             "plotly is required for ``plot`` — ``pip install plotly``",
@@ -821,11 +679,11 @@ def plot(
         raise typer.Exit(code=2) from exc
 
     output.parent.mkdir(parents=True, exist_ok=True)
-    rendered = {
-        "compare": _plot_compare,
-        "sweep": _plot_sweep,
-        "scaling": _plot_scaling,
-    }[chosen](snapshots, output)
+    try:
+        rendered = RENDERERS[chosen](snapshots, output)
+    except ValueError as exc:
+        typer.secho(str(exc), fg=typer.colors.RED, err=True)
+        raise typer.Exit(code=1) from exc
 
     typer.secho(f"{chosen} view: {rendered} tests → {output}", fg=typer.colors.GREEN)
     if open_browser:
diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
new file mode 100644
index 00000000..c2cfe31e
--- /dev/null
+++ b/benchmarks/plotting.py
@@ -0,0 +1,179 @@
+"""
+Interactive plotly views over pytest-benchmark JSON snapshots.
+
+Three opinionated views, all returning the number of tests rendered:
+
+- :func:`plot_compare` (2 snapshots) — sorted-by-delta bar chart.
+- :func:`plot_sweep` (3+ snapshots) — heatmap of per-test median ratio
+  relative to the first snapshot. Useful for cross-version sweeps.
+- :func:`plot_scaling` (1 snapshot) — log-log median vs ``n`` for
+  size-parametrized tests, faceted by phase.
+
+plotly is imported lazily by the dispatcher so the rest of the benchmark
+suite still works without it.
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from collections.abc import Callable
+from pathlib import Path
+from typing import Literal
+
+PlotView = Literal["compare", "sweep", "scaling"]
+
+_SIZE_RE = re.compile(r"(.*)\[([^\[\]]+?)-n=(\d+)\]")
+
+
+def _load_snapshot(path: Path) -> tuple[str, dict[str, float]]:
+    """Return ``(label, {fullname: median_seconds})`` for one snapshot."""
+    data = json.loads(path.read_text())
+    medians = {bm["fullname"]: bm["stats"]["median"] for bm in data["benchmarks"]}
+    return path.stem, medians
+
+
+def plot_compare(snapshots: list[Path], output: Path) -> int:
+    """Bar chart of relative median delta per test, sorted by magnitude."""
+    import pandas as pd
+    import plotly.express as px
+
+    (a_label, a_med), (b_label, b_med) = (
+        _load_snapshot(snapshots[0]),
+        _load_snapshot(snapshots[1]),
+    )
+    common = sorted(set(a_med) & set(b_med))
+    if not common:
+        raise ValueError("no tests in common between the two snapshots")
+
+    rows = [
+        {
+            "test": name,
+            a_label: a_med[name],
+            b_label: b_med[name],
+            "delta_pct": (b_med[name] - a_med[name]) / a_med[name] * 100.0,
+        }
+        for name in common
+    ]
+    df = pd.DataFrame(rows)
+    df = df.reindex(df["delta_pct"].abs().sort_values(ascending=True).index)
+
+    fig = px.bar(
+        df,
+        x="delta_pct",
+        y="test",
+        orientation="h",
+        color="delta_pct",
+        color_continuous_scale=["green", "white", "red"],
+        color_continuous_midpoint=0,
+        title=f"Median delta: {a_label} → {b_label} (positive = slower)",
+        labels={"delta_pct": "median delta %", "test": ""},
+        text_auto=".1f",
+        hover_data={
+            a_label: ":.4g",
+            b_label: ":.4g",
+            "delta_pct": ":.2f",
+        },
+    )
+    fig.update_layout(height=max(400, len(df) * 14), showlegend=False)
+    fig.write_html(output)
+    return len(df)
+
+
+def plot_sweep(snapshots: list[Path], output: Path) -> int:
+    """Heatmap of per-test median ratio relative to the first snapshot."""
+    import pandas as pd
+    import plotly.express as px
+
+    loaded = [_load_snapshot(p) for p in snapshots]
+    versions = [label for label, _ in loaded]
+    baseline = loaded[0][1]
+    all_tests = sorted(set().union(*[set(med) for _, med in loaded]))
+
+    ratios: dict[str, list[float | None]] = {}
+    absolutes: dict[str, list[float | None]] = {}
+    for test in all_tests:
+        base = baseline.get(test)
+        if not base:
+            continue
+        ratios[test] = []
+        absolutes[test] = []
+        for _, med in loaded:
+            t = med.get(test)
+            ratios[test].append(t / base if t else None)
+            absolutes[test].append(t)
+
+    if not ratios:
+        raise ValueError(f"no overlap with baseline snapshot {versions[0]}")
+
+    df = pd.DataFrame(ratios, index=versions).T  # rows = tests, cols = versions
+    abs_df = pd.DataFrame(absolutes, index=versions).T
+
+    fig = px.imshow(
+        df,
+        color_continuous_scale=["green", "white", "red"],
+        color_continuous_midpoint=1.0,
+        aspect="auto",
+        title=f"Median ratio relative to baseline ({versions[0]})",
+        labels={"x": "version", "y": "test", "color": "ratio"},
+        text_auto=".2f",
+    )
+    # Inject absolute medians as customdata so hover shows both.
+    fig.update_traces(
+        customdata=abs_df.values,
+        hovertemplate=(
+            "test: %{y}<br>"
+            "version: %{x}<br>"
+            "ratio: %{z:.3f}<br>"
+            "median: %{customdata:.4g}s"
+            "<extra></extra>"
+        ),
+    )
+    fig.update_layout(height=max(400, len(df) * 14))
+    fig.write_html(output)
+    return len(df)
+
+
+def plot_scaling(snapshots: list[Path], output: Path) -> int:
+    """Log-log median vs N for size-parametrized tests, faceted by phase."""
+    import pandas as pd
+    import plotly.express as px
+
+    _, med = _load_snapshot(snapshots[0])
+    rows = []
+    for name, t in med.items():
+        m = _SIZE_RE.match(name)
+        if not m:
+            continue
+        phase_path, model, n = m.groups()
+        phase = phase_path.split("::")[-1]
+        rows.append({"phase": phase, "model": model, "n": int(n), "median": t})
+
+    if not rows:
+        raise ValueError(
+            "no size-parametrized tests found (expected ``...[<model>-n=<N>]``)"
+        )
+
+    df = pd.DataFrame(rows).sort_values(["phase", "model", "n"])
+    fig = px.line(
+        df,
+        x="n",
+        y="median",
+        color="model",
+        facet_col="phase",
+        facet_col_wrap=3,
+        log_x=True,
+        log_y=True,
+        markers=True,
+        title=f"Scaling: median time vs problem size ({snapshots[0].stem})",
+    )
+    fig.update_layout(height=max(400, ((df["phase"].nunique() + 2) // 3) * 350))
+    fig.write_html(output)
+    return len(df)
+
+
+RENDERERS: dict[PlotView, Callable[[list[Path], Path], int]] = {
+    "compare": plot_compare,
+    "sweep": plot_sweep,
+    "scaling": plot_scaling,
+}

From 4c6f328fd313140e9d7b537f8f3f3070ae17d2b9 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 15:52:23 +0200
Subject: [PATCH 21/68] benchmarks: switch primary metric to ``min``, allow
 ``--metric`` override
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For microbenchmarks the lowest observed time is closest to the "true"
cost — background noise (GC, context switches, thermal throttling) can
only slow things down. pytest-benchmark's own ``--sort`` default is
``min`` for the same reason; LLVM's perf guide, JMH, Google Benchmark
and Alexandrescu's "Speed is found in the minds of people" all argue
similarly.

Changes:

- ``plot`` defaults to ``--metric min`` (was median). Accepts
  ``--metric median|mean|max`` to override. The metric drives the bar
  values, heatmap ratios, scaling-curve y-axis, and the hover labels.
- ``plot_compare`` / ``plot_sweep`` / ``plot_scaling`` in
  ``benchmarks/plotting.py`` all take a ``metric: Metric = "min"`` arg.
- ``compare`` table defaults to ``--columns=min,iqr --sort=min`` (was
  median,iqr / name). The auto-ratios next to each ``min`` flag
  regressions in the same readable form.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli.py      | 26 +++++++++++----
 benchmarks/plotting.py | 75 +++++++++++++++++++++++-------------------
 2 files changed, 61 insertions(+), 40 deletions(-)

diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index 2fe92416..7d67c460 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -29,7 +29,7 @@
 )
 from benchmarks.memory import compare as memory_compare
 from benchmarks.memory import save as memory_save
-from benchmarks.plotting import PlotView
+from benchmarks.plotting import Metric, PlotView
 
 app = typer.Typer(
     help=(
@@ -576,12 +576,13 @@ def compare(ctx: typer.Context) -> None:
     # Sensible defaults — pytest-benchmark's defaults emit 10 columns wide,
     # grouped by parametrize group, which is unreadable for two-snapshot diffs.
     # ``--group-by=fullname`` puts each test's (baseline, candidate) rows in
-    # their own mini-table; ``--columns=median,iqr`` keeps it narrow.
+    # their own mini-table; ``--columns=min,iqr`` shows the lowest observed
+    # time (closest to "true" cost for microbenchmarks) plus the spread.
     # Each default is only applied if the user didn't override it.
     if not any(a.startswith("--columns") for a in extra):
-        extra.insert(0, "--columns=median,iqr")
+        extra.insert(0, "--columns=min,iqr")
     if not any(a.startswith("--sort") for a in extra):
-        extra.insert(0, "--sort=name")
+        extra.insert(0, "--sort=min")
     if not any(a.startswith("--group-by") for a in extra):
         extra.insert(0, "--group-by=fullname")
 
@@ -617,6 +618,16 @@ def plot(
             )
         ),
     ] = None,
+    metric: Annotated[
+        Metric,
+        typer.Option(
+            help=(
+                "Stat to drive the plot. ``min`` (default) is closest to "
+                "the 'true' cost — noise can only slow things down. ``median``"
+                " is more robust to a single fast warmup round."
+            )
+        ),
+    ] = "min",
     output: Annotated[
         Path,
         typer.Option("--output", "-o", help="Where to write the HTML."),
@@ -680,12 +691,15 @@ def plot(
 
     output.parent.mkdir(parents=True, exist_ok=True)
     try:
-        rendered = RENDERERS[chosen](snapshots, output)
+        rendered = RENDERERS[chosen](snapshots, output, metric)
     except ValueError as exc:
         typer.secho(str(exc), fg=typer.colors.RED, err=True)
         raise typer.Exit(code=1) from exc
 
-    typer.secho(f"{chosen} view: {rendered} tests → {output}", fg=typer.colors.GREEN)
+    typer.secho(
+        f"{chosen} view ({metric}): {rendered} tests → {output}",
+        fg=typer.colors.GREEN,
+    )
     if open_browser:
         import webbrowser
 
diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
index c2cfe31e..805306c4 100644
--- a/benchmarks/plotting.py
+++ b/benchmarks/plotting.py
@@ -4,11 +4,17 @@
 Three opinionated views, all returning the number of tests rendered:
 
 - :func:`plot_compare` (2 snapshots) — sorted-by-delta bar chart.
-- :func:`plot_sweep` (3+ snapshots) — heatmap of per-test median ratio
+- :func:`plot_sweep` (3+ snapshots) — heatmap of per-test ratio
   relative to the first snapshot. Useful for cross-version sweeps.
-- :func:`plot_scaling` (1 snapshot) — log-log median vs ``n`` for
+- :func:`plot_scaling` (1 snapshot) — log-log time vs ``n`` for
   size-parametrized tests, faceted by phase.
 
+All three accept a ``metric`` argument selecting which pytest-benchmark
+stat drives the plot. Default is ``min`` — for microbenchmarks the
+lowest observed time is closest to the "true" cost (noise can only slow
+things down). ``median`` is more robust to a single weirdly-fast warmup
+round; ``mean`` and ``max`` are also accepted.
+
 plotly is imported lazily by the dispatcher so the rest of the benchmark
 suite still works without it.
 """
@@ -22,36 +28,37 @@
 from typing import Literal
 
 PlotView = Literal["compare", "sweep", "scaling"]
+Metric = Literal["min", "median", "mean", "max"]
 
 _SIZE_RE = re.compile(r"(.*)\[([^\[\]]+?)-n=(\d+)\]")
 
 
-def _load_snapshot(path: Path) -> tuple[str, dict[str, float]]:
-    """Return ``(label, {fullname: median_seconds})`` for one snapshot."""
+def _load_snapshot(path: Path, metric: Metric = "min") -> tuple[str, dict[str, float]]:
+    """Return ``(label, {fullname: <metric>_seconds})`` for one snapshot."""
     data = json.loads(path.read_text())
-    medians = {bm["fullname"]: bm["stats"]["median"] for bm in data["benchmarks"]}
-    return path.stem, medians
+    values = {bm["fullname"]: bm["stats"][metric] for bm in data["benchmarks"]}
+    return path.stem, values
 
 
-def plot_compare(snapshots: list[Path], output: Path) -> int:
-    """Bar chart of relative median delta per test, sorted by magnitude."""
+def plot_compare(snapshots: list[Path], output: Path, metric: Metric = "min") -> int:
+    """Bar chart of relative delta per test, sorted by magnitude."""
     import pandas as pd
     import plotly.express as px
 
-    (a_label, a_med), (b_label, b_med) = (
-        _load_snapshot(snapshots[0]),
-        _load_snapshot(snapshots[1]),
+    (a_label, a_vals), (b_label, b_vals) = (
+        _load_snapshot(snapshots[0], metric),
+        _load_snapshot(snapshots[1], metric),
     )
-    common = sorted(set(a_med) & set(b_med))
+    common = sorted(set(a_vals) & set(b_vals))
     if not common:
         raise ValueError("no tests in common between the two snapshots")
 
     rows = [
         {
             "test": name,
-            a_label: a_med[name],
-            b_label: b_med[name],
-            "delta_pct": (b_med[name] - a_med[name]) / a_med[name] * 100.0,
+            a_label: a_vals[name],
+            b_label: b_vals[name],
+            "delta_pct": (b_vals[name] - a_vals[name]) / a_vals[name] * 100.0,
         }
         for name in common
     ]
@@ -66,8 +73,8 @@ def plot_compare(snapshots: list[Path], output: Path) -> int:
         color="delta_pct",
         color_continuous_scale=["green", "white", "red"],
         color_continuous_midpoint=0,
-        title=f"Median delta: {a_label} → {b_label} (positive = slower)",
-        labels={"delta_pct": "median delta %", "test": ""},
+        title=f"{metric} delta: {a_label} → {b_label} (positive = slower)",
+        labels={"delta_pct": f"{metric} delta %", "test": ""},
         text_auto=".1f",
         hover_data={
             a_label: ":.4g",
@@ -80,15 +87,15 @@ def plot_compare(snapshots: list[Path], output: Path) -> int:
     return len(df)
 
 
-def plot_sweep(snapshots: list[Path], output: Path) -> int:
-    """Heatmap of per-test median ratio relative to the first snapshot."""
+def plot_sweep(snapshots: list[Path], output: Path, metric: Metric = "min") -> int:
+    """Heatmap of per-test ratio relative to the first snapshot."""
     import pandas as pd
     import plotly.express as px
 
-    loaded = [_load_snapshot(p) for p in snapshots]
+    loaded = [_load_snapshot(p, metric) for p in snapshots]
     versions = [label for label, _ in loaded]
     baseline = loaded[0][1]
-    all_tests = sorted(set().union(*[set(med) for _, med in loaded]))
+    all_tests = sorted(set().union(*[set(vals) for _, vals in loaded]))
 
     ratios: dict[str, list[float | None]] = {}
     absolutes: dict[str, list[float | None]] = {}
@@ -98,8 +105,8 @@ def plot_sweep(snapshots: list[Path], output: Path) -> int:
             continue
         ratios[test] = []
         absolutes[test] = []
-        for _, med in loaded:
-            t = med.get(test)
+        for _, vals in loaded:
+            t = vals.get(test)
             ratios[test].append(t / base if t else None)
             absolutes[test].append(t)
 
@@ -114,18 +121,18 @@ def plot_sweep(snapshots: list[Path], output: Path) -> int:
         color_continuous_scale=["green", "white", "red"],
         color_continuous_midpoint=1.0,
         aspect="auto",
-        title=f"Median ratio relative to baseline ({versions[0]})",
+        title=f"{metric} ratio relative to baseline ({versions[0]})",
         labels={"x": "version", "y": "test", "color": "ratio"},
         text_auto=".2f",
     )
-    # Inject absolute medians as customdata so hover shows both.
+    # Inject absolute values as customdata so hover shows both.
     fig.update_traces(
         customdata=abs_df.values,
         hovertemplate=(
             "test: %{y}<br>"
             "version: %{x}<br>"
             "ratio: %{z:.3f}<br>"
-            "median: %{customdata:.4g}s"
+            f"{metric}: %{{customdata:.4g}}s"
             "<extra></extra>"
         ),
     )
@@ -134,20 +141,20 @@ def plot_sweep(snapshots: list[Path], output: Path) -> int:
     return len(df)
 
 
-def plot_scaling(snapshots: list[Path], output: Path) -> int:
-    """Log-log median vs N for size-parametrized tests, faceted by phase."""
+def plot_scaling(snapshots: list[Path], output: Path, metric: Metric = "min") -> int:
+    """Log-log time vs N for size-parametrized tests, faceted by phase."""
     import pandas as pd
     import plotly.express as px
 
-    _, med = _load_snapshot(snapshots[0])
+    _, vals = _load_snapshot(snapshots[0], metric)
     rows = []
-    for name, t in med.items():
+    for name, t in vals.items():
         m = _SIZE_RE.match(name)
         if not m:
             continue
         phase_path, model, n = m.groups()
         phase = phase_path.split("::")[-1]
-        rows.append({"phase": phase, "model": model, "n": int(n), "median": t})
+        rows.append({"phase": phase, "model": model, "n": int(n), metric: t})
 
     if not rows:
         raise ValueError(
@@ -158,21 +165,21 @@ def plot_scaling(snapshots: list[Path], output: Path) -> int:
     fig = px.line(
         df,
         x="n",
-        y="median",
+        y=metric,
         color="model",
         facet_col="phase",
         facet_col_wrap=3,
         log_x=True,
         log_y=True,
         markers=True,
-        title=f"Scaling: median time vs problem size ({snapshots[0].stem})",
+        title=f"Scaling: {metric} time vs problem size ({snapshots[0].stem})",
     )
     fig.update_layout(height=max(400, ((df["phase"].nunique() + 2) // 3) * 350))
     fig.write_html(output)
     return len(df)
 
 
-RENDERERS: dict[PlotView, Callable[[list[Path], Path], int]] = {
+RENDERERS: dict[PlotView, Callable[[list[Path], Path, Metric], int]] = {
     "compare": plot_compare,
     "sweep": plot_sweep,
     "scaling": plot_scaling,

From d703cb12a40ffac45864664d60426bdbb252a67a Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 16:04:19 +0200
Subject: [PATCH 22/68] benchmarks: plot compare sorts/bars by absolute time
 delta by default
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For a suite where test costs span six orders of magnitude (knapsack
microsecond builds vs PyPSA carbon at 2.4 s), sorting by % delta
overweights cheap tests — a 100% regression on a 1µs test ranks above
a 1% regression on a 2.4s test, but the absolute impacts are 1µs vs
24ms.

Changes:

- Default sort is now ``absolute`` (``b - a`` in seconds). Bar values
  are the time delta with SI-prefix formatting on the x-axis (24 ms,
  240 µs, etc.). Big actual-time impacts float to the bottom.
- ``--sort relative`` keeps the old percent behaviour.
- Both ``delta_abs`` and ``delta_pct`` are surfaced in hover regardless
  of which one drives the sort, so you can read off whichever lens.
- ``plot_sweep`` / ``plot_scaling`` accept a ``sort`` arg for uniform
  signature but ignore it (no two-snapshot diff there).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli.py      | 15 +++++++++--
 benchmarks/plotting.py | 60 ++++++++++++++++++++++++++++++++++--------
 2 files changed, 62 insertions(+), 13 deletions(-)

diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index 7d67c460..008bef79 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -29,7 +29,7 @@
 )
 from benchmarks.memory import compare as memory_compare
 from benchmarks.memory import save as memory_save
-from benchmarks.plotting import Metric, PlotView
+from benchmarks.plotting import Metric, PlotView, SortMode
 
 app = typer.Typer(
     help=(
@@ -628,6 +628,17 @@ def plot(
             )
         ),
     ] = "min",
+    sort: Annotated[
+        SortMode,
+        typer.Option(
+            help=(
+                "Compare-view sort and bar dimension. ``absolute`` (default) "
+                "uses ``b - a`` in seconds so the biggest actual-time impacts "
+                "float to the bottom — avoids over-weighting cheap "
+                "microsecond tests. ``relative`` uses percent change."
+            )
+        ),
+    ] = "absolute",
     output: Annotated[
         Path,
         typer.Option("--output", "-o", help="Where to write the HTML."),
@@ -691,7 +702,7 @@ def plot(
 
     output.parent.mkdir(parents=True, exist_ok=True)
     try:
-        rendered = RENDERERS[chosen](snapshots, output, metric)
+        rendered = RENDERERS[chosen](snapshots, output, metric, sort)
     except ValueError as exc:
         typer.secho(str(exc), fg=typer.colors.RED, err=True)
         raise typer.Exit(code=1) from exc
diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
index 805306c4..69404670 100644
--- a/benchmarks/plotting.py
+++ b/benchmarks/plotting.py
@@ -29,6 +29,7 @@
 
 PlotView = Literal["compare", "sweep", "scaling"]
 Metric = Literal["min", "median", "mean", "max"]
+SortMode = Literal["absolute", "relative"]
 
 _SIZE_RE = re.compile(r"(.*)\[([^\[\]]+?)-n=(\d+)\]")
 
@@ -40,8 +41,22 @@ def _load_snapshot(path: Path, metric: Metric = "min") -> tuple[str, dict[str, f
     return path.stem, values
 
 
-def plot_compare(snapshots: list[Path], output: Path, metric: Metric = "min") -> int:
-    """Bar chart of relative delta per test, sorted by magnitude."""
+def plot_compare(
+    snapshots: list[Path],
+    output: Path,
+    metric: Metric = "min",
+    sort: SortMode = "absolute",
+) -> int:
+    """
+    Bar chart of delta per test, sorted by magnitude.
+
+    ``sort="absolute"`` (default): bar = (b - a) seconds, sort by the
+    largest actual time impact. Best for "what change actually affected
+    total runtime?" — avoids over-weighting cheap microsecond tests.
+
+    ``sort="relative"``: bar = (b/a - 1) * 100 %, sort by the largest
+    proportional change. Best for "what got proportionally worse?".
+    """
     import pandas as pd
     import plotly.express as px
 
@@ -58,36 +73,54 @@ def plot_compare(snapshots: list[Path], output: Path, metric: Metric = "min") ->
             "test": name,
             a_label: a_vals[name],
             b_label: b_vals[name],
+            "delta_abs": b_vals[name] - a_vals[name],
             "delta_pct": (b_vals[name] - a_vals[name]) / a_vals[name] * 100.0,
         }
         for name in common
     ]
     df = pd.DataFrame(rows)
-    df = df.reindex(df["delta_pct"].abs().sort_values(ascending=True).index)
+    x_col = "delta_abs" if sort == "absolute" else "delta_pct"
+    df = df.reindex(df[x_col].abs().sort_values(ascending=True).index)
+
+    if sort == "absolute":
+        x_label = f"{metric} delta (s)"
+        text_fmt = ".2s"
+    else:
+        x_label = f"{metric} delta %"
+        text_fmt = ".1f"
 
     fig = px.bar(
         df,
-        x="delta_pct",
+        x=x_col,
         y="test",
         orientation="h",
-        color="delta_pct",
+        color=x_col,
         color_continuous_scale=["green", "white", "red"],
         color_continuous_midpoint=0,
-        title=f"{metric} delta: {a_label} → {b_label} (positive = slower)",
-        labels={"delta_pct": f"{metric} delta %", "test": ""},
-        text_auto=".1f",
+        title=f"{metric} delta ({sort}): {a_label} → {b_label} (positive = slower)",
+        labels={x_col: x_label, "test": ""},
+        text_auto=text_fmt,
         hover_data={
             a_label: ":.4g",
             b_label: ":.4g",
+            "delta_abs": ":.4g",
             "delta_pct": ":.2f",
         },
     )
+    if sort == "absolute":
+        # SI-prefixed time on the x-axis (e.g. 24 ms, 2.4 ms, 240 µs).
+        fig.update_xaxes(tickformat=".2s", ticksuffix="s")
     fig.update_layout(height=max(400, len(df) * 14), showlegend=False)
     fig.write_html(output)
     return len(df)
 
 
-def plot_sweep(snapshots: list[Path], output: Path, metric: Metric = "min") -> int:
+def plot_sweep(
+    snapshots: list[Path],
+    output: Path,
+    metric: Metric = "min",
+    sort: SortMode = "absolute",  # noqa: ARG001  (uniform signature, unused here)
+) -> int:
     """Heatmap of per-test ratio relative to the first snapshot."""
     import pandas as pd
     import plotly.express as px
@@ -141,7 +174,12 @@ def plot_sweep(snapshots: list[Path], output: Path, metric: Metric = "min") -> i
     return len(df)
 
 
-def plot_scaling(snapshots: list[Path], output: Path, metric: Metric = "min") -> int:
+def plot_scaling(
+    snapshots: list[Path],
+    output: Path,
+    metric: Metric = "min",
+    sort: SortMode = "absolute",  # noqa: ARG001  (uniform signature, unused here)
+) -> int:
     """Log-log time vs N for size-parametrized tests, faceted by phase."""
     import pandas as pd
     import plotly.express as px
@@ -179,7 +217,7 @@ def plot_scaling(snapshots: list[Path], output: Path, metric: Metric = "min") ->
     return len(df)
 
 
-RENDERERS: dict[PlotView, Callable[[list[Path], Path, Metric], int]] = {
+RENDERERS: dict[PlotView, Callable[[list[Path], Path, Metric, SortMode], int]] = {
     "compare": plot_compare,
     "sweep": plot_sweep,
     "scaling": plot_scaling,

From 69693c04c7aeed5025325fe277c22b71442a6528 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 16:07:57 +0200
Subject: [PATCH 23/68] benchmarks: add ``scatter`` plot view for two-snapshot
 exploration
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The compare bar chart forces a choice between sorting by relative %
or absolute Δ. Both have blind spots: pure-relative makes microbenchmark
noise look catastrophic, pure-absolute hides real algorithmic regressions
on fast paths.

The two-axis scatter resolves the tension visually. Per test:

- x = baseline time (log scale)
- y = candidate / baseline ratio
- colour = absolute Δ

A point is a real regression worth chasing only when it sits in the
top-right — slow tests that got slower. Top-left (high ratio, tiny
absolute) reads as microbenchmark noise; bottom-right (high absolute,
ratio ≈ 1) was already slow and didn't change. A dashed reference line
at ``y=1`` makes "no change" trivial to see.

The view is never auto-picked (compare still wins for 2 snapshots);
pass ``--view scatter`` explicitly to get it.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli.py      | 33 +++++++++-------
 benchmarks/plotting.py | 89 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 107 insertions(+), 15 deletions(-)

diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index 008bef79..51ccb1ac 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -651,18 +651,21 @@ def plot(
     """
     Render an interactive HTML plot from one or more snapshots.
 
-    Three views, picked automatically from the snapshot count or set
-    explicitly via ``--view``:
-
-    - **compare** (2 snapshots) — horizontal bar chart of per-test median
-      delta, sorted by magnitude, green→red colormap. The "did this PR
-      regress anything?" picture in one glance.
-    - **sweep** (3+ snapshots) — heatmap of median ratio relative to the
-      first snapshot, rows = tests, columns = snapshot labels. Useful
-      for cross-version sweeps from ``sweep``.
-    - **scaling** (1 snapshot) — log-log median vs ``n`` for
-      size-parametrized tests, faceted by phase. Shows whether linopy's
-      complexity scales as expected.
+    Four views, picked automatically from the snapshot count (compare
+    for 2, sweep for 3+, scaling for 1) or set explicitly via ``--view``:
+
+    - **compare** (2 snapshots) — horizontal bar chart of per-test delta,
+      sorted by magnitude. The "did this PR regress anything?" picture.
+    - **scatter** (2 snapshots) — exploratory two-axis plot: baseline
+      cost on log-x, ratio on y, absolute Δ encoded in colour. Tests
+      in the top-right are the real regressions (slow tests that got
+      slower); top-left = noisy microbenchmarks; bottom-right =
+      already-slow-but-unchanged. Resolves the absolute-vs-relative
+      tension visually.
+    - **sweep** (3+ snapshots) — heatmap of ratio relative to the first
+      snapshot, rows = tests, columns = snapshot labels.
+    - **scaling** (1 snapshot) — log-log time vs ``n`` for
+      size-parametrized tests, faceted by phase.
 
     Output is an interactive Plotly HTML file. Open it in any browser
     (or pass ``--open``).
@@ -679,9 +682,11 @@ def plot(
         if len(snapshots) == 2
         else "sweep"
     )
-    if chosen == "compare" and len(snapshots) != 2:
+    if chosen in ("compare", "scatter") and len(snapshots) != 2:
         typer.secho(
-            "compare view needs exactly 2 snapshots", fg=typer.colors.RED, err=True
+            f"{chosen} view needs exactly 2 snapshots",
+            fg=typer.colors.RED,
+            err=True,
         )
         raise typer.Exit(code=2)
     if chosen == "scaling" and len(snapshots) != 1:
diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
index 69404670..ba4ea445 100644
--- a/benchmarks/plotting.py
+++ b/benchmarks/plotting.py
@@ -27,7 +27,7 @@
 from pathlib import Path
 from typing import Literal
 
-PlotView = Literal["compare", "sweep", "scaling"]
+PlotView = Literal["compare", "scatter", "sweep", "scaling"]
 Metric = Literal["min", "median", "mean", "max"]
 SortMode = Literal["absolute", "relative"]
 
@@ -115,6 +115,92 @@ def plot_compare(
     return len(df)
 
 
+def plot_scatter(
+    snapshots: list[Path],
+    output: Path,
+    metric: Metric = "min",
+    sort: SortMode = "absolute",  # noqa: ARG001  (uniform signature, unused here)
+) -> int:
+    """
+    Two-axis scatter — baseline cost on log-x, ratio on y.
+
+    Designed as the single best exploratory plot for regression hunting
+    across tests of wildly different magnitudes: a point lights up as
+    "fix this" only if it sits in the top-right corner — slow tests
+    that got slower. Top-left (big ratio, tiny absolute) reads as
+    microbenchmark noise; bottom-right (big absolute, tiny ratio) is
+    already-slow-but-unchanged. The combined position resolves the
+    tension that pure relative or pure absolute sort each blind-spot.
+
+    A horizontal reference at ``ratio = 1`` makes "no change" trivial
+    to see; the colour encodes absolute Δ as a third channel.
+    """
+    import pandas as pd
+    import plotly.express as px
+
+    (a_label, a_vals), (b_label, b_vals) = (
+        _load_snapshot(snapshots[0], metric),
+        _load_snapshot(snapshots[1], metric),
+    )
+    common = sorted(set(a_vals) & set(b_vals))
+    if not common:
+        raise ValueError("no tests in common between the two snapshots")
+
+    rows = []
+    for name in common:
+        a, b = a_vals[name], b_vals[name]
+        if a <= 0:
+            continue
+        rows.append(
+            {
+                "test": name,
+                "baseline_time": a,
+                "ratio": b / a,
+                "delta_abs": b - a,
+                "delta_pct": (b - a) / a * 100.0,
+                a_label: a,
+                b_label: b,
+            }
+        )
+
+    df = pd.DataFrame(rows)
+    fig = px.scatter(
+        df,
+        x="baseline_time",
+        y="ratio",
+        color="delta_abs",
+        color_continuous_scale=["green", "white", "red"],
+        color_continuous_midpoint=0,
+        log_x=True,
+        hover_name="test",
+        hover_data={
+            a_label: ":.4g",
+            b_label: ":.4g",
+            "delta_abs": ":.4g",
+            "delta_pct": ":.2f",
+            "ratio": ":.3f",
+            "baseline_time": ":.4g",
+        },
+        title=(
+            f"{metric} scatter: {a_label} → {b_label} "
+            "(top-right = slow tests that got slower)"
+        ),
+        labels={
+            "baseline_time": f"baseline {metric} (s, log scale)",
+            "ratio": f"{metric} ratio  (candidate / baseline)",
+            "delta_abs": "Δ (s)",
+        },
+    )
+    # Reference line at ratio == 1 (no change).
+    fig.add_hline(
+        y=1.0, line_dash="dash", line_color="grey", annotation_text="no change"
+    )
+    fig.update_traces(marker=dict(size=8, line=dict(width=0.5, color="DarkSlateGrey")))
+    fig.update_layout(height=600)
+    fig.write_html(output)
+    return len(df)
+
+
 def plot_sweep(
     snapshots: list[Path],
     output: Path,
@@ -219,6 +305,7 @@ def plot_scaling(
 
 RENDERERS: dict[PlotView, Callable[[list[Path], Path, Metric, SortMode], int]] = {
     "compare": plot_compare,
+    "scatter": plot_scatter,
     "sweep": plot_sweep,
     "scaling": plot_scaling,
 }

From 2f08aa6058e837aae9df872f2b1dd6cb61106cc3 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 16:12:44 +0200
Subject: [PATCH 24/68] benchmarks: scatter view handles N snapshots via plotly
 animation_frame
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The two-axis scatter now scales beyond a single baseline-vs-candidate
pair. With N>=3 inputs the first is still the baseline (reference); each
subsequent snapshot becomes one animation frame. Use the slider / play
button to scrub through versions and watch tests drift across releases.

Implementation:

- First snapshot is the baseline. Skipped from the frame set (would
  trivially be y=1 everywhere).
- Each subsequent snapshot contributes points at (baseline_time,
  ratio, Δ) per overlapping test. ``animation_frame="version"`` does
  the per-frame slicing; ``category_orders`` preserves input order in
  the slider so the timeline reads left→right.
- ``range_x`` / ``range_y`` are pinned to the global min/max so the
  camera doesn't jump between frames.
- 2 inputs still produces a static scatter (no animation overhead).

Considered ``facet_col`` but it gets cramped past ~4 versions — the
slider scales to arbitrary length.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli.py      |  9 +++--
 benchmarks/plotting.py | 81 ++++++++++++++++++++++++++++--------------
 2 files changed, 61 insertions(+), 29 deletions(-)

diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index 51ccb1ac..510b396a 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -682,9 +682,14 @@ def plot(
         if len(snapshots) == 2
         else "sweep"
     )
-    if chosen in ("compare", "scatter") and len(snapshots) != 2:
+    if chosen == "compare" and len(snapshots) != 2:
         typer.secho(
-            f"{chosen} view needs exactly 2 snapshots",
+            "compare view needs exactly 2 snapshots", fg=typer.colors.RED, err=True
+        )
+        raise typer.Exit(code=2)
+    if chosen == "scatter" and len(snapshots) < 2:
+        typer.secho(
+            "scatter view needs at least 2 snapshots (baseline + 1)",
             fg=typer.colors.RED,
             err=True,
         )
diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
index ba4ea445..3f904391 100644
--- a/benchmarks/plotting.py
+++ b/benchmarks/plotting.py
@@ -132,38 +132,62 @@ def plot_scatter(
     already-slow-but-unchanged. The combined position resolves the
     tension that pure relative or pure absolute sort each blind-spot.
 
+    The first snapshot is the baseline. With 2 snapshots, a static
+    scatter is drawn; with 3+, every subsequent snapshot becomes an
+    ``animation_frame`` — use the slider / play button to step through
+    versions and watch points drift across releases.
+
     A horizontal reference at ``ratio = 1`` makes "no change" trivial
     to see; the colour encodes absolute Δ as a third channel.
     """
     import pandas as pd
     import plotly.express as px
 
-    (a_label, a_vals), (b_label, b_vals) = (
-        _load_snapshot(snapshots[0], metric),
-        _load_snapshot(snapshots[1], metric),
-    )
-    common = sorted(set(a_vals) & set(b_vals))
-    if not common:
-        raise ValueError("no tests in common between the two snapshots")
+    if len(snapshots) < 2:
+        raise ValueError("scatter needs at least 2 snapshots (baseline + 1)")
+
+    loaded = [_load_snapshot(p, metric) for p in snapshots]
+    baseline_label, baseline_vals = loaded[0]
+    others = loaded[1:]
 
     rows = []
-    for name in common:
-        a, b = a_vals[name], b_vals[name]
-        if a <= 0:
-            continue
-        rows.append(
-            {
-                "test": name,
-                "baseline_time": a,
-                "ratio": b / a,
-                "delta_abs": b - a,
-                "delta_pct": (b - a) / a * 100.0,
-                a_label: a,
-                b_label: b,
-            }
+    for label, vals in others:
+        common = sorted(set(baseline_vals) & set(vals))
+        for name in common:
+            a, b = baseline_vals[name], vals[name]
+            if a <= 0:
+                continue
+            rows.append(
+                {
+                    "test": name,
+                    "version": label,
+                    "baseline_time": a,
+                    "candidate_time": b,
+                    "ratio": b / a,
+                    "delta_abs": b - a,
+                    "delta_pct": (b - a) / a * 100.0,
+                }
+            )
+
+    if not rows:
+        raise ValueError(
+            f"no tests in common between baseline ({baseline_label}) "
+            "and any of the other snapshots"
         )
 
     df = pd.DataFrame(rows)
+    # Fix the axis ranges so the animation doesn't jitter; pad by a small
+    # margin so points on the edges aren't clipped.
+    x_lo, x_hi = df["baseline_time"].min(), df["baseline_time"].max()
+    y_lo, y_hi = df["ratio"].min(), df["ratio"].max()
+    pad_y = max(0.05, (y_hi - y_lo) * 0.05)
+
+    animate = len(others) >= 2
+    extra: dict = {}
+    if animate:
+        extra["animation_frame"] = "version"
+        extra["category_orders"] = {"version": [label for label, _ in others]}
+
     fig = px.scatter(
         df,
         x="baseline_time",
@@ -172,26 +196,29 @@ def plot_scatter(
         color_continuous_scale=["green", "white", "red"],
         color_continuous_midpoint=0,
         log_x=True,
+        range_x=[x_lo * 0.5, x_hi * 2],
+        range_y=[y_lo - pad_y, y_hi + pad_y],
         hover_name="test",
         hover_data={
-            a_label: ":.4g",
-            b_label: ":.4g",
+            "baseline_time": ":.4g",
+            "candidate_time": ":.4g",
             "delta_abs": ":.4g",
             "delta_pct": ":.2f",
             "ratio": ":.3f",
-            "baseline_time": ":.4g",
+            "version": True,
         },
         title=(
-            f"{metric} scatter: {a_label} → {b_label} "
-            "(top-right = slow tests that got slower)"
+            f"{metric} scatter vs baseline ({baseline_label}) — "
+            "top-right = slow tests that got slower"
         ),
         labels={
             "baseline_time": f"baseline {metric} (s, log scale)",
             "ratio": f"{metric} ratio  (candidate / baseline)",
+            "candidate_time": "candidate",
             "delta_abs": "Δ (s)",
         },
+        **extra,
     )
-    # Reference line at ratio == 1 (no change).
     fig.add_hline(
         y=1.0, line_dash="dash", line_color="grey", annotation_text="no change"
     )

From 321d2d96caa50703d840ac6aff6975968e2868d1 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 16:17:10 +0200
Subject: [PATCH 25/68] =?UTF-8?q?benchmarks:=20scatter=20=E2=80=94=20inclu?=
 =?UTF-8?q?de=20baseline=20as=20frame=200,=20clip=20colour=20to=20p95=20|?=
 =?UTF-8?q?=CE=94|?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two small but high-value tweaks for the multi-snapshot scatter:

- The baseline snapshot now contributes its own animation frame where
  every point sits at ratio=1, Δ=0. Gives the animation a "before
  anything happened" anchor: hit play and watch points drift from the
  baseline horizon outward. Previously the first frame was the first
  candidate, which made the visual feel as if it started mid-story.

- ``range_color`` is pinned to the 95th-percentile absolute Δ
  (±p95). One huge outlier no longer drags the colour scale and
  flattens everyone else to white; outliers saturate at the bound,
  the rest of the distribution stays readable. Colour-bar label notes
  ``Δ (s, p95-clipped)`` so the convention is explicit.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/plotting.py | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
index 3f904391..417f7a43 100644
--- a/benchmarks/plotting.py
+++ b/benchmarks/plotting.py
@@ -140,6 +140,7 @@ def plot_scatter(
     A horizontal reference at ``ratio = 1`` makes "no change" trivial
     to see; the colour encodes absolute Δ as a third channel.
     """
+    import numpy as np
     import pandas as pd
     import plotly.express as px
 
@@ -148,10 +149,12 @@ def plot_scatter(
 
     loaded = [_load_snapshot(p, metric) for p in snapshots]
     baseline_label, baseline_vals = loaded[0]
-    others = loaded[1:]
 
+    # Include the baseline itself as the first animation frame (all points
+    # at ratio=1, Δ=0). Gives the animation a "before anything happened"
+    # anchor and makes the visual drift across frames easier to read.
     rows = []
-    for label, vals in others:
+    for label, vals in loaded:
         common = sorted(set(baseline_vals) & set(vals))
         for name in common:
             a, b = baseline_vals[name], vals[name]
@@ -182,11 +185,18 @@ def plot_scatter(
     y_lo, y_hi = df["ratio"].min(), df["ratio"].max()
     pad_y = max(0.05, (y_hi - y_lo) * 0.05)
 
-    animate = len(others) >= 2
+    # Clip the colour scale to the 95th-percentile absolute Δ so a single
+    # huge regression doesn't wash everything else to white. Outliers
+    # saturate at the bound, the rest stays readable.
+    clip = float(np.percentile(df["delta_abs"].abs(), 95)) if len(df) > 0 else 0.0
+    if clip == 0.0:
+        clip = float(df["delta_abs"].abs().max() or 1e-9)
+
+    animate = len(snapshots) >= 3
     extra: dict = {}
     if animate:
         extra["animation_frame"] = "version"
-        extra["category_orders"] = {"version": [label for label, _ in others]}
+        extra["category_orders"] = {"version": [label for label, _ in loaded]}
 
     fig = px.scatter(
         df,
@@ -195,6 +205,7 @@ def plot_scatter(
         color="delta_abs",
         color_continuous_scale=["green", "white", "red"],
         color_continuous_midpoint=0,
+        range_color=[-clip, clip],
         log_x=True,
         range_x=[x_lo * 0.5, x_hi * 2],
         range_y=[y_lo - pad_y, y_hi + pad_y],
@@ -215,7 +226,7 @@ def plot_scatter(
             "baseline_time": f"baseline {metric} (s, log scale)",
             "ratio": f"{metric} ratio  (candidate / baseline)",
             "candidate_time": "candidate",
-            "delta_abs": "Δ (s)",
+            "delta_abs": "Δ (s, p95-clipped)",
         },
         **extra,
     )

From a0d4b7a5d9292cdb1f0528af66e437bf2fb13681 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 16:18:06 +0200
Subject: [PATCH 26/68] =?UTF-8?q?benchmarks:=20scatter=20=E2=80=94=20cente?=
 =?UTF-8?q?r=20y-axis=20symmetrically=20around=201.0?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The "no change" line sits at y=1, but with asymmetric data (e.g. some
2x regression, no symmetric speedup), it landed near the bottom of the
visible range and improvements got squeezed near the floor.

Now: ``max_dist = max(|1 - y_lo|, |y_hi - 1|)`` and ``range_y = [1 -
max_dist, 1 + max_dist]``. Pure min/max coverage (no clipping) but the
window is symmetric around 1.0, so regressions above and improvements
below are equally readable regardless of the data skew.

The colour scale keeps the p95-clipped centred-at-0 treatment from the
previous commit.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/plotting.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
index 417f7a43..ec7e715a 100644
--- a/benchmarks/plotting.py
+++ b/benchmarks/plotting.py
@@ -182,8 +182,14 @@ def plot_scatter(
     # Fix the axis ranges so the animation doesn't jitter; pad by a small
     # margin so points on the edges aren't clipped.
     x_lo, x_hi = df["baseline_time"].min(), df["baseline_time"].max()
+    # y-range uses min/max but is centered symmetrically around 1.0 (the
+    # "no change" line), so regressions above and improvements below are
+    # equally readable. Asymmetric data still resolves — the larger side
+    # just dictates how wide the symmetric window is.
     y_lo, y_hi = df["ratio"].min(), df["ratio"].max()
-    pad_y = max(0.05, (y_hi - y_lo) * 0.05)
+    max_dist = max(abs(1.0 - y_lo), abs(y_hi - 1.0), 0.05)
+    pad_y = max(0.05, max_dist * 0.05)
+    y_range = [1.0 - max_dist - pad_y, 1.0 + max_dist + pad_y]
 
     # Clip the colour scale to the 95th-percentile absolute Δ so a single
     # huge regression doesn't wash everything else to white. Outliers
@@ -208,7 +214,7 @@ def plot_scatter(
         range_color=[-clip, clip],
         log_x=True,
         range_x=[x_lo * 0.5, x_hi * 2],
-        range_y=[y_lo - pad_y, y_hi + pad_y],
+        range_y=y_range,
         hover_name="test",
         hover_data={
             "baseline_time": ":.4g",

From 45700e7ac910c79a9576b1be82d4af9f38c78e31 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 16:22:17 +0200
Subject: [PATCH 27/68] =?UTF-8?q?benchmarks:=20address=20review=20?=
 =?UTF-8?q?=E2=80=94=20row=20height,=20scaling-from-params,=20mismatch=20w?=
 =?UTF-8?q?arn?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three safe fixes from a code review of benchmarks/plotting.py:

- Row-height multiplier 14 → 22 in plot_compare and plot_sweep, with
  the floor bumped from 400 to 500. At 25+ tests the y-axis labels
  were colliding; now they breathe.
- plot_scaling reads ``params.size`` (the cleanly-stored int from
  parametrize) and only falls back to the id regex if absent. The
  ``model`` name still needs the regex because pytest-benchmark
  serializes our ModelSpec as ``UNSERIALIZABLE[ModelSpec(...)]``, so a
  full params switch isn't possible here — but the size path is now
  robust to test-id rename.
- plot_compare surfaces the mismatch between snapshots: prints a
  stderr line with the test counts only in A / only in B / common,
  and embeds the same as a subtitle in the figure. Silent intersection
  was the worst-case footgun.

Skipped (per review note): the default-view swap for 3+ snapshots
(sweep → scatter) is a judgement call left for the user. The default
output filename change (the fixed name is clobbered on each run) is
also skipped — the user wants to decide whether per-view filenames are
worth the API change.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/plotting.py | 50 ++++++++++++++++++++++++++++++++++--------
 1 file changed, 41 insertions(+), 9 deletions(-)

diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
index ec7e715a..c062c44a 100644
--- a/benchmarks/plotting.py
+++ b/benchmarks/plotting.py
@@ -65,8 +65,19 @@ def plot_compare(
         _load_snapshot(snapshots[1], metric),
     )
     common = sorted(set(a_vals) & set(b_vals))
+    only_a = sorted(set(a_vals) - set(b_vals))
+    only_b = sorted(set(b_vals) - set(a_vals))
     if not common:
         raise ValueError("no tests in common between the two snapshots")
+    if only_a or only_b:
+        # Surface the mismatch so silent intersection isn't a footgun.
+        import sys
+
+        print(
+            f"compare: {len(only_a)} test(s) only in {a_label}, "
+            f"{len(only_b)} only in {b_label} (intersection: {len(common)}).",
+            file=sys.stderr,
+        )
 
     rows = [
         {
@@ -89,6 +100,10 @@ def plot_compare(
         x_label = f"{metric} delta %"
         text_fmt = ".1f"
 
+    title = f"{metric} delta ({sort}): {a_label} → {b_label} (positive = slower)"
+    if only_a or only_b:
+        title += f"<br><sub>{len(only_a)} only in {a_label}, {len(only_b)} only in {b_label}</sub>"
+
     fig = px.bar(
         df,
         x=x_col,
@@ -97,7 +112,7 @@ def plot_compare(
         color=x_col,
         color_continuous_scale=["green", "white", "red"],
         color_continuous_midpoint=0,
-        title=f"{metric} delta ({sort}): {a_label} → {b_label} (positive = slower)",
+        title=title,
         labels={x_col: x_label, "test": ""},
         text_auto=text_fmt,
         hover_data={
@@ -110,7 +125,7 @@ def plot_compare(
     if sort == "absolute":
         # SI-prefixed time on the x-axis (e.g. 24 ms, 2.4 ms, 240 µs).
         fig.update_xaxes(tickformat=".2s", ticksuffix="s")
-    fig.update_layout(height=max(400, len(df) * 14), showlegend=False)
+    fig.update_layout(height=max(500, len(df) * 22), showlegend=False)
     fig.write_html(output)
     return len(df)
 
@@ -299,7 +314,7 @@ def plot_sweep(
             "<extra></extra>"
         ),
     )
-    fig.update_layout(height=max(400, len(df) * 14))
+    fig.update_layout(height=max(500, len(df) * 22))
     fig.write_html(output)
     return len(df)
 
@@ -314,19 +329,36 @@ def plot_scaling(
     import pandas as pd
     import plotly.express as px
 
-    _, vals = _load_snapshot(snapshots[0], metric)
+    # Read the raw JSON so we can pull ``params`` per benchmark. ``size``
+    # comes from there as a clean int — any future rename of the test id
+    # format won't silently produce 0 rows. ``model`` still needs the id
+    # regex because spec is stored as an unserializable repr in params.
+    data = json.loads(snapshots[0].read_text())
     rows = []
-    for name, t in vals.items():
+    for bm in data["benchmarks"]:
+        name = bm["fullname"]
+        t = bm["stats"][metric]
+        params = bm.get("params") or {}
+
+        size = params.get("size")
+        if not isinstance(size, int):
+            # Fall back to the id regex.
+            m = _SIZE_RE.match(name)
+            if not m:
+                continue
+            size = int(m.group(3))
+
         m = _SIZE_RE.match(name)
         if not m:
             continue
-        phase_path, model, n = m.groups()
-        phase = phase_path.split("::")[-1]
-        rows.append({"phase": phase, "model": model, "n": int(n), metric: t})
+        phase = m.group(1).split("::")[-1]
+        model = m.group(2)
+        rows.append({"phase": phase, "model": model, "n": size, metric: t})
 
     if not rows:
         raise ValueError(
-            "no size-parametrized tests found (expected ``...[<model>-n=<N>]``)"
+            "no size-parametrized tests found (expected ``...[<model>-n=<N>]`` "
+            "or a ``params.size`` int)"
         )
 
     df = pd.DataFrame(rows).sort_values(["phase", "model", "n"])

From ad7aa53fdcfab453fd7afa9b751827be6112cd69 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 16:28:53 +0200
Subject: [PATCH 28/68] =?UTF-8?q?benchmarks:=20plot=20returns=20Figure,=20?=
 =?UTF-8?q?default=20output=20=E2=86=92=20.benchmarks/plots/<view>.html?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two coupled changes setting up the notebook-embedding path:

- ``plot_compare`` / ``plot_scatter`` / ``plot_sweep`` / ``plot_scaling``
  in ``benchmarks/plotting.py`` now return a ``plotly.graph_objects.Figure``
  instead of writing to disk + returning a count. The CLI does the
  ``fig.write_html(output)`` step. ``benchmarks.plotting.n_points(fig)``
  is exported as a helper so the CLI still emits a "N points → path"
  status line.

  This unblocks rendering plots directly in jupyter — call
  ``plot_compare(...)`` and Jupyter's display hook renders the Figure
  inline.

- Default ``-o`` for ``plot`` is now ``.benchmarks/plots/<view>.html``
  (was ``benchmark-plot.html`` in cwd). Matches where snapshots already
  land (and is gitignored), and the per-view filename means consecutive
  runs of different views don't clobber each other.

Bonus: fixed two ``numpy_array or fallback`` bugs, one in scatter
(``df.abs().max() or 1e-9``) and one in the new ``n_points`` helper
(``trace.x or trace.z``) — both triggered ``ValueError: The truth value
of an array with more than one element is ambiguous``. Replaced with
explicit checks (``max_abs > 0`` in scatter, ``is None`` in
``n_points``).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli.py      | 30 ++++++++++++++++++++------
 benchmarks/plotting.py | 49 ++++++++++++++++++++++++++----------------
 2 files changed, 53 insertions(+), 26 deletions(-)

diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index 510b396a..4d37b566 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -640,9 +640,17 @@ def plot(
         ),
     ] = "absolute",
     output: Annotated[
-        Path,
-        typer.Option("--output", "-o", help="Where to write the HTML."),
-    ] = Path("benchmark-plot.html"),
+        Path | None,
+        typer.Option(
+            "--output",
+            "-o",
+            help=(
+                "Where to write the HTML. Defaults to "
+                "``.benchmarks/plots/<view>.html`` (gitignored) so "
+                "different views don't clobber each other."
+            ),
+        ),
+    ] = None,
     open_browser: Annotated[
         bool,
         typer.Option("--open/--no-open", help="Open the result in a browser."),
@@ -701,7 +709,7 @@ def plot(
         raise typer.Exit(code=2)
 
     try:
-        from benchmarks.plotting import RENDERERS
+        from benchmarks.plotting import RENDERERS, n_points
     except ImportError as exc:
         typer.secho(
             "plotly is required for ``plot`` — ``pip install plotly``",
@@ -710,15 +718,23 @@ def plot(
         )
         raise typer.Exit(code=2) from exc
 
-    output.parent.mkdir(parents=True, exist_ok=True)
+    # Default filename: ``.benchmarks/plots/<view>.html``. Matches where
+    # snapshots already live (and is gitignored), and the per-view name
+    # means consecutive ``plot`` calls don't clobber each other.
+    if output is None:
+        output = Path(".benchmarks") / "plots" / f"{chosen}.html"
+
     try:
-        rendered = RENDERERS[chosen](snapshots, output, metric, sort)
+        fig = RENDERERS[chosen](snapshots, metric, sort)
     except ValueError as exc:
         typer.secho(str(exc), fg=typer.colors.RED, err=True)
         raise typer.Exit(code=1) from exc
 
+    output.parent.mkdir(parents=True, exist_ok=True)
+    fig.write_html(output)
+
     typer.secho(
-        f"{chosen} view ({metric}): {rendered} tests → {output}",
+        f"{chosen} view ({metric}): {n_points(fig)} points → {output}",
         fg=typer.colors.GREEN,
     )
     if open_browser:
diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
index c062c44a..cb96f373 100644
--- a/benchmarks/plotting.py
+++ b/benchmarks/plotting.py
@@ -25,7 +25,10 @@
 import re
 from collections.abc import Callable
 from pathlib import Path
-from typing import Literal
+from typing import TYPE_CHECKING, Literal
+
+if TYPE_CHECKING:
+    from plotly.graph_objects import Figure
 
 PlotView = Literal["compare", "scatter", "sweep", "scaling"]
 Metric = Literal["min", "median", "mean", "max"]
@@ -43,10 +46,9 @@ def _load_snapshot(path: Path, metric: Metric = "min") -> tuple[str, dict[str, f
 
 def plot_compare(
     snapshots: list[Path],
-    output: Path,
     metric: Metric = "min",
     sort: SortMode = "absolute",
-) -> int:
+) -> Figure:
     """
     Bar chart of delta per test, sorted by magnitude.
 
@@ -126,16 +128,14 @@ def plot_compare(
         # SI-prefixed time on the x-axis (e.g. 24 ms, 2.4 ms, 240 µs).
         fig.update_xaxes(tickformat=".2s", ticksuffix="s")
     fig.update_layout(height=max(500, len(df) * 22), showlegend=False)
-    fig.write_html(output)
-    return len(df)
+    return fig
 
 
 def plot_scatter(
     snapshots: list[Path],
-    output: Path,
     metric: Metric = "min",
     sort: SortMode = "absolute",  # noqa: ARG001  (uniform signature, unused here)
-) -> int:
+) -> Figure:
     """
     Two-axis scatter — baseline cost on log-x, ratio on y.
 
@@ -211,7 +211,8 @@ def plot_scatter(
     # saturate at the bound, the rest stays readable.
     clip = float(np.percentile(df["delta_abs"].abs(), 95)) if len(df) > 0 else 0.0
     if clip == 0.0:
-        clip = float(df["delta_abs"].abs().max() or 1e-9)
+        max_abs = float(df["delta_abs"].abs().max())
+        clip = max_abs if max_abs > 0 else 1e-9
 
     animate = len(snapshots) >= 3
     extra: dict = {}
@@ -256,16 +257,14 @@ def plot_scatter(
     )
     fig.update_traces(marker=dict(size=8, line=dict(width=0.5, color="DarkSlateGrey")))
     fig.update_layout(height=600)
-    fig.write_html(output)
-    return len(df)
+    return fig
 
 
 def plot_sweep(
     snapshots: list[Path],
-    output: Path,
     metric: Metric = "min",
     sort: SortMode = "absolute",  # noqa: ARG001  (uniform signature, unused here)
-) -> int:
+) -> Figure:
     """Heatmap of per-test ratio relative to the first snapshot."""
     import pandas as pd
     import plotly.express as px
@@ -315,16 +314,14 @@ def plot_sweep(
         ),
     )
     fig.update_layout(height=max(500, len(df) * 22))
-    fig.write_html(output)
-    return len(df)
+    return fig
 
 
 def plot_scaling(
     snapshots: list[Path],
-    output: Path,
     metric: Metric = "min",
     sort: SortMode = "absolute",  # noqa: ARG001  (uniform signature, unused here)
-) -> int:
+) -> Figure:
     """Log-log time vs N for size-parametrized tests, faceted by phase."""
     import pandas as pd
     import plotly.express as px
@@ -375,13 +372,27 @@ def plot_scaling(
         title=f"Scaling: {metric} time vs problem size ({snapshots[0].stem})",
     )
     fig.update_layout(height=max(400, ((df["phase"].nunique() + 2) // 3) * 350))
-    fig.write_html(output)
-    return len(df)
+    return fig
 
 
-RENDERERS: dict[PlotView, Callable[[list[Path], Path, Metric, SortMode], int]] = {
+RENDERERS: dict[PlotView, Callable[[list[Path], Metric, SortMode], Figure]] = {
     "compare": plot_compare,
     "scatter": plot_scatter,
     "sweep": plot_sweep,
     "scaling": plot_scaling,
 }
+
+
+def n_points(fig: Figure) -> int:
+    """Count points across all traces — useful for the CLI status line."""
+    total = 0
+    for trace in fig.data:
+        x = getattr(trace, "x", None)
+        if x is not None:
+            total += len(x)
+            continue
+        z = getattr(trace, "z", None)
+        if z is not None:
+            # ``z`` is 2D for heatmaps.
+            total += sum(len(row) for row in z)
+    return total

From 7c7bab21c0ed4c743c315c00f25010e544ede696 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 16:30:47 +0200
Subject: [PATCH 29/68] =?UTF-8?q?benchmarks:=20plot=20renderers=20return?=
 =?UTF-8?q?=20(Figure,=20n=5Ftests)=20=E2=80=94=20drop=20trace=20introspec?=
 =?UTF-8?q?tion?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The ``n_points(fig)`` helper added in the previous commit walked
``fig.data`` traces and called ``len(trace.x)`` to recover the test
count. That's backwards — the count is sitting right there in the
source DataFrame at render time, no need to reach into the rendered
plot.

Renderers now return ``tuple[Figure, int]`` directly. ``len(df)`` for
compare / sweep / scaling; ``df["test"].nunique()`` for scatter
(rows are per-(test, version) so the raw len double-counts).

n_points helper dropped.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli.py      |  6 +++---
 benchmarks/plotting.py | 35 +++++++++++------------------------
 2 files changed, 14 insertions(+), 27 deletions(-)

diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index 4d37b566..1c9aab8b 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -709,7 +709,7 @@ def plot(
         raise typer.Exit(code=2)
 
     try:
-        from benchmarks.plotting import RENDERERS, n_points
+        from benchmarks.plotting import RENDERERS
     except ImportError as exc:
         typer.secho(
             "plotly is required for ``plot`` — ``pip install plotly``",
@@ -725,7 +725,7 @@ def plot(
         output = Path(".benchmarks") / "plots" / f"{chosen}.html"
 
     try:
-        fig = RENDERERS[chosen](snapshots, metric, sort)
+        fig, n_tests = RENDERERS[chosen](snapshots, metric, sort)
     except ValueError as exc:
         typer.secho(str(exc), fg=typer.colors.RED, err=True)
         raise typer.Exit(code=1) from exc
@@ -734,7 +734,7 @@ def plot(
     fig.write_html(output)
 
     typer.secho(
-        f"{chosen} view ({metric}): {n_points(fig)} points → {output}",
+        f"{chosen} view ({metric}): {n_tests} tests → {output}",
         fg=typer.colors.GREEN,
     )
     if open_browser:
diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
index cb96f373..51976370 100644
--- a/benchmarks/plotting.py
+++ b/benchmarks/plotting.py
@@ -48,7 +48,7 @@ def plot_compare(
     snapshots: list[Path],
     metric: Metric = "min",
     sort: SortMode = "absolute",
-) -> Figure:
+) -> tuple[Figure, int]:
     """
     Bar chart of delta per test, sorted by magnitude.
 
@@ -128,14 +128,14 @@ def plot_compare(
         # SI-prefixed time on the x-axis (e.g. 24 ms, 2.4 ms, 240 µs).
         fig.update_xaxes(tickformat=".2s", ticksuffix="s")
     fig.update_layout(height=max(500, len(df) * 22), showlegend=False)
-    return fig
+    return fig, len(df)
 
 
 def plot_scatter(
     snapshots: list[Path],
     metric: Metric = "min",
     sort: SortMode = "absolute",  # noqa: ARG001  (uniform signature, unused here)
-) -> Figure:
+) -> tuple[Figure, int]:
     """
     Two-axis scatter — baseline cost on log-x, ratio on y.
 
@@ -257,14 +257,14 @@ def plot_scatter(
     )
     fig.update_traces(marker=dict(size=8, line=dict(width=0.5, color="DarkSlateGrey")))
     fig.update_layout(height=600)
-    return fig
+    return fig, int(df["test"].nunique())
 
 
 def plot_sweep(
     snapshots: list[Path],
     metric: Metric = "min",
     sort: SortMode = "absolute",  # noqa: ARG001  (uniform signature, unused here)
-) -> Figure:
+) -> tuple[Figure, int]:
     """Heatmap of per-test ratio relative to the first snapshot."""
     import pandas as pd
     import plotly.express as px
@@ -314,14 +314,14 @@ def plot_sweep(
         ),
     )
     fig.update_layout(height=max(500, len(df) * 22))
-    return fig
+    return fig, len(df)
 
 
 def plot_scaling(
     snapshots: list[Path],
     metric: Metric = "min",
     sort: SortMode = "absolute",  # noqa: ARG001  (uniform signature, unused here)
-) -> Figure:
+) -> tuple[Figure, int]:
     """Log-log time vs N for size-parametrized tests, faceted by phase."""
     import pandas as pd
     import plotly.express as px
@@ -372,27 +372,14 @@ def plot_scaling(
         title=f"Scaling: {metric} time vs problem size ({snapshots[0].stem})",
     )
     fig.update_layout(height=max(400, ((df["phase"].nunique() + 2) // 3) * 350))
-    return fig
+    return fig, len(df)
 
 
-RENDERERS: dict[PlotView, Callable[[list[Path], Metric, SortMode], Figure]] = {
+RENDERERS: dict[
+    PlotView, Callable[[list[Path], Metric, SortMode], tuple[Figure, int]]
+] = {
     "compare": plot_compare,
     "scatter": plot_scatter,
     "sweep": plot_sweep,
     "scaling": plot_scaling,
 }
-
-
-def n_points(fig: Figure) -> int:
-    """Count points across all traces — useful for the CLI status line."""
-    total = 0
-    for trace in fig.data:
-        x = getattr(trace, "x", None)
-        if x is not None:
-            total += len(x)
-            continue
-        z = getattr(trace, "z", None)
-        if z is not None:
-            # ``z`` is 2D for heatmaps.
-            total += sum(len(row) for row in z)
-    return total

From 6a8a16dc9966b4b37a1a4b87259fc675cee82d68 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 16:41:42 +0200
Subject: [PATCH 30/68] benchmarks: notebook plot demo uses the CLI + tqdm
 progress

Two refinements to the end-to-end plotting section:

- tqdm wraps the subprocess loop that generates the two snapshots.
  Each ``--quick --phase build`` run takes ~10 s; tqdm makes the
  ~20 s wait visible. ``tqdm.auto`` auto-picks the notebook widget
  vs console bar based on context.

- Plots are now rendered via ``python -m benchmarks plot --view <name>``
  rather than direct ``plot_compare`` / ``plot_scatter`` imports.
  A small ``cli_plot(view, snapshots)`` helper runs the subprocess,
  reads the generated HTML, and inlines it via ``IPython.display.HTML``.
  Demonstrates the actual user-facing CLI path inside the notebook
  rather than the internal API.

Notebook end-to-end runtime: ~37 s (~33 s for the run loop + plotting
overhead).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/notebooks/registry_usage.ipynb | 137 +++++++++++++++++++++-
 1 file changed, 136 insertions(+), 1 deletion(-)

diff --git a/benchmarks/notebooks/registry_usage.ipynb b/benchmarks/notebooks/registry_usage.ipynb
index 79801dff..8b5bb568 100644
--- a/benchmarks/notebooks/registry_usage.ipynb
+++ b/benchmarks/notebooks/registry_usage.ipynb
@@ -490,6 +490,141 @@
    "cell_type": "markdown",
    "id": "32",
    "metadata": {},
+   "source": [
+    "## Plotting — end-to-end demo\n",
+    "\n",
+    "The CLI `plot` subcommand wraps the same functions that\n",
+    "`benchmarks.plotting` exposes — here we drive that CLI and inline its\n",
+    "HTML output. Below we generate two real `--quick` snapshots (≈ 20 s\n",
+    "each in CI), then walk the two two-snapshot views.\n",
+    "\n",
+    "The diff between the two runs is just measurement noise — that's\n",
+    "expected. On a real PR you'd compare master against your branch."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "33",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import tempfile  # noqa: E402\n",
+    "\n",
+    "from tqdm.auto import tqdm  # noqa: E402\n",
+    "\n",
+    "# Focused run: --quick + just the build phase. ~10 s per snapshot, two\n",
+    "# runs ≈ 20 s. tqdm shows live progress as each subprocess finishes.\n",
+    "_plot_tmp = Path(tempfile.mkdtemp(prefix=\"benchmarks-plot-demo-\"))\n",
+    "_env = {**os.environ, \"PYTHONPATH\": str(_p)}\n",
+    "\n",
+    "\n",
+    "def _run_benchmark(label: str) -> Path:\n",
+    "    snap = _plot_tmp / f\"{label}.json\"\n",
+    "    subprocess.run(\n",
+    "        [\n",
+    "            sys.executable,\n",
+    "            \"-m\",\n",
+    "            \"benchmarks\",\n",
+    "            \"run\",\n",
+    "            \"--quick\",\n",
+    "            \"--phase\",\n",
+    "            \"build\",\n",
+    "            \"--json\",\n",
+    "            str(snap),\n",
+    "        ],\n",
+    "        env=_env,\n",
+    "        cwd=str(_p),  # subprocesses inherit notebook cwd; pin to repo root\n",
+    "        check=True,\n",
+    "        capture_output=True,\n",
+    "    )\n",
+    "    return snap\n",
+    "\n",
+    "\n",
+    "labels = (\"baseline\", \"candidate\")\n",
+    "snaps = {\n",
+    "    label: _run_benchmark(label)\n",
+    "    for label in tqdm(labels, desc=\"benchmark runs\", unit=\"run\")\n",
+    "}\n",
+    "baseline_snap, candidate_snap = snaps[\"baseline\"], snaps[\"candidate\"]\n",
+    "print(f\"baseline:  {baseline_snap.stat().st_size // 1024} KB\")\n",
+    "print(f\"candidate: {candidate_snap.stat().st_size // 1024} KB\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "34",
+   "metadata": {},
+   "source": [
+    "### Rendering via the CLI\n",
+    "\n",
+    "`python -m benchmarks plot` writes HTML to disk by default. We can drive\n",
+    "it from the notebook as a subprocess and inline the generated file via\n",
+    "`IPython.display.HTML`. This is the same code path users hit from the\n",
+    "terminal — nothing notebook-specific."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "35",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from IPython.display import HTML  # noqa: E402\n",
+    "\n",
+    "\n",
+    "def cli_plot(view: str, snapshots: list[Path]) -> HTML:\n",
+    "    \"\"\"Run ``python -m benchmarks plot --view <view> ...`` and inline the result.\"\"\"\n",
+    "    out = _plot_tmp / f\"{view}.html\"\n",
+    "    subprocess.run(\n",
+    "        [\n",
+    "            sys.executable,\n",
+    "            \"-m\",\n",
+    "            \"benchmarks\",\n",
+    "            \"plot\",\n",
+    "            \"--view\",\n",
+    "            view,\n",
+    "            \"-o\",\n",
+    "            str(out),\n",
+    "            *(str(s) for s in snapshots),\n",
+    "        ],\n",
+    "        env=_env,\n",
+    "        cwd=str(_p),\n",
+    "        check=True,\n",
+    "        capture_output=True,\n",
+    "    )\n",
+    "    return HTML(out.read_text())\n",
+    "\n",
+    "\n",
+    "# Compare view: per-test delta, sorted by absolute Δ (the default).\n",
+    "cli_plot(\"compare\", [baseline_snap, candidate_snap])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "36",
+   "metadata": {},
+   "source": [
+    "### Scatter view via the CLI\n",
+    "\n",
+    "Same CLI shape, different `--view` argument:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "37",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cli_plot(\"scatter\", [baseline_snap, candidate_snap])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "38",
+   "metadata": {},
    "source": [
     "## Extending\n",
     "\n",
@@ -506,7 +641,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "33",
+   "id": "39",
    "metadata": {},
    "source": [
     "### Regenerating the lockfile\n",

From 09dad9d44dc932268809e0ad6abc80c99d6badcd Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 16:50:45 +0200
Subject: [PATCH 31/68] benchmarks: notebook plot demo accepts the full CLI
 command string

Two readability improvements:

- Lead with the scatter view (top-right = real regressions) since it's
  the recommended exploratory plot. Compare bar chart moves to a
  secondary "alternative" position.

- ``cli_plot(view, snapshots)`` renamed to ``bench(cli)`` and takes the
  literal shell command as one string: ``bench(f"python -m benchmarks
  plot --view scatter {baseline_snap} {candidate_snap}")``. Strips the
  optional leading ``python -m benchmarks`` (or just ``benchmarks``),
  shlex-splits the rest, and runs it. For ``plot`` subcommands an
  ``-o`` is injected to a tempdir so the resulting HTML can be inlined
  via ``IPython.display.HTML``; other subcommands just print stdout.

The cell now reads exactly like a terminal command, which is what a
user transferring from CLI to notebook would expect.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/notebooks/registry_usage.ipynb | 76 ++++++++++++++---------
 1 file changed, 46 insertions(+), 30 deletions(-)

diff --git a/benchmarks/notebooks/registry_usage.ipynb b/benchmarks/notebooks/registry_usage.ipynb
index 8b5bb568..ea3d6d62 100644
--- a/benchmarks/notebooks/registry_usage.ipynb
+++ b/benchmarks/notebooks/registry_usage.ipynb
@@ -556,12 +556,19 @@
    "id": "34",
    "metadata": {},
    "source": [
-    "### Rendering via the CLI\n",
+    "### Rendering via the CLI — scatter view\n",
     "\n",
-    "`python -m benchmarks plot` writes HTML to disk by default. We can drive\n",
-    "it from the notebook as a subprocess and inline the generated file via\n",
-    "`IPython.display.HTML`. This is the same code path users hit from the\n",
-    "terminal — nothing notebook-specific."
+    "`python -m benchmarks plot` writes HTML to disk by default. The helper\n",
+    "below shells out to that exact command and inlines the generated file\n",
+    "via `IPython.display.HTML` — same CLI path users hit from the terminal,\n",
+    "just rendered in the notebook. Pass the arguments as a single string,\n",
+    "exactly how you'd type them on the command line.\n",
+    "\n",
+    "The scatter view (`--view scatter`) is the recommended exploratory plot\n",
+    "for regression hunting: x = baseline cost on a log axis, y = ratio,\n",
+    "colour = absolute Δ. Top-right = slow tests that got slower (the \"fix\n",
+    "this\" zone). Top-left = noisy microbenchmarks. Bottom-right =\n",
+    "already-slow tests that didn't move."
    ]
   },
   {
@@ -571,34 +578,41 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import shlex  # noqa: E402\n",
+    "\n",
     "from IPython.display import HTML  # noqa: E402\n",
     "\n",
     "\n",
-    "def cli_plot(view: str, snapshots: list[Path]) -> HTML:\n",
-    "    \"\"\"Run ``python -m benchmarks plot --view <view> ...`` and inline the result.\"\"\"\n",
-    "    out = _plot_tmp / f\"{view}.html\"\n",
-    "    subprocess.run(\n",
-    "        [\n",
-    "            sys.executable,\n",
-    "            \"-m\",\n",
-    "            \"benchmarks\",\n",
-    "            \"plot\",\n",
-    "            \"--view\",\n",
-    "            view,\n",
-    "            \"-o\",\n",
-    "            str(out),\n",
-    "            *(str(s) for s in snapshots),\n",
-    "        ],\n",
-    "        env=_env,\n",
-    "        cwd=str(_p),\n",
-    "        check=True,\n",
-    "        capture_output=True,\n",
+    "def bench(cli: str) -> HTML | None:\n",
+    "    \"\"\"\n",
+    "    Run a ``python -m benchmarks ...`` command exactly as you'd type it.\n",
+    "\n",
+    "    The leading ``python -m benchmarks`` (or just ``benchmarks``) is\n",
+    "    optional — omit whichever you prefer. For ``plot`` subcommands the\n",
+    "    output is captured into a tempdir and returned as inlinable\n",
+    "    ``HTML``; other subcommands print their stdout and return ``None``::\n",
+    "\n",
+    "        bench(f\"python -m benchmarks plot --view scatter {baseline_snap} {candidate_snap}\")\n",
+    "    \"\"\"\n",
+    "    cli = cli.removeprefix(\"python -m benchmarks \").removeprefix(\"benchmarks \")\n",
+    "    args = shlex.split(cli)\n",
+    "    cmd = [sys.executable, \"-m\", \"benchmarks\", *args]\n",
+    "\n",
+    "    if args and args[0] == \"plot\" and not (set(args) & {\"-o\", \"--output\"}):\n",
+    "        # Inject -o so we can read the result back for inline rendering.\n",
+    "        out = _plot_tmp / \"out.html\"\n",
+    "        cmd += [\"-o\", str(out)]\n",
+    "        subprocess.run(cmd, env=_env, cwd=str(_p), check=True, capture_output=True)\n",
+    "        return HTML(out.read_text())\n",
+    "\n",
+    "    result = subprocess.run(\n",
+    "        cmd, env=_env, cwd=str(_p), check=True, capture_output=True, text=True\n",
     "    )\n",
-    "    return HTML(out.read_text())\n",
+    "    print(result.stdout)\n",
+    "    return None\n",
     "\n",
     "\n",
-    "# Compare view: per-test delta, sorted by absolute Δ (the default).\n",
-    "cli_plot(\"compare\", [baseline_snap, candidate_snap])"
+    "bench(f\"python -m benchmarks plot --view scatter {baseline_snap} {candidate_snap}\")"
    ]
   },
   {
@@ -606,9 +620,11 @@
    "id": "36",
    "metadata": {},
    "source": [
-    "### Scatter view via the CLI\n",
+    "### Compare view — bar chart alternative\n",
     "\n",
-    "Same CLI shape, different `--view` argument:"
+    "Same `bench` helper, different `--view`. The compare bar chart is\n",
+    "useful when you want a sorted list of per-test deltas (in seconds by\n",
+    "default; pass `--sort relative` for percent)."
    ]
   },
   {
@@ -618,7 +634,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "cli_plot(\"scatter\", [baseline_snap, candidate_snap])"
+    "bench(f\"python -m benchmarks plot --view compare {baseline_snap} {candidate_snap}\")"
    ]
   },
   {

From 2ece2c156a2d3cdb09436ef6608c7374b62cf735 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 17:08:04 +0200
Subject: [PATCH 32/68] benchmarks: memory tracks all phases via
 memray.Tracker; README accuracy fixes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three reviewer items rolled into one PR.

memory.py — direct memray.Tracker, all phases
=============================================

Previous: regex-scraped pytest-memray's terminal output, ran only
``test_build.py`` because pytest-memray can't isolate setup from the
timed phase.

Now: a small runner iterates the registry per phase in a subprocess
each (one phase per process, same isolation pattern). Each
``(spec, size)`` measurement builds the model *outside* the
``memray.Tracker`` context and runs only the phase work inside, so the
peak reflects the phase, not model construction:

  m = spec.build(size)
  with memray.Tracker(bin_path):
      wrapper(m)
  peak = FileReader(bin_path).metadata.peak_memory

Default phases: ``build``, ``matrices``, ``lp_write``, ``netcdf``,
``solver_handoff`` (was build-only). JSON output stays keyed by full
pytest-style test IDs (``benchmarks/test_<phase>.py::test_<phase>
[<spec>-n=<size>]``) so the existing ``compare()`` diffs cleanly.

Drops the ``MEMORY_RE`` regex, the ``--test-path`` plumbing, and the
pytest-memray output dependency. Worker writes JSON to a sidecar file
(not stdout) because solvers like HiGHS print to stdout from C inside
the tracked region.

CLI: ``memory save`` gains ``--phase`` (repeatable; default = all).

README — accuracy fixes
=======================

- New intro: four numbered phases (build / solver handoff / netCDF
  (de)serialization / end-to-end PyPSA). The old "round-trip via netCDF"
  framing oversold what's actually measured.
- New phase-coverage table that surfaces ``test_pypsa_carbon_management.py``
  as its own ``End-to-end (PyPSA)`` row.
- Short note explaining the netCDF benchmarks read hot-cache by design
  — we track (de)serialization code in linopy / xarray, not disk.
- Metrics section rewritten: memory is no longer build-only.

Hand-tested: ``python -m benchmarks memory save test --quick`` produces
a JSON with peaks for every ``(phase, model, size)`` combo (53 entries
in ~15s under --quick), and ``memory compare`` diffs them.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/README.md |  34 +++-
 benchmarks/cli.py    |  36 +++-
 benchmarks/memory.py | 384 +++++++++++++++++++++++++++++++++----------
 3 files changed, 351 insertions(+), 103 deletions(-)

diff --git a/benchmarks/README.md b/benchmarks/README.md
index df649ecc..721b1321 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -1,8 +1,28 @@
 # Internal Performance Benchmarks
 
-End-to-end timing and memory for the linopy half of an optimization run:
-build a model, hand it off to a solver, round-trip via netCDF. Solver
-algorithm runtime is intentionally out of scope.
+This suite benchmarks the **linopy part end-to-end** across four phases:
+
+1. **Build** — construct the linopy model.
+2. **Solver handoff** — convert a built model into solver-consumable form
+   (in-memory matrices, LP file, native solver instance).
+3. **netCDF serialization / deserialization** — `to_netcdf` / `read_netcdf`.
+4. **End-to-end** — a fixed real-world PyPSA model all the way to a solver
+   instance.
+
+Solver algorithm runtime is intentionally out of scope.
+
+| Phase                  | Test file                           | Measures                                                                  |
+| ---------------------- | ----------------------------------- | ------------------------------------------------------------------------- |
+| Build                  | `test_build.py`                     | constructing variables / expressions / constraints / objective            |
+| Solver handoff         | `test_matrices.py`                  | `A`, `b`, `c`, bounds, labels, `Q` for QP                                 |
+| Solver handoff         | `test_lp_write.py`                  | `model.to_file(...)` — LP / MPS serialization                             |
+| Solver handoff         | `test_solver_handoff.py`            | `lp.io.to_highspy` / `to_gurobipy` / `to_mosek` / `to_xpress`             |
+| netCDF (de)serialization | `test_netcdf.py`                  | `to_netcdf` and `read_netcdf` round-trip                                  |
+| End-to-end (PyPSA)     | `test_pypsa_carbon_management.py`   | Fixed real-world pypsa network through `network.optimize.create_model` and on to highspy; sweeps `freeze_constraints` and `set_names`. |
+
+The netCDF benchmarks reuse the same file path across pytest-benchmark
+iterations, so reads run hot-cache by design — what we want to track is
+the (de)serialization code in `linopy` / `xarray`, not disk hardware.
 
 > **Note:** `benchmark/` (singular) is for external framework comparisons.
 > `benchmarks/` (plural) is only for internal linopy performance tracking.
@@ -84,6 +104,8 @@ jupyter lab benchmarks/notebooks/registry_usage.ipynb
 - **Time** — pytest-benchmark median runtime (IQR for stability). Snapshots
   are JSON; pass `--json <path>` to `run` to save one, then diff against a
   baseline.
-- **Memory** — pytest-memray peak RSS (MiB), tracked for the build phase
-  only because later phases include build allocations and attribution
-  becomes unreliable. Use `memory save` / `memory compare`.
+- **Memory** — peak allocations (MiB) via `memray.Tracker`, measured per
+  `(phase, spec, size)` across all phases. The model is built *outside* the
+  tracked region so the peak reflects only the phase work, not model
+  construction. Use `memory save` (optionally `--phase` to scope) and
+  `memory compare`.
diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index 1c9aab8b..f3ab61e2 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -754,18 +754,42 @@ def memory_save_cmd(
     quick: Annotated[
         bool, typer.Option("--quick", help="Use smaller problem sizes.")
     ] = False,
-    test_path: Annotated[
+    phase: Annotated[
         list[str] | None,
-        typer.Option("--test-path", help="Test file(s) to run; defaults to build."),
+        typer.Option(
+            "--phase",
+            help=(
+                "Restrict measurement to these phases. Pass multiple ``--phase`` "
+                "to select more than one. Default: all (build, matrices, lp_write,"
+                " netcdf, solver_handoff)."
+            ),
+        ),
     ] = None,
 ) -> None:
     """
-    Run the build phase under pytest-memray and save peak RSS to JSON.
+    Measure peak memory across the registry × phase grid via ``memray.Tracker``.
+
+    Each ``(phase, spec, size)`` runs under its own tracker so setup
+    allocations (model construction) are excluded from the peak — only the
+    phase work itself is counted. Phases run in separate subprocesses for
+    isolation.
 
-    Results land in ``.benchmarks/memory/<label>.json``. Use ``compare``
-    afterwards to diff two snapshots.
+    Results land in ``.benchmarks/memory/<label>.json``, keyed by full
+    pytest-style test IDs so ``compare`` diffs cleanly across runs that
+    selected different subsets.
     """
-    memory_save(label, quick=quick, test_paths=test_path)
+    from benchmarks.memory import DEFAULT_PHASES
+
+    if phase:
+        unknown = [p for p in phase if p not in DEFAULT_PHASES]
+        if unknown:
+            typer.secho(
+                f"unknown phase(s): {unknown}; valid options: {list(DEFAULT_PHASES)}",
+                fg=typer.colors.RED,
+                err=True,
+            )
+            raise typer.Exit(code=2)
+    memory_save(label, quick=quick, phases=phase)
 
 
 @memory_app.command("compare")
diff --git a/benchmarks/memory.py b/benchmarks/memory.py
index b48a0be1..03cc4da0 100644
--- a/benchmarks/memory.py
+++ b/benchmarks/memory.py
@@ -1,128 +1,313 @@
 """
-Measure and compare peak memory using pytest-memray.
+Measure and compare peak memory across the registry × phase grid.
+
+Each measurement uses ``memray.Tracker`` directly so the model construction
+(setup) lives *outside* the tracked region and the peak reflects only the
+phase work itself::
+
+    m = spec.build(size)            # setup, not tracked
+    with memray.Tracker(bin_path):
+        wrapper(m)                  # tracked
+    peak = FileReader(bin_path).metadata.peak_memory
 
 This module exposes ``save(label, ...)`` and ``compare(label_a, label_b)`` as
 plain functions; user-facing invocation goes through the typer CLI::
 
-    python -m benchmarks memory save <label>
+    python -m benchmarks memory save <label> [--quick] [--phase build] ...
     python -m benchmarks memory compare <a> <b>
 
-Results are stored in ``.benchmarks/memory/``.
+Results land in ``.benchmarks/memory/`` as JSON keyed by full pytest-style
+test IDs (``benchmarks/test_<phase>.py::test_<phase>[<spec>-n=<size>]``)
+so cross-snapshot diffs work uniformly regardless of which phases were run.
 """
 
 from __future__ import annotations
 
+import argparse
+import gc
 import json
 import platform
-import re
 import subprocess
 import sys
+import tempfile
+from collections.abc import Callable, Iterator
 from pathlib import Path
+from typing import TYPE_CHECKING
 
 if platform.system() == "Windows":
     raise RuntimeError(
-        "memory.py requires pytest-memray which is not available on Windows. "
-        "Run memory benchmarks on Linux or macOS."
+        "memory measurement requires ``memray`` which is not available on "
+        "Windows. Run memory benchmarks on Linux or macOS."
     )
 
+if TYPE_CHECKING:
+    from benchmarks.registry import ModelSpec
+
 RESULTS_DIR = Path(".benchmarks/memory")
-MEMORY_RE = re.compile(
-    r"Allocation results for (.+?) at the high watermark\s+"
-    r"📦 Total memory allocated: ([\d.]+)(MiB|KiB|GiB|B)",
+DEFAULT_PHASES: tuple[str, ...] = (
+    "build",
+    "matrices",
+    "lp_write",
+    "netcdf",
+    "solver_handoff",
 )
-# Only the build phase is measured by default. Unlike timing benchmarks (where
-# pytest-benchmark isolates the measured function), memray tracks all allocations
-# within a test — including model construction in setup. This means LP write and
-# matrix tests would report build + phase memory combined, making the phase-specific
-# contribution hard to isolate. Since model construction dominates memory usage,
-# measuring build alone gives the most accurate and actionable numbers.
-DEFAULT_TEST_PATHS = [
-    "benchmarks/test_build.py",
-]
-
-
-def _to_mib(value: float, unit: str) -> float:
-    factors = {"B": 1 / 1048576, "KiB": 1 / 1024, "MiB": 1, "GiB": 1024}
-    return value * factors[unit]
-
-
-def _collect_test_ids(test_paths: list[str], quick: bool) -> list[str]:
-    """Collect test IDs without running them."""
-    cmd = [
-        sys.executable,
-        "-m",
-        "pytest",
-        *test_paths,
-        "--collect-only",
-        "-q",
-    ]
-    if quick:
-        cmd.append("--quick")
-    result = subprocess.run(cmd, capture_output=True, text=True)
-    return [
-        line.strip()
-        for line in result.stdout.splitlines()
-        if "::" in line and not line.startswith(("=", "-", " "))
-    ]
-
-
-def save(label: str, quick: bool = False, test_paths: list[str] | None = None) -> Path:
-    """Run each benchmark in a separate process for accurate memory measurement."""
-    if test_paths is None:
-        test_paths = DEFAULT_TEST_PATHS
-    test_ids = _collect_test_ids(test_paths, quick)
-    if not test_ids:
-        print("No tests collected.", file=sys.stderr)
-        sys.exit(1)
 
-    print(f"Running {len(test_ids)} tests (each in a separate process)...")
-    entries = {}
-    for i, test_id in enumerate(test_ids, 1):
-        short = test_id.split("::")[-1]
-        print(f"  [{i}/{len(test_ids)}] {short}...", end=" ", flush=True)
 
+def _phase_tag(phase: str) -> str:
+    """Map a phase name to the registry phase tag used by ``spec.applies_to``."""
+    from benchmarks.registry import (
+        BUILD,
+        LP_WRITE,
+        MATRICES,
+        NETCDF,
+        TO_HIGHSPY,
+    )
+
+    return {
+        "build": BUILD,
+        "matrices": MATRICES,
+        "lp_write": LP_WRITE,
+        "netcdf": NETCDF,
+        "solver_handoff": TO_HIGHSPY,  # we always measure the highs handoff
+    }[phase]
+
+
+def _measure_peak(action: Callable[[], object]) -> float:
+    """Run ``action()`` under ``memray.Tracker`` and return peak MiB."""
+    import memray
+
+    fd, tmp = tempfile.mkstemp(suffix=".bin")
+    Path(tmp).unlink()  # memray needs to create the file itself
+    # Close the fd; the path is what matters.
+    try:
+        from os import close as _close
+
+        _close(fd)
+    except OSError:
+        pass
+
+    try:
+        with memray.Tracker(tmp):
+            action()
+        peak_bytes = memray.FileReader(tmp).metadata.peak_memory
+        return round(peak_bytes / (1024**2), 3)
+    finally:
+        Path(tmp).unlink(missing_ok=True)
+
+
+def _measurements(
+    phase: str, spec: ModelSpec, size: int
+) -> Iterator[tuple[str, Callable[[], object]]]:
+    """
+    Yield ``(test_id, action)`` pairs for one ``(phase, spec, size)``.
+
+    ``action`` is a zero-arg callable; the caller runs it inside a tracker.
+    For non-build phases, the model is built once up front (outside the
+    tracker) and the action closes over it so only the phase work is
+    counted.
+    """
+    name = spec.name
+
+    if phase == "build":
+        yield (
+            f"benchmarks/test_build.py::test_build[{name}-n={size}]",
+            lambda: spec.build(size),
+        )
+        return
+
+    m = spec.build(size)
+
+    if phase == "matrices":
+
+        def access() -> None:
+            mats = m.matrices
+            for attr in ("A", "b", "c", "lb", "ub", "sense", "vlabels", "clabels"):
+                getattr(mats, attr)
+            if m.is_quadratic:
+                mats.Q
+
+        yield (
+            f"benchmarks/test_matrices.py::test_matrices[{name}-n={size}]",
+            access,
+        )
+
+    elif phase == "lp_write":
+        # ``to_file`` writes to disk; use a tempdir so we don't leak.
+        tmpdir = tempfile.TemporaryDirectory()
+        lp_path = Path(tmpdir.name) / "m.lp"
+
+        def write_lp() -> None:
+            m.to_file(lp_path, progress=False)
+
+        try:
+            yield (
+                f"benchmarks/test_lp_write.py::test_lp_write[{name}-n={size}]",
+                write_lp,
+            )
+        finally:
+            tmpdir.cleanup()
+
+    elif phase == "netcdf":
+        from linopy import read_netcdf
+
+        tmpdir = tempfile.TemporaryDirectory()
+        nc_path = Path(tmpdir.name) / "m.nc"
+
+        def write_nc() -> None:
+            m.to_netcdf(nc_path)
+
+        def read_nc() -> None:
+            read_netcdf(nc_path)
+
+        try:
+            yield (
+                f"benchmarks/test_netcdf.py::test_netcdf_write[{name}-n={size}]",
+                write_nc,
+            )
+            # ``write_nc`` was called by the caller as part of the
+            # measurement, so ``nc_path`` now exists for the read.
+            yield (
+                f"benchmarks/test_netcdf.py::test_netcdf_read[{name}-n={size}]",
+                read_nc,
+            )
+        finally:
+            tmpdir.cleanup()
+
+    elif phase == "solver_handoff":
+        from linopy.io import to_highspy
+
+        def handoff() -> None:
+            to_highspy(m)
+
+        yield (
+            (
+                f"benchmarks/test_solver_handoff.py::test_solver_handoff"
+                f"[highs-{name}-n={size}]"
+            ),
+            handoff,
+        )
+
+    else:
+        raise ValueError(f"unknown phase: {phase!r}")
+
+
+def run_phase(phase: str, quick: bool = False) -> dict[str, float]:
+    """
+    Measure peak memory for every applicable ``(spec, size)`` under one phase.
+
+    Returns a ``{test_id: peak_mib}`` mapping. Invoked once per phase as a
+    subprocess by :func:`save` for isolation.
+    """
+    from benchmarks import REGISTRY
+
+    tag = _phase_tag(phase)
+    results: dict[str, float] = {}
+
+    for spec in REGISTRY.values():
+        if not spec.applies_to(tag):
+            continue
+
+        # Optional-dep gate (e.g. pypsa_scigrid needs pypsa).
+        for mod in spec.requires:
+            try:
+                __import__(mod)
+            except ImportError:
+                break
+        else:
+            for size in spec.sizes:
+                if quick and size > spec.quick_threshold:
+                    continue
+                try:
+                    for test_id, action in _measurements(phase, spec, size):
+                        try:
+                            results[test_id] = _measure_peak(action)
+                            print(
+                                f"  {test_id} → {results[test_id]:.1f} MiB",
+                                file=sys.stderr,
+                            )
+                        except Exception as exc:  # noqa: BLE001
+                            print(
+                                f"  skip {test_id}: {type(exc).__name__}: {exc}",
+                                file=sys.stderr,
+                            )
+                except Exception as exc:  # noqa: BLE001
+                    print(
+                        f"  setup failed {spec.name}/{size}: "
+                        f"{type(exc).__name__}: {exc}",
+                        file=sys.stderr,
+                    )
+                gc.collect()
+
+    return results
+
+
+def save(
+    label: str,
+    quick: bool = False,
+    phases: list[str] | None = None,
+) -> Path:
+    """
+    Run one subprocess per phase and merge the results into ``<label>.json``.
+
+    Per-phase subprocesses keep allocations from one phase out of another's
+    measurement; ``memray.Tracker`` only counts what's allocated inside its
+    ``with`` block, but the subprocess boundary makes the isolation total.
+    """
+    phases = list(phases) if phases else list(DEFAULT_PHASES)
+
+    all_results: dict[str, float] = {}
+    for phase in phases:
+        print(f"\n=== {phase} ===", file=sys.stderr)
+        # Worker writes JSON to a sidecar file rather than stdout — HiGHS
+        # (and other solvers) print to stdout from C code inside the tracked
+        # region, which would pollute the data channel.
+        fd, out_tmp = tempfile.mkstemp(suffix=".json", prefix=f"mem-{phase}-")
+        from os import close as _close
+
+        _close(fd)
         cmd = [
             sys.executable,
             "-m",
-            "pytest",
-            test_id,
-            "--memray",
-            "--benchmark-disable",
-            "-v",
-            "--tb=short",
-            "-q",
+            "benchmarks.memory",
+            "_worker",
+            phase,
+            "--out",
+            out_tmp,
         ]
-        result = subprocess.run(cmd, capture_output=True, text=True)
-        output = result.stdout + result.stderr
-
-        match = MEMORY_RE.search(output)
-        if match:
-            value = float(match.group(2))
-            unit = match.group(3)
-            mib = round(_to_mib(value, unit), 3)
-            entries[test_id] = mib
-            print(f"{mib:.1f} MiB")
-        elif "SKIPPED" in output or "skipped" in output:
-            print("skipped")
-        else:
-            print(
-                "WARNING: no memray data (pytest-memray output format may have changed)",
-                file=sys.stderr,
-            )
+        if quick:
+            cmd.append("--quick")
+        try:
+            result = subprocess.run(cmd, check=False, capture_output=True, text=True)
+            if result.stderr:
+                sys.stderr.write(result.stderr)
+            if result.returncode != 0:
+                print(
+                    f"phase {phase} subprocess failed (exit {result.returncode})",
+                    file=sys.stderr,
+                )
+                continue
+            try:
+                phase_results = json.loads(Path(out_tmp).read_text())
+            except (json.JSONDecodeError, FileNotFoundError) as exc:
+                print(f"phase {phase} JSON parse error: {exc}", file=sys.stderr)
+                continue
+            all_results.update(phase_results)
+        finally:
+            Path(out_tmp).unlink(missing_ok=True)
 
-    if not entries:
-        print("No memray results found. Is pytest-memray installed?", file=sys.stderr)
+    if not all_results:
+        print("No measurements produced.", file=sys.stderr)
         sys.exit(1)
 
     RESULTS_DIR.mkdir(parents=True, exist_ok=True)
     out_path = RESULTS_DIR / f"{label}.json"
-    out_path.write_text(json.dumps({"label": label, "peak_mib": entries}, indent=2))
-    print(f"\nSaved {len(entries)} results to {out_path}")
+    out_path.write_text(json.dumps({"label": label, "peak_mib": all_results}, indent=2))
+    print(f"\nSaved {len(all_results)} measurements to {out_path}", file=sys.stderr)
     return out_path
 
 
 def compare(label_a: str, label_b: str) -> None:
-    """Compare two saved memory results."""
+    """Diff two saved memory snapshots side-by-side."""
     path_a = RESULTS_DIR / f"{label_a}.json"
     path_b = RESULTS_DIR / f"{label_b}.json"
     for p in (path_a, path_b):
@@ -135,8 +320,8 @@ def compare(label_a: str, label_b: str) -> None:
 
     all_tests = sorted(set(data_a) | set(data_b))
 
-    print(f"\n{'Test':<60} {label_a:>10} {label_b:>10} {'Change':>10}")
-    print("-" * 94)
+    print(f"\n{'Test':<70} {label_a:>10} {label_b:>10} {'Change':>10}")
+    print("-" * 104)
 
     for test in all_tests:
         a = data_a.get(test)
@@ -148,8 +333,25 @@ def compare(label_a: str, label_b: str) -> None:
             change = f"{pct:+.1f}%"
         else:
             change = "—"
-        # Shorten test name for readability
         short = test.split("::")[-1] if "::" in test else test
-        print(f"{short:<60} {a_str:>10} {b_str:>10} {change:>10}")
+        print(f"{short:<70} {a_str:>10} {b_str:>10} {change:>10}")
 
     print()
+
+
+# ---- subprocess worker ---------------------------------------------------
+
+if __name__ == "__main__":  # pragma: no cover
+    parser = argparse.ArgumentParser(description="memory.py worker")
+    parser.add_argument("cmd", choices=["_worker"])
+    parser.add_argument("phase")
+    parser.add_argument("--quick", action="store_true")
+    parser.add_argument(
+        "--out",
+        required=True,
+        help="Path to write the JSON result to (stdout is reserved for solver chatter).",
+    )
+    args = parser.parse_args()
+    if args.cmd == "_worker":
+        out = run_phase(args.phase, quick=args.quick)
+        Path(args.out).write_text(json.dumps(out))

From ea4bc76287a97b4b222e44263f7a0df6cf60b0d0 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 17:18:49 +0200
Subject: [PATCH 33/68] benchmarks: plot subcommand auto-detects memory
 snapshots alongside timing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

``_load_snapshot`` now sniffs the JSON shape:

- pytest-benchmark timing (``{"benchmarks": [{"stats": {...}}]}``) →
  value is ``stats[metric]`` in seconds.
- memory.py output (``{"peak_mib": {test_id: float}}``) → value is the
  peak in MiB; ``metric`` arg is ignored.

The same four views (compare bar, scatter, sweep heatmap, scaling
log-log) work on either. Titles, axis labels, hover-template suffixes,
and tick formatting switch between ``s`` (SI-prefixed) and ``MiB``
based on the loaded unit. Compare view's "positive = slower" caption
becomes "positive = more memory" when the inputs are MiB.

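Guard rail: mixing a timing and a memory snapshot in one compare,
scatter, or sweep invocation raises ``ValueError`` (``snapshots mix
units {'MiB', 's'}; can't compare timing and memory``) instead of
silently producing nonsense. The scaling view detects the unit from
its first snapshot on its own.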

Hand-tested all four views against both ``.benchmarks/sweep/*.json``
(timing) and ``.benchmarks/memory/*.json`` (memory) — 207 and 53 tests
respectively, all render with the right labels.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/plotting.py | 105 +++++++++++++++++++++++++++++++----------
 1 file changed, 81 insertions(+), 24 deletions(-)

diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
index 51976370..53249172 100644
--- a/benchmarks/plotting.py
+++ b/benchmarks/plotting.py
@@ -37,11 +37,41 @@
 _SIZE_RE = re.compile(r"(.*)\[([^\[\]]+?)-n=(\d+)\]")
 
 
-def _load_snapshot(path: Path, metric: Metric = "min") -> tuple[str, dict[str, float]]:
-    """Return ``(label, {fullname: <metric>_seconds})`` for one snapshot."""
+def _load_snapshot(
+    path: Path, metric: Metric = "min"
+) -> tuple[str, dict[str, float], str]:
+    """
+    Return ``(label, {fullname: value}, unit)`` for one snapshot.
+
+    Auto-detects the JSON shape:
+
+    - pytest-benchmark timing (``{"benchmarks": [{"stats": {...}}]}``) →
+      ``value`` is ``stats[metric]`` in **seconds**.
+    - memory.py output (``{"peak_mib": {test_id: float}}``) → ``value`` is
+      the peak in **MiB**; ``metric`` is ignored.
+    """
     data = json.loads(path.read_text())
+    if "peak_mib" in data:
+        return path.stem, dict(data["peak_mib"]), "MiB"
     values = {bm["fullname"]: bm["stats"][metric] for bm in data["benchmarks"]}
-    return path.stem, values
+    return path.stem, values, "s"
+
+
+def _check_same_unit(snapshots: list[tuple[str, dict[str, float], str]]) -> str:
+    """Validate that every snapshot has the same unit, return it."""
+    units = {u for _, _, u in snapshots}
+    if len(units) > 1:
+        raise ValueError(
+            f"snapshots mix units {units}; can't compare timing and memory"
+        )
+    return next(iter(units))
+
+
+def _axis_kwargs(unit: str) -> dict:
+    """Return ``update_xaxes`` kwargs for a given unit."""
+    if unit == "s":
+        return {"tickformat": ".2s", "ticksuffix": "s"}
+    return {"ticksuffix": f" {unit}"}
 
 
 def plot_compare(
@@ -62,10 +92,10 @@ def plot_compare(
     import pandas as pd
     import plotly.express as px
 
-    (a_label, a_vals), (b_label, b_vals) = (
-        _load_snapshot(snapshots[0], metric),
-        _load_snapshot(snapshots[1], metric),
-    )
+    loaded = [_load_snapshot(p, metric) for p in snapshots[:2]]
+    unit = _check_same_unit(loaded)
+    metric_label = metric if unit == "s" else "peak"
+    (a_label, a_vals, _), (b_label, b_vals, _) = loaded
     common = sorted(set(a_vals) & set(b_vals))
     only_a = sorted(set(a_vals) - set(b_vals))
     only_b = sorted(set(b_vals) - set(a_vals))
@@ -96,13 +126,16 @@ def plot_compare(
     df = df.reindex(df[x_col].abs().sort_values(ascending=True).index)
 
     if sort == "absolute":
-        x_label = f"{metric} delta (s)"
-        text_fmt = ".2s"
+        x_label = f"{metric_label} delta ({unit})"
+        text_fmt = ".2s" if unit == "s" else ".2f"
     else:
-        x_label = f"{metric} delta %"
+        x_label = f"{metric_label} delta %"
         text_fmt = ".1f"
 
-    title = f"{metric} delta ({sort}): {a_label} → {b_label} (positive = slower)"
+    direction = "slower" if unit == "s" else "more memory"
+    title = (
+        f"{metric_label} delta ({sort}): {a_label} → {b_label} (positive = {direction})"
+    )
     if only_a or only_b:
         title += f"<br><sub>{len(only_a)} only in {a_label}, {len(only_b)} only in {b_label}</sub>"
 
@@ -125,8 +158,9 @@ def plot_compare(
         },
     )
     if sort == "absolute":
-        # SI-prefixed time on the x-axis (e.g. 24 ms, 2.4 ms, 240 µs).
-        fig.update_xaxes(tickformat=".2s", ticksuffix="s")
+        # SI-prefixed time on the x-axis (e.g. 24 ms, 2.4 ms, 240 µs) for
+        # timing snapshots; plain MiB for memory.
+        fig.update_xaxes(**_axis_kwargs(unit))
     fig.update_layout(height=max(500, len(df) * 22), showlegend=False)
     return fig, len(df)
 
@@ -162,7 +196,10 @@ def plot_scatter(
     if len(snapshots) < 2:
         raise ValueError("scatter needs at least 2 snapshots (baseline + 1)")
 
-    loaded = [_load_snapshot(p, metric) for p in snapshots]
+    raw = [_load_snapshot(p, metric) for p in snapshots]
+    unit = _check_same_unit(raw)
+    metric_label = metric if unit == "s" else "peak"
+    loaded = [(label, vals) for label, vals, _ in raw]
     baseline_label, baseline_vals = loaded[0]
 
     # Include the baseline itself as the first animation frame (all points
@@ -241,14 +278,14 @@ def plot_scatter(
             "version": True,
         },
         title=(
-            f"{metric} scatter vs baseline ({baseline_label}) — "
-            "top-right = slow tests that got slower"
+            f"{metric_label} scatter vs baseline ({baseline_label}) — "
+            "top-right = the regressed corner"
         ),
         labels={
-            "baseline_time": f"baseline {metric} (s, log scale)",
-            "ratio": f"{metric} ratio  (candidate / baseline)",
+            "baseline_time": f"baseline {metric_label} ({unit}, log scale)",
+            "ratio": f"{metric_label} ratio  (candidate / baseline)",
             "candidate_time": "candidate",
-            "delta_abs": "Δ (s, p95-clipped)",
+            "delta_abs": f"Δ ({unit}, p95-clipped)",
         },
         **extra,
     )
@@ -269,7 +306,10 @@ def plot_sweep(
     import pandas as pd
     import plotly.express as px
 
-    loaded = [_load_snapshot(p, metric) for p in snapshots]
+    raw = [_load_snapshot(p, metric) for p in snapshots]
+    unit = _check_same_unit(raw)
+    metric_label = metric if unit == "s" else "peak"
+    loaded = [(label, vals) for label, vals, _ in raw]
     versions = [label for label, _ in loaded]
     baseline = loaded[0][1]
     all_tests = sorted(set().union(*[set(vals) for _, vals in loaded]))
@@ -298,7 +338,7 @@ def plot_sweep(
         color_continuous_scale=["green", "white", "red"],
         color_continuous_midpoint=1.0,
         aspect="auto",
-        title=f"{metric} ratio relative to baseline ({versions[0]})",
+        title=f"{metric_label} ratio relative to baseline ({versions[0]})",
         labels={"x": "version", "y": "test", "color": "ratio"},
         text_auto=".2f",
     )
@@ -309,7 +349,7 @@ def plot_sweep(
             "test: %{y}<br>"
             "version: %{x}<br>"
             "ratio: %{z:.3f}<br>"
-            f"{metric}: %{{customdata:.4g}}s"
+            f"{metric_label}: %{{customdata:.4g}}{unit}"
             "<extra></extra>"
         ),
     )
@@ -331,8 +371,23 @@ def plot_scaling(
     # format won't silently produce 0 rows. ``model`` still needs the id
     # regex because spec is stored as an unserializable repr in params.
     data = json.loads(snapshots[0].read_text())
+
+    # Memory snapshots have a flat {test_id: peak_mib} structure and no
+    # benchmark params — fall back to id-regex extraction for size + model.
+    is_memory = "peak_mib" in data
+    unit = "MiB" if is_memory else "s"
+    metric_label = "peak" if is_memory else metric
+
+    if is_memory:
+        benchmarks_iter = [
+            {"fullname": tid, "stats": {metric: val}, "params": {}}
+            for tid, val in data["peak_mib"].items()
+        ]
+    else:
+        benchmarks_iter = data["benchmarks"]
+
     rows = []
-    for bm in data["benchmarks"]:
+    for bm in benchmarks_iter:
         name = bm["fullname"]
         t = bm["stats"][metric]
         params = bm.get("params") or {}
@@ -369,7 +424,9 @@ def plot_scaling(
         log_x=True,
         log_y=True,
         markers=True,
-        title=f"Scaling: {metric} time vs problem size ({snapshots[0].stem})",
+        title=(
+            f"Scaling: {metric_label} ({unit}) vs problem size ({snapshots[0].stem})"
+        ),
     )
     fig.update_layout(height=max(400, ((df["phase"].nunique() + 2) // 3) * 350))
     return fig, len(df)

From cccd476f9623e2e01e813de39f8116f60825ca19 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 17:35:10 +0200
Subject: [PATCH 34/68] benchmarks: compare view drops unchanged tests (esp.
 memory)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

For memory snapshots with --quick model sizes, most peaks round to the
same MiB value across runs, so the compare bar chart had ~80%
zero-length bars at the top — it looked empty until you scrolled to the
bottom where the few real movers sit.

Drop rows where ``a == b`` (both snapshots reported the same value).
Title subtitle now reads ``N unchanged (hidden)`` alongside the
existing ``N only in <label>`` note so the filtering is visible.

For timing data this rarely affects anything (float deltas are
essentially never exactly equal); for memory data it surfaces only
the (phase, spec, size) combos that actually moved.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/plotting.py | 40 +++++++++++++++++++++++++++++-----------
 1 file changed, 29 insertions(+), 11 deletions(-)

diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
index 53249172..8936cc81 100644
--- a/benchmarks/plotting.py
+++ b/benchmarks/plotting.py
@@ -111,16 +111,27 @@ def plot_compare(
             file=sys.stderr,
         )
 
-    rows = [
-        {
-            "test": name,
-            a_label: a_vals[name],
-            b_label: b_vals[name],
-            "delta_abs": b_vals[name] - a_vals[name],
-            "delta_pct": (b_vals[name] - a_vals[name]) / a_vals[name] * 100.0,
-        }
-        for name in common
-    ]
+    rows = []
+    unchanged = 0
+    for name in common:
+        a_v, b_v = a_vals[name], b_vals[name]
+        if a_v == b_v:
+            # No-change entries pad the chart with zero-length bars (a
+            # common situation for memory peaks rounded to MiB). Drop
+            # them so the visible chart only shows tests that moved.
+            unchanged += 1
+            continue
+        rows.append(
+            {
+                "test": name,
+                a_label: a_v,
+                b_label: b_v,
+                "delta_abs": b_v - a_v,
+                "delta_pct": (b_v - a_v) / a_v * 100.0 if a_v else float("inf"),
+            }
+        )
+    if not rows:
+        raise ValueError("no tests changed between the two snapshots")
     df = pd.DataFrame(rows)
     x_col = "delta_abs" if sort == "absolute" else "delta_pct"
     df = df.reindex(df[x_col].abs().sort_values(ascending=True).index)
@@ -136,8 +147,15 @@ def plot_compare(
     title = (
         f"{metric_label} delta ({sort}): {a_label} → {b_label} (positive = {direction})"
     )
+    sub_parts = []
+    if unchanged:
+        sub_parts.append(f"{unchanged} unchanged (hidden)")
     if only_a or only_b:
-        title += f"<br><sub>{len(only_a)} only in {a_label}, {len(only_b)} only in {b_label}</sub>"
+        sub_parts.append(
+            f"{len(only_a)} only in {a_label}, {len(only_b)} only in {b_label}"
+        )
+    if sub_parts:
+        title += "<br><sub>" + " · ".join(sub_parts) + "</sub>"
 
     fig = px.bar(
         df,

From d34824a3c403e1abc43ba16990d3f2064c36d58c Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 17:39:57 +0200
Subject: [PATCH 35/68] benchmarks: fix compare y-axis collision; revert
 unchanged-row filter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Real bug: the row dict literal used ``"test"`` as both the test-name
key and (when a snapshot file was literally named ``test.json``) as the
baseline-value key from ``a_label = path.stem``. Dict-literal semantics
meant the second write silently overwrote the first, so the "test"
column ended up holding baseline values. Plotly rendered those as a
numeric y-axis — the visible chart had no test names and the bars
landed at y-positions corresponding to baseline peaks (which often sit
outside the x-axis range, giving the "empty" appearance).

Fix: rename the test-id column to ``_test_id`` (unlikely to collide
with any reasonable filename) and pass ``labels={"_test_id": ""}`` so
the y-axis label stays blank as before.

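Reverting the unchanged-row filter from the previous commit — that was
treating the symptom. With the collision fixed, all 53 tests render
correctly (zero-delta rows just show as zero-length bars, which is
fine and informative for confirmed-no-change cases). The zero-baseline
guard on ``delta_pct`` (``float("inf")`` when the baseline value is 0)
introduced alongside the filter is kept.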

Verified: memory compare shows 53 tests with proper test-name labels;
timing compare unchanged at 207.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/plotting.py | 53 ++++++++++++++++++------------------------
 1 file changed, 23 insertions(+), 30 deletions(-)

diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
index 8936cc81..fb3a9869 100644
--- a/benchmarks/plotting.py
+++ b/benchmarks/plotting.py
@@ -111,27 +111,24 @@ def plot_compare(
             file=sys.stderr,
         )
 
-    rows = []
-    unchanged = 0
-    for name in common:
-        a_v, b_v = a_vals[name], b_vals[name]
-        if a_v == b_v:
-            # No-change entries pad the chart with zero-length bars (a
-            # common situation for memory peaks rounded to MiB). Drop
-            # them so the visible chart only shows tests that moved.
-            unchanged += 1
-            continue
-        rows.append(
-            {
-                "test": name,
-                a_label: a_v,
-                b_label: b_v,
-                "delta_abs": b_v - a_v,
-                "delta_pct": (b_v - a_v) / a_v * 100.0 if a_v else float("inf"),
-            }
-        )
-    if not rows:
-        raise ValueError("no tests changed between the two snapshots")
+    # Build the dataframe with a uniquely-named test-id column. Snapshot
+    # labels come from filenames (e.g. ``.benchmarks/memory/test.json`` →
+    # ``"test"``) and become column names below; if the test-id column
+    # used the same name, the dict literal would silently overwrite it
+    # and plotly would render the snapshot values on the y-axis instead
+    # of the test names.
+    rows = [
+        {
+            "_test_id": name,
+            a_label: a_vals[name],
+            b_label: b_vals[name],
+            "delta_abs": b_vals[name] - a_vals[name],
+            "delta_pct": (b_vals[name] - a_vals[name]) / a_vals[name] * 100.0
+            if a_vals[name]
+            else float("inf"),
+        }
+        for name in common
+    ]
     df = pd.DataFrame(rows)
     x_col = "delta_abs" if sort == "absolute" else "delta_pct"
     df = df.reindex(df[x_col].abs().sort_values(ascending=True).index)
@@ -147,26 +144,22 @@ def plot_compare(
     title = (
         f"{metric_label} delta ({sort}): {a_label} → {b_label} (positive = {direction})"
     )
-    sub_parts = []
-    if unchanged:
-        sub_parts.append(f"{unchanged} unchanged (hidden)")
     if only_a or only_b:
-        sub_parts.append(
-            f"{len(only_a)} only in {a_label}, {len(only_b)} only in {b_label}"
+        title += (
+            f"<br><sub>{len(only_a)} only in {a_label}, "
+            f"{len(only_b)} only in {b_label}</sub>"
         )
-    if sub_parts:
-        title += "<br><sub>" + " · ".join(sub_parts) + "</sub>"
 
     fig = px.bar(
         df,
         x=x_col,
-        y="test",
+        y="_test_id",
         orientation="h",
         color=x_col,
         color_continuous_scale=["green", "white", "red"],
         color_continuous_midpoint=0,
         title=title,
-        labels={x_col: x_label, "test": ""},
+        labels={x_col: x_label, "_test_id": ""},
         text_auto=text_fmt,
         hover_data={
             a_label: ":.4g",

From d88f2355f2282bd535f9350b86ee591412999c4a Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 17:41:52 +0200
Subject: [PATCH 36/68] benchmarks: compare view renders value text outside
 bars
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Plotly's default for bar traces is ``textposition="auto"``, which tries
to fit the text inside the bar first; that made the labels illegible on
short bars (most rows for memory snapshots, plus the small-Δ rows for
timing). ``update_traces(textposition="outside")`` puts the text past
the end of the bar instead.

``cliponaxis=False`` keeps the text visible even when a bar's value
sits near the plot edge — without it, the text gets clipped by the
axis line.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/plotting.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
index fb3a9869..9b1e51ea 100644
--- a/benchmarks/plotting.py
+++ b/benchmarks/plotting.py
@@ -172,6 +172,9 @@ def plot_compare(
         # SI-prefixed time on the x-axis (e.g. 24 ms, 2.4 ms, 240 µs) for
         # timing snapshots; plain MiB for memory.
         fig.update_xaxes(**_axis_kwargs(unit))
+    # Render the value text outside the bar (default is inside) so the
+    # number stays readable even when a bar is very short.
+    fig.update_traces(textposition="outside", cliponaxis=False)
     fig.update_layout(height=max(500, len(df) * 22), showlegend=False)
     return fig, len(df)
 

From abb3f14105063fd6e68f62817662a78842e0f197 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 17:43:05 +0200
Subject: [PATCH 37/68] benchmarks: compare bars keep alphabetical test-id
 order
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bars and scatter answer different questions:

- Scatter is for *scanning* — top-right corner highlights the biggest
  regressions in any test, anywhere on the suite.
- Bars are for *regions* — what happened to test_build[*],
  test_lp_write[*], etc. as a group.

The magnitude-sort reindex on the compare bar chart scrambled related
tests across the chart, which fights the "look at this region"
workflow. Dropping it leaves the dataframe in the alphabetical order
from ``sorted(common)``, so phases / models cluster contiguously down
the y-axis.

``--sort {absolute,relative}`` still controls which delta the bar
*shows* (and colour-maps); it no longer reorders the rows.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/plotting.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
index 9b1e51ea..33c368e6 100644
--- a/benchmarks/plotting.py
+++ b/benchmarks/plotting.py
@@ -131,7 +131,10 @@ def plot_compare(
     ]
     df = pd.DataFrame(rows)
     x_col = "delta_abs" if sort == "absolute" else "delta_pct"
-    df = df.reindex(df[x_col].abs().sort_values(ascending=True).index)
+    # No reindex by magnitude — alphabetical test_id order (from
+    # ``sorted(common)``) keeps related tests (``test_build[basic-*]``,
+    # ``test_lp_write[knapsack-*]``, ...) visually grouped. The scatter
+    # view is what you use for spotting the biggest outliers.
 
     if sort == "absolute":
         x_label = f"{metric_label} delta ({unit})"

From 914efbf060ae24efe85410508e09fb383edc1155 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 17:48:10 +0200
Subject: [PATCH 38/68] benchmarks: plot gains ``--facets {phase,model}`` for
 compare + scatter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Composes with the existing views instead of adding a new ``facets``
``--view``. ``compare`` (bar chart) and ``scatter`` both honour the
option; ``sweep`` and ``scaling`` accept it (uniform signature) but
ignore it — sweep is already a 2D heatmap and scaling already facets
internally.

What it does:

- ``--facets phase`` splits the chart into one subplot per ``test_*``
  file: ``test_build``, ``test_lp_write``, ``test_matrices``,
  ``test_netcdf_*``, ``test_solver_handoff``. Best for "did everything
  in this phase regress together?".
- ``--facets model`` splits per parametrize model name: ``basic``,
  ``knapsack``, ``qp``, etc. Best for "what happened across this
  model's variants?".
- No facets (default): flat alphabetical bar chart.

Tests whose ids don't match ``[<model>-n=<size>]`` (PyPSA
carbon-management scenarios) land in an ``other`` facet so they
don't silently disappear.

The compare bar's y-axis switches to the short ``model-n=size`` label
inside a facet (the facet header already conveys the phase or model);
without facets the full test id is shown for self-identification.

Hand-tested on timing + memory snapshots; both compare-with-facets
and scatter-with-facets render correctly.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli.py      |  16 +++++-
 benchmarks/plotting.py | 110 ++++++++++++++++++++++++++++++++---------
 2 files changed, 101 insertions(+), 25 deletions(-)

diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index f3ab61e2..c255743d 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -29,7 +29,7 @@
 )
 from benchmarks.memory import compare as memory_compare
 from benchmarks.memory import save as memory_save
-from benchmarks.plotting import Metric, PlotView, SortMode
+from benchmarks.plotting import FacetBy, Metric, PlotView, SortMode
 
 app = typer.Typer(
     help=(
@@ -639,6 +639,18 @@ def plot(
             )
         ),
     ] = "absolute",
+    facets: Annotated[
+        FacetBy | None,
+        typer.Option(
+            "--facets",
+            help=(
+                "Split compare / scatter into subplots by ``phase`` (test "
+                "file) or ``model`` (parametrize id). Default: no faceting. "
+                "Tests whose ids don't match ``[<model>-n=<size>]`` (e.g. "
+                "PyPSA carbon-management) land in an ``other`` facet."
+            ),
+        ),
+    ] = None,
     output: Annotated[
         Path | None,
         typer.Option(
@@ -725,7 +737,7 @@ def plot(
         output = Path(".benchmarks") / "plots" / f"{chosen}.html"
 
     try:
-        fig, n_tests = RENDERERS[chosen](snapshots, metric, sort)
+        fig, n_tests = RENDERERS[chosen](snapshots, metric, sort, facets)
     except ValueError as exc:
         typer.secho(str(exc), fg=typer.colors.RED, err=True)
         raise typer.Exit(code=1) from exc
diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
index 33c368e6..b16cc4e9 100644
--- a/benchmarks/plotting.py
+++ b/benchmarks/plotting.py
@@ -33,6 +33,7 @@
 PlotView = Literal["compare", "scatter", "sweep", "scaling"]
 Metric = Literal["min", "median", "mean", "max"]
 SortMode = Literal["absolute", "relative"]
+FacetBy = Literal["phase", "model"]
 
 _SIZE_RE = re.compile(r"(.*)\[([^\[\]]+?)-n=(\d+)\]")
 
@@ -78,16 +79,29 @@ def plot_compare(
     snapshots: list[Path],
     metric: Metric = "min",
     sort: SortMode = "absolute",
+    facets: FacetBy | None = None,
 ) -> tuple[Figure, int]:
     """
-    Bar chart of delta per test, sorted by magnitude.
-
-    ``sort="absolute"`` (default): bar = (b - a) seconds, sort by the
-    largest actual time impact. Best for "what change actually affected
-    total runtime?" — avoids over-weighting cheap microsecond tests.
-
-    ``sort="relative"``: bar = (b/a - 1) * 100 %, sort by the largest
-    proportional change. Best for "what got proportionally worse?".
+    Bar chart of delta per test, in alphabetical test-id order.
+
+    ``sort`` chooses the bar *dimension*: ``absolute`` (default) plots
+    ``b - a`` in the data's native unit; ``relative`` plots the percent
+    change. Bars are not reordered by magnitude — alphabetical ids keep
+    related tests visually grouped. Use the scatter view for hunting
+    outliers.
+
+    ``facets`` splits the chart into subplots:
+
+    - ``None`` (default): one flat bar chart.
+    - ``"phase"``: facet by the test file (``test_build``,
+      ``test_lp_write``, ...). Best for "everything in this phase moved
+      together?".
+    - ``"model"``: facet by the model name (``basic``, ``knapsack``, ...).
+      Best for "what happened across all the basic-sized variants?".
+
+    Tests whose IDs don't match the standard ``[<model>-n=<size>]``
+    parametrize shape (e.g. PyPSA carbon-management) land in an
+    ``other`` facet.
     """
     import pandas as pd
     import plotly.express as px
@@ -117,18 +131,34 @@ def plot_compare(
     # used the same name, the dict literal would silently overwrite it
     # and plotly would render the snapshot values on the y-axis instead
     # of the test names.
-    rows = [
-        {
-            "_test_id": name,
-            a_label: a_vals[name],
-            b_label: b_vals[name],
-            "delta_abs": b_vals[name] - a_vals[name],
-            "delta_pct": (b_vals[name] - a_vals[name]) / a_vals[name] * 100.0
-            if a_vals[name]
-            else float("inf"),
-        }
-        for name in common
-    ]
+    rows = []
+    for name in common:
+        m = _SIZE_RE.match(name)
+        if m:
+            phase_path, model, n = m.groups()
+            phase = phase_path.split("::")[-1]
+            short = f"{model}-n={n}"
+        else:
+            # Tests that don't match the parametrize pattern (PyPSA
+            # carbon-management scenarios, etc.) — keep them visible
+            # under an "other" bucket.
+            phase = "other"
+            model = "other"
+            short = name.split("::")[-1] if "::" in name else name
+        rows.append(
+            {
+                "_test_id": name,
+                "_phase": phase,
+                "_model": model,
+                "_short": short,
+                a_label: a_vals[name],
+                b_label: b_vals[name],
+                "delta_abs": b_vals[name] - a_vals[name],
+                "delta_pct": (b_vals[name] - a_vals[name]) / a_vals[name] * 100.0
+                if a_vals[name]
+                else float("inf"),
+            }
+        )
     df = pd.DataFrame(rows)
     x_col = "delta_abs" if sort == "absolute" else "delta_pct"
     # No reindex by magnitude — alphabetical test_id order (from
@@ -153,23 +183,38 @@ def plot_compare(
             f"{len(only_b)} only in {b_label}</sub>"
         )
 
+    # Faceted layout uses the short ``model-n=size`` y-label (the facet
+    # already conveys the phase or model); flat layout uses the full
+    # test-id so each bar is self-identifying.
+    facet_kwargs: dict = {}
+    if facets == "phase":
+        facet_kwargs = {"facet_col": "_phase", "facet_col_wrap": 2}
+        y_col = "_short"
+    elif facets == "model":
+        facet_kwargs = {"facet_col": "_model", "facet_col_wrap": 3}
+        y_col = "_short"
+    else:
+        y_col = "_test_id"
+
     fig = px.bar(
         df,
         x=x_col,
-        y="_test_id",
+        y=y_col,
         orientation="h",
         color=x_col,
         color_continuous_scale=["green", "white", "red"],
         color_continuous_midpoint=0,
         title=title,
-        labels={x_col: x_label, "_test_id": ""},
+        labels={x_col: x_label, y_col: ""},
         text_auto=text_fmt,
         hover_data={
+            "_test_id": True,
             a_label: ":.4g",
             b_label: ":.4g",
             "delta_abs": ":.4g",
             "delta_pct": ":.2f",
         },
+        **facet_kwargs,
     )
     if sort == "absolute":
         # SI-prefixed time on the x-axis (e.g. 24 ms, 2.4 ms, 240 µs) for
@@ -186,6 +231,7 @@ def plot_scatter(
     snapshots: list[Path],
     metric: Metric = "min",
     sort: SortMode = "absolute",  # noqa: ARG001  (uniform signature, unused here)
+    facets: FacetBy | None = None,
 ) -> tuple[Figure, int]:
     """
     Two-axis scatter — baseline cost on log-x, ratio on y.
@@ -229,6 +275,13 @@ def plot_scatter(
             a, b = baseline_vals[name], vals[name]
             if a <= 0:
                 continue
+            m = _SIZE_RE.match(name)
+            if m:
+                phase_path, model, _ = m.groups()
+                phase = phase_path.split("::")[-1]
+            else:
+                phase = "other"
+                model = "other"
             rows.append(
                 {
                     "test": name,
@@ -238,6 +291,8 @@ def plot_scatter(
                     "ratio": b / a,
                     "delta_abs": b - a,
                     "delta_pct": (b - a) / a * 100.0,
+                    "_phase": phase,
+                    "_model": model,
                 }
             )
 
@@ -273,6 +328,12 @@ def plot_scatter(
     if animate:
         extra["animation_frame"] = "version"
         extra["category_orders"] = {"version": [label for label, _ in loaded]}
+    if facets == "phase":
+        extra["facet_col"] = "_phase"
+        extra["facet_col_wrap"] = 2
+    elif facets == "model":
+        extra["facet_col"] = "_model"
+        extra["facet_col_wrap"] = 3
 
     fig = px.scatter(
         df,
@@ -318,6 +379,7 @@ def plot_sweep(
     snapshots: list[Path],
     metric: Metric = "min",
     sort: SortMode = "absolute",  # noqa: ARG001  (uniform signature, unused here)
+    facets: FacetBy | None = None,  # noqa: ARG001  (uniform signature, unused here)
 ) -> tuple[Figure, int]:
     """Heatmap of per-test ratio relative to the first snapshot."""
     import pandas as pd
@@ -378,6 +440,7 @@ def plot_scaling(
     snapshots: list[Path],
     metric: Metric = "min",
     sort: SortMode = "absolute",  # noqa: ARG001  (uniform signature, unused here)
+    facets: FacetBy | None = None,  # noqa: ARG001  (uniform signature, unused here)
 ) -> tuple[Figure, int]:
     """Log-log time vs N for size-parametrized tests, faceted by phase."""
     import pandas as pd
@@ -450,7 +513,8 @@ def plot_scaling(
 
 
 RENDERERS: dict[
-    PlotView, Callable[[list[Path], Metric, SortMode], tuple[Figure, int]]
+    PlotView,
+    Callable[[list[Path], Metric, SortMode, FacetBy | None], tuple[Figure, int]],
 ] = {
     "compare": plot_compare,
     "scatter": plot_scatter,

From eb687f1fa385134088d06d28ca6a31a54b09968e Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 17:52:19 +0200
Subject: [PATCH 39/68] benchmarks: faceted compare/scatter share one x + y
 axis label
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Plotly express renders the per-facet axis titles on every subplot by
default — with 5-6 facets wrapped into 2 columns that's a lot of
repeated noise around the edges.

New ``_share_axis_labels`` helper:
- Clears every individual x/y axis title via ``for_each_*axis``.
- Adds one rotated y-axis annotation on the left and one x-axis
  annotation at the bottom, in paper coordinates so they span the
  whole figure.
- Bumps the left + bottom margins so the shared labels don't get
  clipped.

Applied to ``plot_compare`` (y="test", x=<delta unit>) and
``plot_scatter`` (y="<metric> ratio", x="baseline (<unit>, log scale)")
only when ``facets`` is set. Without facets nothing changes.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/plotting.py | 45 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)

diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
index b16cc4e9..c45775bc 100644
--- a/benchmarks/plotting.py
+++ b/benchmarks/plotting.py
@@ -75,6 +75,43 @@ def _axis_kwargs(unit: str) -> dict:
     return {"ticksuffix": f" {unit}"}
 
 
+def _share_axis_labels(fig, y_label: str, x_label: str) -> None:
+    """
+    Replace per-facet axis titles with one shared label per axis.
+
+    Plotly express renders the x/y titles on every facet by default,
+    which is noisy when faceting wraps a 5+ subplot grid. This clears
+    them and adds two ``paper``-coordinate annotations: one on the
+    left (rotated) for ``y_label``, one on the bottom for ``x_label``.
+    Leave either blank to skip that side.
+    """
+    fig.for_each_yaxis(lambda yaxis: yaxis.update(title_text=""))
+    fig.for_each_xaxis(lambda xaxis: xaxis.update(title_text=""))
+    if y_label:
+        fig.add_annotation(
+            text=y_label,
+            xref="paper",
+            yref="paper",
+            x=-0.05,
+            y=0.5,
+            textangle=-90,
+            showarrow=False,
+            font={"size": 13},
+        )
+    if x_label:
+        fig.add_annotation(
+            text=x_label,
+            xref="paper",
+            yref="paper",
+            x=0.5,
+            y=-0.08,
+            showarrow=False,
+            font={"size": 13},
+        )
+    # Give the annotations room.
+    fig.update_layout(margin={"l": 90, "b": 70})
+
+
 def plot_compare(
     snapshots: list[Path],
     metric: Metric = "min",
@@ -223,6 +260,8 @@ def plot_compare(
     # Render the value text outside the bar (default is inside) so the
     # number stays readable even when a bar is very short.
     fig.update_traces(textposition="outside", cliponaxis=False)
+    if facets is not None:
+        _share_axis_labels(fig, y_label="test", x_label=x_label)
     fig.update_layout(height=max(500, len(df) * 22), showlegend=False)
     return fig, len(df)
 
@@ -371,6 +410,12 @@ def plot_scatter(
         y=1.0, line_dash="dash", line_color="grey", annotation_text="no change"
     )
     fig.update_traces(marker=dict(size=8, line=dict(width=0.5, color="DarkSlateGrey")))
+    if facets is not None:
+        _share_axis_labels(
+            fig,
+            y_label=f"{metric_label} ratio (candidate / baseline)",
+            x_label=f"baseline {metric_label} ({unit}, log scale)",
+        )
     fig.update_layout(height=600)
     return fig, int(df["test"].nunique())
 

From 5a08e79ed63ab58a21afc836c38b07ed32f2999a Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 17:56:58 +0200
Subject: [PATCH 40/68] benchmarks: notebook showcases ``--facets phase`` after
 compare/scatter

Adds a new section after the compare bar demo:

- Markdown intro explains ``--facets phase`` (one subplot per
  ``test_*`` file) and notes ``--facets model`` works the same way.
- Code cell calls ``bench(f"python -m benchmarks plot --view scatter
  --facets phase {baseline_snap} {candidate_snap}")`` to inline the
  faceted view.

Notebook end-to-end runtime: ~38s (was ~37s; the extra plot
generation is essentially free since the snapshots are already built).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/notebooks/registry_usage.ipynb | 30 ++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/benchmarks/notebooks/registry_usage.ipynb b/benchmarks/notebooks/registry_usage.ipynb
index ea3d6d62..f68f35f4 100644
--- a/benchmarks/notebooks/registry_usage.ipynb
+++ b/benchmarks/notebooks/registry_usage.ipynb
@@ -641,6 +641,34 @@
    "cell_type": "markdown",
    "id": "38",
    "metadata": {},
+   "source": [
+    "### Faceting — one subplot per phase\n",
+    "\n",
+    "`--facets phase` splits the chart so all `test_build[*]` tests sit in\n",
+    "one subplot, all `test_lp_write[*]` in another, etc. Best for \"did\n",
+    "something move across this whole phase?\" Works on both `--view compare`\n",
+    "and `--view scatter`; `--facets model` does the same split keyed by the\n",
+    "parametrize model name. Shared axis labels and a tidy 2-column wrap\n",
+    "keep the layout legible."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "39",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "bench(\n",
+    "    f\"python -m benchmarks plot --view scatter --facets phase \"\n",
+    "    f\"{baseline_snap} {candidate_snap}\"\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "40",
+   "metadata": {},
    "source": [
     "## Extending\n",
     "\n",
@@ -657,7 +685,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "39",
+   "id": "41",
    "metadata": {},
    "source": [
     "### Regenerating the lockfile\n",

From e24451a625e0ef0cf90461dbdea746cc2b8f9cf3 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 18:04:23 +0200
Subject: [PATCH 41/68] =?UTF-8?q?benchmarks:=20faceted=20compare=20?=
 =?UTF-8?q?=E2=80=94=20per-facet=20rows,=20shared=20y-tick=20labels=20per?=
 =?UTF-8?q?=20row?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Four coupled fixes after --facets model produced empty bars across
every facet:

1. Phase-aware short y-labels. Facet by phase → ``model-n=N``; facet
   by model → ``phase-n=N``. The facet header already encodes the
   other dimension.
2. Independent y-axes per facet (``matches=None``). Each facet's
   y-axis lists only its own categories — no empty rows for tests
   that belong to other facets.
3. Shared y-tick labels per row via ``_hide_non_leftmost_yticks``.
   Hidden on every facet except the leftmost column of the wrap grid
   so labels appear once per row, not at every subplot's edge.
4. Per-facet height calculation. Without faceting we sized for total
   bar count; with independent per-facet category sets we now size
   for ``max-rows-in-any-facet × n-facet-rows``.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/plotting.py | 55 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 46 insertions(+), 9 deletions(-)

diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
index c45775bc..d690e72d 100644
--- a/benchmarks/plotting.py
+++ b/benchmarks/plotting.py
@@ -75,6 +75,22 @@ def _axis_kwargs(unit: str) -> dict:
     return {"ticksuffix": f" {unit}"}
 
 
+def _hide_non_leftmost_yticks(fig, wrap: int) -> None:
+    """
+    Hide y-axis tick labels on every facet except the leftmost column.
+
+    Plotly express lays facets out left-to-right, top-to-bottom: with
+    ``facet_col_wrap=N`` the leftmost facets are at indices 0, N, 2N…
+    Hiding tick labels on the rest keeps the row labels visible only
+    once per row instead of repeating at every subplot's left edge.
+    """
+    yaxes = []
+    fig.for_each_yaxis(lambda y: yaxes.append(y))
+    for idx, yaxis in enumerate(yaxes):
+        if idx % wrap != 0:
+            yaxis.update(showticklabels=False)
+
+
 def _share_axis_labels(fig, y_label: str, x_label: str) -> None:
     """
     Replace per-facet axis titles with one shared label per axis.
@@ -174,20 +190,28 @@ def plot_compare(
         if m:
             phase_path, model, n = m.groups()
             phase = phase_path.split("::")[-1]
-            short = f"{model}-n={n}"
+            n_str = f"n={n}"
+            # The y-label inside a facet should be whichever attributes
+            # *vary* there — facet by phase → label is model+size; facet
+            # by model → label is phase+size; otherwise the full id.
+            short_by_phase_facet = f"{model}-{n_str}"
+            short_by_model_facet = f"{phase}-{n_str}"
         else:
             # Tests that don't match the parametrize pattern (PyPSA
             # carbon-management scenarios, etc.) — keep them visible
             # under an "other" bucket.
             phase = "other"
             model = "other"
-            short = name.split("::")[-1] if "::" in name else name
+            tail = name.split("::")[-1] if "::" in name else name
+            short_by_phase_facet = tail
+            short_by_model_facet = tail
         rows.append(
             {
                 "_test_id": name,
                 "_phase": phase,
                 "_model": model,
-                "_short": short,
+                "_short_phase": short_by_phase_facet,
+                "_short_model": short_by_model_facet,
                 a_label: a_vals[name],
                 b_label: b_vals[name],
                 "delta_abs": b_vals[name] - a_vals[name],
@@ -220,16 +244,16 @@ def plot_compare(
             f"{len(only_b)} only in {b_label}</sub>"
         )
 
-    # Faceted layout uses the short ``model-n=size`` y-label (the facet
-    # already conveys the phase or model); flat layout uses the full
-    # test-id so each bar is self-identifying.
+    # Faceted layout uses a phase-aware short y-label so the y-axis only
+    # shows the attributes that vary inside the facet; flat layout uses
+    # the full test-id so each bar is self-identifying.
     facet_kwargs: dict = {}
     if facets == "phase":
         facet_kwargs = {"facet_col": "_phase", "facet_col_wrap": 2}
-        y_col = "_short"
+        y_col = "_short_phase"
     elif facets == "model":
         facet_kwargs = {"facet_col": "_model", "facet_col_wrap": 3}
-        y_col = "_short"
+        y_col = "_short_model"
     else:
         y_col = "_test_id"
 
@@ -261,8 +285,21 @@ def plot_compare(
     # number stays readable even when a bar is very short.
     fig.update_traces(textposition="outside", cliponaxis=False)
     if facets is not None:
+        # Each facet keeps its own y category list (no shared rows full
+        # of empty bars), but we hide tick labels on non-leftmost facets
+        # within each row so the labels only appear once per row instead
+        # of being repeated at every subplot's left edge.
+        fig.update_yaxes(matches=None)
+        _hide_non_leftmost_yticks(fig, wrap=facet_kwargs["facet_col_wrap"])
         _share_axis_labels(fig, y_label="test", x_label=x_label)
-    fig.update_layout(height=max(500, len(df) * 22), showlegend=False)
+        facet_count = df[facet_kwargs["facet_col"]].nunique()
+        rows_per_facet = df.groupby(facet_kwargs["facet_col"])[y_col].nunique().max()
+        wrap = facet_kwargs["facet_col_wrap"]
+        n_rows = (facet_count + wrap - 1) // wrap
+        height = max(500, int(n_rows * rows_per_facet * 24) + 100)
+    else:
+        height = max(500, len(df) * 22)
+    fig.update_layout(height=height, showlegend=False)
     return fig, len(df)
 
 

From f4917dd7430658c14fdca3d8ccc80fd213c048b6 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 18:37:39 +0200
Subject: [PATCH 42/68] benchmarks: scatter as default compare view + expose
 load_long_df
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two small UX wins for plot consumers:

- Default view for 2 snapshots is now ``scatter`` (was ``compare``).
  Each test sits at its own ``(baseline_time, ratio)`` coords with
  hover labels — no aggregation, so size-semantics mismatches across
  models (n=100 for ``basic`` vs ``pypsa_scigrid``) don't muddy the
  picture. ``compare`` (delta bars) is still one ``--view compare``
  away.
- Re-export ``load_long_df`` from ``benchmarks`` so callers can grab
  the tidy DataFrame in one line without importing the plotting
  module. ``df, unit = load_long_df([Path(...), Path(...)])``.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/__init__.py |  21 ++-
 benchmarks/cli.py      |   5 +-
 benchmarks/plotting.py | 353 ++++++++++++++++++-----------------------
 3 files changed, 179 insertions(+), 200 deletions(-)

diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
index ceaaff77..9c709d73 100644
--- a/benchmarks/__init__.py
+++ b/benchmarks/__init__.py
@@ -24,7 +24,25 @@
 
 # Importing the models package triggers each module's ``register(...)`` call.
 from benchmarks import models  # noqa: F401, E402
-from benchmarks.registry import (  # noqa: F401 — re-export
+
+
+def load_long_df(snapshots, metric="min"):
+    """
+    Load one or more pytest-benchmark JSON snapshots into a tidy DataFrame.
+
+    Thin re-export of :func:`benchmarks.plotting.load_long_df` so callers
+    can do their own analysis without importing the plotting module
+    (which pulls in plotly). Returns ``(df, unit)`` where ``df`` has one
+    row per ``(snapshot, test_id)`` with columns ``snapshot, test_id,
+    phase, model, size, value``, and ``unit`` is ``"s"`` (timing) or
+    ``"MiB"`` (memory).
+    """
+    from benchmarks.plotting import load_long_df as _impl
+
+    return _impl(snapshots, metric)
+
+
+from benchmarks.registry import (  # noqa: F401, E402 — re-export
     ALL_FEATURES,
     ALL_PHASES,
     BINARY,
@@ -78,6 +96,7 @@
     "filter_by",
     "get",
     "iter_params",
+    "load_long_df",
     "param_ids",
     "register",
 ]
diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index c255743d..547f580d 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -614,7 +614,8 @@ def plot(
         typer.Option(
             help=(
                 "Which plot to produce. Default: ``scaling`` for 1 input, "
-                "``compare`` for 2, ``sweep`` for 3+."
+                "``scatter`` for 2, ``sweep`` for 3+. ``compare`` (delta "
+                "bar chart) is still available via ``--view compare``."
             )
         ),
     ] = None,
@@ -698,7 +699,7 @@ def plot(
     chosen = view or (
         "scaling"
         if len(snapshots) == 1
-        else "compare"
+        else "scatter"
         if len(snapshots) == 2
         else "sweep"
     )
diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
index d690e72d..fa93078f 100644
--- a/benchmarks/plotting.py
+++ b/benchmarks/plotting.py
@@ -68,6 +68,57 @@ def _check_same_unit(snapshots: list[tuple[str, dict[str, float], str]]) -> str:
     return next(iter(units))
 
 
+def _parse_test_id(test_id: str) -> tuple[str, str, int | None]:
+    """
+    Return ``(phase, model, size)`` for a pytest test id.
+
+    Falls back to ``("other", "other", None)`` for ids that don't match
+    the ``benchmarks/test_<phase>.py::test_<phase>[<model>-n=<size>]``
+    parametrize shape (e.g. ``test_pypsa_carbon_management``).
+    """
+    m = _SIZE_RE.match(test_id)
+    if m:
+        phase = m.group(1).split("::")[-1]
+        return phase, m.group(2), int(m.group(3))
+    return "other", "other", None
+
+
+def load_long_df(snapshots: list[Path], metric: Metric = "min"):
+    """
+    Return ``(df, unit)`` — one row per ``(snapshot, test_id)`` pair.
+
+    Columns: ``snapshot``, ``test_id``, ``phase``, ``model``, ``size``
+    (``Int64``-nullable for the "other" bucket), ``value``. ``unit`` is
+    the shared unit string (``"s"`` for timing, ``"MiB"`` for memory)
+    — every loaded snapshot must agree.
+
+    Every plot view downstream pivots or filters this single frame so
+    test-id parsing, unit checking, and the "x snapshots, y tests"
+    matrix logic all live in one place.
+    """
+    import pandas as pd
+
+    raw = [_load_snapshot(p, metric) for p in snapshots]
+    unit = _check_same_unit(raw)
+    rows = []
+    for label, vals, _ in raw:
+        for test_id, value in vals.items():
+            phase, model, size = _parse_test_id(test_id)
+            rows.append(
+                {
+                    "snapshot": label,
+                    "test_id": test_id,
+                    "phase": phase,
+                    "model": model,
+                    "size": size,
+                    "value": value,
+                }
+            )
+    df = pd.DataFrame(rows)
+    df["size"] = df["size"].astype("Int64")
+    return df, unit
+
+
 def _axis_kwargs(unit: str) -> dict:
     """Return ``update_xaxes`` kwargs for a given unit."""
     if unit == "s":
@@ -156,76 +207,44 @@ def plot_compare(
     parametrize shape (e.g. PyPSA carbon-management) land in an
     ``other`` facet.
     """
-    import pandas as pd
+    import sys
+
     import plotly.express as px
 
-    loaded = [_load_snapshot(p, metric) for p in snapshots[:2]]
-    unit = _check_same_unit(loaded)
+    df_long, unit = load_long_df(snapshots[:2], metric)
     metric_label = metric if unit == "s" else "peak"
-    (a_label, a_vals, _), (b_label, b_vals, _) = loaded
-    common = sorted(set(a_vals) & set(b_vals))
-    only_a = sorted(set(a_vals) - set(b_vals))
-    only_b = sorted(set(b_vals) - set(a_vals))
-    if not common:
-        raise ValueError("no tests in common between the two snapshots")
-    if only_a or only_b:
-        # Surface the mismatch so silent intersection isn't a footgun.
-        import sys
 
+    labels = df_long["snapshot"].drop_duplicates().tolist()
+    a_label, b_label = labels[0], labels[1]
+
+    # Pivot to wide: one row per test, baseline + candidate as columns,
+    # phase / model / size carried through. Then compute deltas
+    # vectorised — no per-row dict construction.
+    wide = (
+        df_long.pivot(
+            index=["test_id", "phase", "model", "size"],
+            columns="snapshot",
+            values="value",
+        )
+        .reset_index()
+        .rename_axis(columns=None)
+    )
+    only_a = wide[wide[a_label].notna() & wide[b_label].isna()]
+    only_b = wide[wide[a_label].isna() & wide[b_label].notna()]
+    df = wide.dropna(subset=[a_label, b_label]).copy()
+    if df.empty:
+        raise ValueError("no tests in common between the two snapshots")
+    if len(only_a) or len(only_b):
         print(
             f"compare: {len(only_a)} test(s) only in {a_label}, "
-            f"{len(only_b)} only in {b_label} (intersection: {len(common)}).",
+            f"{len(only_b)} only in {b_label} (intersection: {len(df)}).",
             file=sys.stderr,
         )
 
-    # Build the dataframe with a uniquely-named test-id column. Snapshot
-    # labels come from filenames (e.g. ``.benchmarks/memory/test.json`` →
-    # ``"test"``) and become column names below; if the test-id column
-    # used the same name, the dict literal would silently overwrite it
-    # and plotly would render the snapshot values on the y-axis instead
-    # of the test names.
-    rows = []
-    for name in common:
-        m = _SIZE_RE.match(name)
-        if m:
-            phase_path, model, n = m.groups()
-            phase = phase_path.split("::")[-1]
-            n_str = f"n={n}"
-            # The y-label inside a facet should be whichever attributes
-            # *vary* there — facet by phase → label is model+size; facet
-            # by model → label is phase+size; otherwise the full id.
-            short_by_phase_facet = f"{model}-{n_str}"
-            short_by_model_facet = f"{phase}-{n_str}"
-        else:
-            # Tests that don't match the parametrize pattern (PyPSA
-            # carbon-management scenarios, etc.) — keep them visible
-            # under an "other" bucket.
-            phase = "other"
-            model = "other"
-            tail = name.split("::")[-1] if "::" in name else name
-            short_by_phase_facet = tail
-            short_by_model_facet = tail
-        rows.append(
-            {
-                "_test_id": name,
-                "_phase": phase,
-                "_model": model,
-                "_short_phase": short_by_phase_facet,
-                "_short_model": short_by_model_facet,
-                a_label: a_vals[name],
-                b_label: b_vals[name],
-                "delta_abs": b_vals[name] - a_vals[name],
-                "delta_pct": (b_vals[name] - a_vals[name]) / a_vals[name] * 100.0
-                if a_vals[name]
-                else float("inf"),
-            }
-        )
-    df = pd.DataFrame(rows)
+    df["delta_abs"] = df[b_label] - df[a_label]
+    df["delta_pct"] = (df["delta_abs"] / df[a_label]) * 100.0
+    df = df.sort_values("test_id").reset_index(drop=True)
     x_col = "delta_abs" if sort == "absolute" else "delta_pct"
-    # No reindex by magnitude — alphabetical test_id order (from
-    # ``sorted(common)``) keeps related tests (``test_build[basic-*]``,
-    # ``test_lp_write[knapsack-*]``, ...) visually grouped. The scatter
-    # view is what you use for spotting the biggest outliers.
 
     if sort == "absolute":
         x_label = f"{metric_label} delta ({unit})"
@@ -238,24 +257,29 @@ def plot_compare(
     title = (
         f"{metric_label} delta ({sort}): {a_label} → {b_label} (positive = {direction})"
     )
-    if only_a or only_b:
+    if len(only_a) or len(only_b):
         title += (
             f"<br><sub>{len(only_a)} only in {a_label}, "
             f"{len(only_b)} only in {b_label}</sub>"
         )
 
-    # Faceted layout uses a phase-aware short y-label so the y-axis only
-    # shows the attributes that vary inside the facet; flat layout uses
-    # the full test-id so each bar is self-identifying.
+    # Inside a facet the y-axis labels whatever *varies* — drop the
+    # facetted dimension from the label, keep the rest. Flat ⇒ the full
+    # test_id so each bar is self-identifying.
     facet_kwargs: dict = {}
-    if facets == "phase":
-        facet_kwargs = {"facet_col": "_phase", "facet_col_wrap": 2}
-        y_col = "_short_phase"
-    elif facets == "model":
-        facet_kwargs = {"facet_col": "_model", "facet_col_wrap": 3}
-        y_col = "_short_model"
+    if facets is None:
+        y_col = "test_id"
     else:
-        y_col = "_test_id"
+        varying = "model" if facets == "phase" else "phase"
+        size_str = df["size"].astype("Int64").astype(str)
+        df["_short"] = df[varying] + "-n=" + size_str
+        other_mask = df["phase"] == "other"
+        df.loc[other_mask, "_short"] = (
+            df.loc[other_mask, "test_id"].str.split("::").str[-1]
+        )
+        y_col = "_short"
+        facet_kwargs = {"facet_col": facets}
+        facet_kwargs["facet_col_wrap"] = 2 if facets == "phase" else 3
 
     fig = px.bar(
         df,
@@ -269,7 +293,7 @@ def plot_compare(
         labels={x_col: x_label, y_col: ""},
         text_auto=text_fmt,
         hover_data={
-            "_test_id": True,
+            "test_id": True,
             a_label: ":.4g",
             b_label: ":.4g",
             "delta_abs": ":.4g",
@@ -287,16 +311,19 @@ def plot_compare(
     if facets is not None:
         # Each facet keeps its own y category list (no shared rows full
         # of empty bars), but we hide tick labels on non-leftmost facets
-        # within each row so the labels only appear once per row instead
-        # of being repeated at every subplot's left edge.
+        # within each row so labels appear once per row.
         fig.update_yaxes(matches=None)
-        _hide_non_leftmost_yticks(fig, wrap=facet_kwargs["facet_col_wrap"])
-        _share_axis_labels(fig, y_label="test", x_label=x_label)
-        facet_count = df[facet_kwargs["facet_col"]].nunique()
-        rows_per_facet = df.groupby(facet_kwargs["facet_col"])[y_col].nunique().max()
         wrap = facet_kwargs["facet_col_wrap"]
-        n_rows = (facet_count + wrap - 1) // wrap
-        height = max(500, int(n_rows * rows_per_facet * 24) + 100)
+        _hide_non_leftmost_yticks(fig, wrap=wrap)
+        _share_axis_labels(fig, y_label="test", x_label=x_label)
+        # Per-wrap-row equal-share layout is plotly's default. Facets
+        # with fewer categories than the row max will show empty space
+        # below their bars — visually loose but the facet header
+        # annotations stay correctly positioned, which a manual
+        # ``domain`` override would scramble.
+        rows_per_facet = df.groupby(facets)[y_col].nunique().max()
+        n_wrap_rows = (df[facets].nunique() + wrap - 1) // wrap
+        height = max(500, int(n_wrap_rows * rows_per_facet * 24) + 100)
     else:
         height = max(500, len(df) * 22)
     fig.update_layout(height=height, showlegend=False)
@@ -329,56 +356,38 @@ def plot_scatter(
     to see; the colour encodes absolute Δ as a third channel.
     """
     import numpy as np
-    import pandas as pd
     import plotly.express as px
 
     if len(snapshots) < 2:
         raise ValueError("scatter needs at least 2 snapshots (baseline + 1)")
 
-    raw = [_load_snapshot(p, metric) for p in snapshots]
-    unit = _check_same_unit(raw)
+    df_long, unit = load_long_df(snapshots, metric)
     metric_label = metric if unit == "s" else "peak"
-    loaded = [(label, vals) for label, vals, _ in raw]
-    baseline_label, baseline_vals = loaded[0]
-
-    # Include the baseline itself as the first animation frame (all points
-    # at ratio=1, Δ=0). Gives the animation a "before anything happened"
-    # anchor and makes the visual drift across frames easier to read.
-    rows = []
-    for label, vals in loaded:
-        common = sorted(set(baseline_vals) & set(vals))
-        for name in common:
-            a, b = baseline_vals[name], vals[name]
-            if a <= 0:
-                continue
-            m = _SIZE_RE.match(name)
-            if m:
-                phase_path, model, _ = m.groups()
-                phase = phase_path.split("::")[-1]
-            else:
-                phase = "other"
-                model = "other"
-            rows.append(
-                {
-                    "test": name,
-                    "version": label,
-                    "baseline_time": a,
-                    "candidate_time": b,
-                    "ratio": b / a,
-                    "delta_abs": b - a,
-                    "delta_pct": (b - a) / a * 100.0,
-                    "_phase": phase,
-                    "_model": model,
-                }
-            )
 
-    if not rows:
+    labels = df_long["snapshot"].drop_duplicates().tolist()
+    baseline_label = labels[0]
+
+    # Attach the baseline value to every row via a per-test groupby (each
+    # test's baseline = its value on the first snapshot). Tests with no
+    # baseline row (only in non-baseline snapshots) are dropped. Tests
+    # with non-positive baseline are dropped because the ratio is
+    # undefined for them.
+    baseline_vals = df_long.loc[
+        df_long["snapshot"] == baseline_label, ["test_id", "value"]
+    ].rename(columns={"value": "baseline_time"})
+    df = df_long.merge(baseline_vals, on="test_id", how="inner")
+    df = df[df["baseline_time"] > 0].copy()
+    if df.empty:
         raise ValueError(
             f"no tests in common between baseline ({baseline_label}) "
             "and any of the other snapshots"
         )
 
-    df = pd.DataFrame(rows)
+    df = df.rename(columns={"snapshot": "version", "value": "candidate_time"})
+    df["ratio"] = df["candidate_time"] / df["baseline_time"]
+    df["delta_abs"] = df["candidate_time"] - df["baseline_time"]
+    df["delta_pct"] = df["delta_abs"] / df["baseline_time"] * 100.0
+    df = df.rename(columns={"test_id": "test"})
     # Fix the axis ranges so the animation doesn't jitter; pad by a small
     # margin so points on the edges aren't clipped.
     x_lo, x_hi = df["baseline_time"].min(), df["baseline_time"].max()
@@ -403,13 +412,10 @@ def plot_scatter(
     extra: dict = {}
     if animate:
         extra["animation_frame"] = "version"
-        extra["category_orders"] = {"version": [label for label, _ in loaded]}
-    if facets == "phase":
-        extra["facet_col"] = "_phase"
-        extra["facet_col_wrap"] = 2
-    elif facets == "model":
-        extra["facet_col"] = "_model"
-        extra["facet_col_wrap"] = 3
+        extra["category_orders"] = {"version": labels}
+    if facets is not None:
+        extra["facet_col"] = facets
+        extra["facet_col_wrap"] = 2 if facets == "phase" else 3
 
     fig = px.scatter(
         df,
@@ -464,35 +470,25 @@ def plot_sweep(
     facets: FacetBy | None = None,  # noqa: ARG001  (uniform signature, unused here)
 ) -> tuple[Figure, int]:
     """Heatmap of per-test ratio relative to the first snapshot."""
-    import pandas as pd
     import plotly.express as px
 
-    raw = [_load_snapshot(p, metric) for p in snapshots]
-    unit = _check_same_unit(raw)
+    df_long, unit = load_long_df(snapshots, metric)
     metric_label = metric if unit == "s" else "peak"
-    loaded = [(label, vals) for label, vals, _ in raw]
-    versions = [label for label, _ in loaded]
-    baseline = loaded[0][1]
-    all_tests = sorted(set().union(*[set(vals) for _, vals in loaded]))
-
-    ratios: dict[str, list[float | None]] = {}
-    absolutes: dict[str, list[float | None]] = {}
-    for test in all_tests:
-        base = baseline.get(test)
-        if not base:
-            continue
-        ratios[test] = []
-        absolutes[test] = []
-        for _, vals in loaded:
-            t = vals.get(test)
-            ratios[test].append(t / base if t else None)
-            absolutes[test].append(t)
-
-    if not ratios:
-        raise ValueError(f"no overlap with baseline snapshot {versions[0]}")
-
-    df = pd.DataFrame(ratios, index=versions).T  # rows = tests, cols = versions
-    abs_df = pd.DataFrame(absolutes, index=versions).T
+    versions = df_long["snapshot"].drop_duplicates().tolist()
+    baseline_label = versions[0]
+
+    # Pivot absolutes (rows=tests, cols=versions), then drop tests with
+    # no baseline reading and divide every column by the baseline column
+    # to get ratios in one shot.
+    abs_df = df_long.pivot(index="test_id", columns="snapshot", values="value").reindex(
+        columns=versions
+    )
+    abs_df = abs_df.dropna(subset=[baseline_label])
+    if abs_df.empty:
+        raise ValueError(f"no overlap with baseline snapshot {baseline_label}")
+    df = abs_df.div(abs_df[baseline_label], axis=0)
+    abs_df.index.name = "test"
+    df.index.name = "test"
 
     fig = px.imshow(
         df,
@@ -525,57 +521,20 @@ def plot_scaling(
     facets: FacetBy | None = None,  # noqa: ARG001  (uniform signature, unused here)
 ) -> tuple[Figure, int]:
     """Log-log time vs N for size-parametrized tests, faceted by phase."""
-    import pandas as pd
     import plotly.express as px
 
-    # Read the raw JSON so we can pull ``params`` per benchmark. ``size``
-    # comes from there as a clean int — any future rename of the test id
-    # format won't silently produce 0 rows. ``model`` still needs the id
-    # regex because spec is stored as an unserializable repr in params.
-    data = json.loads(snapshots[0].read_text())
-
-    # Memory snapshots have a flat {test_id: peak_mib} structure and no
-    # benchmark params — fall back to id-regex extraction for size + model.
-    is_memory = "peak_mib" in data
-    unit = "MiB" if is_memory else "s"
-    metric_label = "peak" if is_memory else metric
-
-    if is_memory:
-        benchmarks_iter = [
-            {"fullname": tid, "stats": {metric: val}, "params": {}}
-            for tid, val in data["peak_mib"].items()
-        ]
-    else:
-        benchmarks_iter = data["benchmarks"]
-
-    rows = []
-    for bm in benchmarks_iter:
-        name = bm["fullname"]
-        t = bm["stats"][metric]
-        params = bm.get("params") or {}
-
-        size = params.get("size")
-        if not isinstance(size, int):
-            # Fall back to the id regex.
-            m = _SIZE_RE.match(name)
-            if not m:
-                continue
-            size = int(m.group(3))
-
-        m = _SIZE_RE.match(name)
-        if not m:
-            continue
-        phase = m.group(1).split("::")[-1]
-        model = m.group(2)
-        rows.append({"phase": phase, "model": model, "n": size, metric: t})
-
-    if not rows:
+    df_long, unit = load_long_df(snapshots[:1], metric)
+    metric_label = metric if unit == "s" else "peak"
+    df = (
+        df_long.dropna(subset=["size"])
+        .rename(columns={"size": "n", "value": metric})
+        .sort_values(["phase", "model", "n"])
+    )
+    if df.empty:
         raise ValueError(
-            "no size-parametrized tests found (expected ``...[<model>-n=<N>]`` "
-            "or a ``params.size`` int)"
+            "no size-parametrized tests found (expected ``...[<model>-n=<N>]`` ids)"
         )
 
-    df = pd.DataFrame(rows).sort_values(["phase", "model", "n"])
     fig = px.line(
         df,
         x="n",

From 2993b953247f70bbbed5d6f524588795ec8cfd82 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 19:16:46 +0200
Subject: [PATCH 43/68] benchmarks: memory sweep + --rounds/--repeats overrides
 + centralized pins

Three pieces, one PR:

- ``memory sweep <versions>...`` mirrors the timing sweep but runs
  ``memray.Tracker`` measurements inside each per-version uv venv.
  Output lands at ``.benchmarks/memory/linopy-<ver>.json`` and ``plot``
  auto-detects the ``peak_mib`` shape.

- Symmetric override flags so cross-version sweeps can use uniform
  measurement counts: ``--rounds N`` on ``sweep`` / ``run`` forces
  ``--benchmark-min-rounds=N --benchmark-max-time=0`` (default is
  pytest-benchmark's per-test auto-tuning), and ``--repeats N`` on
  ``memory sweep`` / ``memory save`` takes min-of-N peak per
  measurement (default 1; memory peaks are mostly deterministic).

- Single source of truth for sweep dep pins. Both sweeps now read the
  ``[benchmarks]`` extra from ``pyproject.toml`` at runtime via
  ``_benchmarks_extra_pins()`` instead of duplicating two short pin
  lists in ``cli.py``. Bumping the extra propagates automatically.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli.py    | 238 +++++++++++++++++++++++++++++++++++++++++--
 benchmarks/memory.py |  65 ++++++++----
 2 files changed, 275 insertions(+), 28 deletions(-)

diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index 547f580d..8505ecfc 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -49,6 +49,22 @@
 
 PhaseName = Literal["build", "matrices", "lp_write", "netcdf", "solver_handoff"]
 
+
+def _benchmarks_extra_pins() -> list[str]:
+    """
+    Return the pins from ``pyproject.toml``'s ``[benchmarks]`` extra.
+
+    Used by both sweeps as the ``--no-use-lock`` fallback so the
+    pin-bump path stays single-source: edit the extra in pyproject and
+    both ``sweep`` and ``memory sweep`` pick up the change.
+    """
+    import tomllib
+
+    pyproject = Path(__file__).resolve().parents[1] / "pyproject.toml"
+    data = tomllib.loads(pyproject.read_text())
+    return list(data["project"]["optional-dependencies"]["benchmarks"])
+
+
 _PHASE_TEST_FILE: dict[PhaseName, str] = {
     "build": "benchmarks/test_build.py",
     "matrices": "benchmarks/test_matrices.py",
@@ -211,6 +227,19 @@ def run(
         Path | None,
         typer.Option("--json", help="Save pytest-benchmark JSON to this path."),
     ] = None,
+    rounds: Annotated[
+        int | None,
+        typer.Option(
+            "--rounds",
+            help=(
+                "Force pytest-benchmark to run exactly N rounds per test "
+                "(passes ``--benchmark-min-rounds=N --benchmark-max-time=0``). "
+                "Default: pytest-benchmark auto-tunes per test (5–40+ rounds "
+                "depending on cost). Use a fixed N for uniform measurement "
+                "across versions in a sweep."
+            ),
+        ),
+    ] = None,
 ) -> None:
     """
     Default timing run. Records timings with pytest-benchmark.
@@ -235,6 +264,8 @@ def run(
     args.append("--benchmark-only")
     if json_out is not None:
         args.extend(["--benchmark-json", str(json_out)])
+    if rounds is not None:
+        args.extend([f"--benchmark-min-rounds={rounds}", "--benchmark-max-time=0"])
 
     k_parts = [p for p in (model, filter_expr) if p]
     if k_parts:
@@ -342,6 +373,17 @@ def sweep(
             help="Install ``benchmarks/requirements.lock`` in each venv.",
         ),
     ] = True,
+    rounds: Annotated[
+        int | None,
+        typer.Option(
+            "--rounds",
+            help=(
+                "Force pytest-benchmark to run exactly N rounds per test in "
+                "every version (uniform measurement across the sweep). "
+                "Default: pytest-benchmark auto-tunes per test."
+            ),
+        ),
+    ] = None,
 ) -> None:
     """
     Run the benchmark suite against several linopy versions.
@@ -429,12 +471,7 @@ def sweep(
             if use_lock:
                 install_args += ["-r", str(lockfile)]
             else:
-                install_args += [
-                    "pytest==9.0.3",
-                    "pytest-benchmark==5.2.3",
-                    "highspy==1.13.1",
-                    "netcdf4==1.7.4",
-                ]
+                install_args += _benchmarks_extra_pins()
             install_args.append(spec)
             r = subprocess.run(install_args, check=False)
             if r.returncode != 0:
@@ -465,6 +502,10 @@ def sweep(
                 pytest_cmd.append("--quick")
             elif long:
                 pytest_cmd.append("--long")
+            if rounds is not None:
+                pytest_cmd.extend(
+                    [f"--benchmark-min-rounds={rounds}", "--benchmark-max-time=0"]
+                )
 
             k_parts = [p for p in (model, filter_expr) if p]
             if k_parts:
@@ -778,6 +819,18 @@ def memory_save_cmd(
             ),
         ),
     ] = None,
+    repeats: Annotated[
+        int,
+        typer.Option(
+            "--repeats",
+            help=(
+                "Re-run each measurement N times and keep the min peak. Default "
+                "1 (single shot). Memory peaks have ~1–3 %% wobble from GC "
+                "timing, lazy-import priming, and netcdf page-cache effects — "
+                "min-of-3 tightens that signal."
+            ),
+        ),
+    ] = 1,
 ) -> None:
     """
     Measure peak memory across the registry × phase grid via ``memray.Tracker``.
@@ -802,7 +855,178 @@ def memory_save_cmd(
                 err=True,
             )
             raise typer.Exit(code=2)
-    memory_save(label, quick=quick, phases=phase)
+    memory_save(label, quick=quick, phases=phase, repeats=repeats)
+
+
+@memory_app.command("sweep")
+def memory_sweep_cmd(
+    versions: Annotated[
+        list[str],
+        typer.Argument(help="linopy versions, e.g. 0.4.0 0.5.0 (or any pip spec)."),
+    ],
+    output_dir: Annotated[
+        Path,
+        typer.Option(
+            "--output-dir",
+            "-o",
+            help="Where to save snapshot JSONs.",
+        ),
+    ] = Path(".benchmarks/memory"),
+    quick: Annotated[
+        bool,
+        typer.Option("--quick", help="Use only the smallest sizes (faster sweep)."),
+    ] = False,
+    phase: Annotated[
+        list[str] | None,
+        typer.Option(
+            "--phase",
+            help=(
+                "Restrict each version's run to these phases. Pass multiple "
+                "``--phase`` to select more than one."
+            ),
+        ),
+    ] = None,
+    repeats: Annotated[
+        int,
+        typer.Option(
+            "--repeats",
+            help="min-of-N peak per measurement (default 1).",
+        ),
+    ] = 1,
+    use_lock: Annotated[
+        bool,
+        typer.Option(
+            "--use-lock/--no-use-lock",
+            help="Install ``benchmarks/requirements.lock`` in each venv.",
+        ),
+    ] = True,
+) -> None:
+    """
+    Sweep peak-memory measurements across several linopy versions.
+
+    Mirrors the timing :func:`sweep` but invokes ``memory save`` inside
+    each per-version uv venv. Each version's snapshot lands at
+    ``<output-dir>/linopy-<version>.json`` and is auto-detected by
+    ``plot`` (the ``peak_mib`` key distinguishes memory from timing).
+
+    Memory peaks are much more deterministic than wall time, so
+    ``--repeats 1`` (default) is usually plenty. Use ``--repeats 3``
+    if you need <5%% regression detection.
+    """
+    from benchmarks.memory import DEFAULT_PHASES
+
+    if phase:
+        unknown = [p for p in phase if p not in DEFAULT_PHASES]
+        if unknown:
+            typer.secho(
+                f"unknown phase(s): {unknown}; valid options: {list(DEFAULT_PHASES)}",
+                fg=typer.colors.RED,
+                err=True,
+            )
+            raise typer.Exit(code=2)
+
+    if shutil.which("uv") is None:
+        typer.secho(
+            "uv not found on PATH — install via https://docs.astral.sh/uv/",
+            fg=typer.colors.RED,
+            err=True,
+        )
+        raise typer.Exit(code=2)
+
+    repo_root = Path.cwd()
+    lockfile = repo_root / "benchmarks" / "requirements.lock"
+    if use_lock and not lockfile.exists():
+        typer.secho(
+            f"--use-lock set but {lockfile} is missing — "
+            "regenerate it via ``uv pip compile`` or pass ``--no-use-lock``.",
+            fg=typer.colors.RED,
+            err=True,
+        )
+        raise typer.Exit(code=2)
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    failed: list[str] = []
+    for version in versions:
+        typer.secho(f"\n=== linopy {version} ===", fg=typer.colors.CYAN, bold=True)
+        with tempfile.TemporaryDirectory(prefix="linopy-mem-") as tmp:
+            venv = Path(tmp) / "venv"
+
+            r = subprocess.run(
+                ["uv", "venv", "--python", sys.executable, str(venv)],
+                check=False,
+            )
+            if r.returncode != 0:
+                typer.secho(
+                    f"venv creation failed: {version}",
+                    fg=typer.colors.RED,
+                    err=True,
+                )
+                failed.append(version)
+                continue
+
+            vpy = _venv_python(venv)
+            spec = _linopy_install_spec(version)
+
+            install_args = ["uv", "pip", "install", "--python", str(vpy)]
+            if use_lock:
+                install_args += ["-r", str(lockfile)]
+            else:
+                install_args += _benchmarks_extra_pins()
+            install_args.append(spec)
+            r = subprocess.run(install_args, check=False)
+            if r.returncode != 0:
+                typer.secho(f"install failed: {version}", fg=typer.colors.RED, err=True)
+                failed.append(version)
+                continue
+
+            env = os.environ.copy()
+            env["PYTHONPATH"] = str(repo_root)
+
+            # ``memory save`` writes to ``.benchmarks/memory/<label>.json``
+            # under its cwd, so the subprocess runs with ``cwd=repo_root``
+            # and the snapshot is moved into ``output_dir`` afterwards.
+            label = f"linopy-{version}"
+            mem_cmd = [
+                str(vpy),
+                "-m",
+                "benchmarks",
+                "memory",
+                "save",
+                label,
+            ]
+            if quick:
+                mem_cmd.append("--quick")
+            for ph in phase or []:
+                mem_cmd.extend(["--phase", ph])
+            if repeats > 1:
+                mem_cmd.extend(["--repeats", str(repeats)])
+
+            typer.secho(f"$ {' '.join(mem_cmd)}", fg=typer.colors.BRIGHT_BLACK)
+            subprocess.run(mem_cmd, env=env, cwd=str(repo_root), check=False)
+
+            # memory.save writes to .benchmarks/memory/<label>.json relative
+            # to its cwd. Move it under output_dir if the user asked for a
+            # custom location.
+            default_path = repo_root / ".benchmarks" / "memory" / f"{label}.json"
+            target = output_dir / f"{label}.json"
+            if default_path.exists() and default_path.resolve() != target.resolve():
+                target.parent.mkdir(parents=True, exist_ok=True)
+                default_path.replace(target)
+
+            if target.exists():
+                typer.secho(f"saved {target}", fg=typer.colors.GREEN)
+            else:
+                typer.secho(
+                    f"no snapshot produced for {version}",
+                    fg=typer.colors.RED,
+                    err=True,
+                )
+                failed.append(version)
+
+    if failed:
+        typer.secho(f"\nFailed versions: {failed}", fg=typer.colors.RED, err=True)
+        raise typer.Exit(code=1)
 
 
 @memory_app.command("compare")
diff --git a/benchmarks/memory.py b/benchmarks/memory.py
index 03cc4da0..f2ad3329 100644
--- a/benchmarks/memory.py
+++ b/benchmarks/memory.py
@@ -72,27 +72,40 @@ def _phase_tag(phase: str) -> str:
     }[phase]
 
 
-def _measure_peak(action: Callable[[], object]) -> float:
-    """Run ``action()`` under ``memray.Tracker`` and return peak MiB."""
+def _measure_peak(action: Callable[[], object], repeats: int = 1) -> float:
+    """
+    Run ``action()`` under ``memray.Tracker`` and return peak MiB.
+
+    With ``repeats > 1`` the action runs that many times in fresh
+    trackers and the *minimum* peak is returned — peak memory is
+    noisier than naive expectations (GC timing, lazy-import priming,
+    file-system page cache for netcdf) so the min-of-N is the cleanest
+    estimate of "the floor this code can hit".
+    """
     import memray
 
-    fd, tmp = tempfile.mkstemp(suffix=".bin")
-    Path(tmp).unlink()  # memray needs to create the file itself
-    # Close the fd; the path is what matters.
-    try:
-        from os import close as _close
+    peaks: list[float] = []
+    for _ in range(max(1, repeats)):
+        fd, tmp = tempfile.mkstemp(suffix=".bin")
+        Path(tmp).unlink()  # memray needs to create the file itself
+        # Close the fd; the path is what matters.
+        try:
+            from os import close as _close
 
-        _close(fd)
-    except OSError:
-        pass
+            _close(fd)
+        except OSError:
+            pass
 
-    try:
-        with memray.Tracker(tmp):
-            action()
-        peak_bytes = memray.FileReader(tmp).metadata.peak_memory
-        return round(peak_bytes / (1024**2), 3)
-    finally:
-        Path(tmp).unlink(missing_ok=True)
+        try:
+            with memray.Tracker(tmp):
+                action()
+            peak_bytes = memray.FileReader(tmp).metadata.peak_memory
+            peaks.append(round(peak_bytes / (1024**2), 3))
+        finally:
+            Path(tmp).unlink(missing_ok=True)
+        gc.collect()
+
+    return min(peaks)
 
 
 def _measurements(
@@ -191,12 +204,13 @@ def handoff() -> None:
         raise ValueError(f"unknown phase: {phase!r}")
 
 
-def run_phase(phase: str, quick: bool = False) -> dict[str, float]:
+def run_phase(phase: str, quick: bool = False, repeats: int = 1) -> dict[str, float]:
     """
     Measure peak memory for every applicable ``(spec, size)`` under one phase.
 
     Returns a ``{test_id: peak_mib}`` mapping. Invoked once per phase as a
-    subprocess by :func:`save` for isolation.
+    subprocess by :func:`save` for isolation. ``repeats`` is forwarded to
+    :func:`_measure_peak` so callers can dial up signal-to-noise.
     """
     from benchmarks import REGISTRY
 
@@ -220,7 +234,7 @@ def run_phase(phase: str, quick: bool = False) -> dict[str, float]:
                 try:
                     for test_id, action in _measurements(phase, spec, size):
                         try:
-                            results[test_id] = _measure_peak(action)
+                            results[test_id] = _measure_peak(action, repeats=repeats)
                             print(
                                 f"  {test_id} → {results[test_id]:.1f} MiB",
                                 file=sys.stderr,
@@ -245,6 +259,7 @@ def save(
     label: str,
     quick: bool = False,
     phases: list[str] | None = None,
+    repeats: int = 1,
 ) -> Path:
     """
     Run one subprocess per phase and merge the results into ``<label>.json``.
@@ -276,6 +291,8 @@ def save(
         ]
         if quick:
             cmd.append("--quick")
+        if repeats > 1:
+            cmd.extend(["--repeats", str(repeats)])
         try:
             result = subprocess.run(cmd, check=False, capture_output=True, text=True)
             if result.stderr:
@@ -346,6 +363,12 @@ def compare(label_a: str, label_b: str) -> None:
     parser.add_argument("cmd", choices=["_worker"])
     parser.add_argument("phase")
     parser.add_argument("--quick", action="store_true")
+    parser.add_argument(
+        "--repeats",
+        type=int,
+        default=1,
+        help="Run each measurement N times and keep the min peak (default 1).",
+    )
     parser.add_argument(
         "--out",
         required=True,
@@ -353,5 +376,5 @@ def compare(label_a: str, label_b: str) -> None:
     )
     args = parser.parse_args()
     if args.cmd == "_worker":
-        out = run_phase(args.phase, quick=args.quick)
+        out = run_phase(args.phase, quick=args.quick, repeats=args.repeats)
         Path(args.out).write_text(json.dumps(out))

From ac1df5325043ab6d1ca63341015915839957fe7b Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 19:57:18 +0200
Subject: [PATCH 44/68] benchmarks: CodSpeed CI + Dependabot perf attribution
 loop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Per-PR cachegrind-based regression detection plus a feedback loop that
attributes upstream dep perf changes to specific Dependabot bumps:

- New ``codspeed`` job in ``benchmark-smoke.yml``: runs ``pytest
  --quick --codspeed`` on every PR via the CodSpeedHQ action. Needs
  a ``CODSPEED_TOKEN`` repo secret to post comments; without it the
  job fails gracefully.

- ``--quick`` now skips PyPSA end-to-end via a collection hook in
  ``conftest.py``. The PyPSA network is ~30s native; cachegrind would
  make it minutes, and the signal CodSpeed is meant to catch lives in
  the micro paths.

- Pin tiering in ``[benchmarks]``: perf-relevant deps (``numpy``,
  ``scipy``, ``xarray``, ``pandas``, ``polars``, ``dask``, ``highspy``,
  ``netcdf4``) get individual ``==`` pins so each Dependabot bump
  produces one attributed CodSpeed delta. Tooling deps (``pytest`` &
  plugins, ``nbconvert``, ``typer``, ``plotly``) are also pinned but
  grouped in ``dependabot.yml`` so they batch into a single PR.

- Loose ``[project.dependencies]`` stays untouched — downstream linopy
  consumers keep their existing resolve.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/dependabot.yml                | 27 +++++++++++++++++++++
 .github/workflows/benchmark-smoke.yml | 34 +++++++++++++++++++++++++++
 benchmarks/conftest.py                | 16 +++++++++++++
 pyproject.toml                        | 19 +++++++++++++--
 4 files changed, 94 insertions(+), 2 deletions(-)

diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index f8f779b5..e7750e98 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -12,3 +12,30 @@ updates:
     github-actions:
       patterns:
       - '*'
+
+# Pinned ``[benchmarks]`` extra in pyproject.toml. One PR per dep bump
+# → CodSpeed CI runs and attributes any perf delta to that specific
+# bump. Keeps the cross-version ``sweep`` baseline (lockfile-pinned)
+# stable while still surfacing upstream perf changes per-PR with
+# eyes-open review. Loose ``[project.dependencies]`` (numpy, scipy, ...)
+# have no version specifier so Dependabot leaves them alone — only the
+# ``==`` pins in ``[benchmarks]`` produce PRs.
+- package-ecosystem: pip
+  directory: /
+  schedule:
+    interval: monthly
+  open-pull-requests-limit: 5
+  groups:
+    # Measurement scaffolding + CLI/notebook tooling. Perf-irrelevant —
+    # they don't move CodSpeed signal, so batching into one PR cuts
+    # review noise. Perf-relevant deps (numpy, xarray, highspy, …) stay
+    # un-grouped so each gets its own attributed CodSpeed delta.
+    benchmark-tooling:
+      patterns:
+        - pytest
+        - pytest-benchmark
+        - pytest-memray
+        - pytest-codspeed
+        - nbconvert
+        - typer
+        - plotly
diff --git a/.github/workflows/benchmark-smoke.yml b/.github/workflows/benchmark-smoke.yml
index 25fee396..e6ea6c65 100644
--- a/.github/workflows/benchmark-smoke.yml
+++ b/.github/workflows/benchmark-smoke.yml
@@ -45,3 +45,37 @@ jobs:
       # registry" walkthrough and must stay runnable end-to-end.
       run: |
         python -m benchmarks notebook
+
+  codspeed:
+    name: CodSpeed (micro regression detection)
+    runs-on: ubuntu-latest
+    # Cachegrind is ~10–20× slower than native, so we restrict to ``--quick``
+    # (smallest size per spec) and skip PyPSA end-to-end. The signal we want
+    # here is "did this PR change the instruction count of the hot paths?";
+    # full wall-clock cross-version comparison stays in ``sweep``.
+    steps:
+    - uses: actions/checkout@v6
+      with:
+        fetch-depth: 0  # setuptools_scm
+
+    - name: Set up Python 3.12
+      uses: actions/setup-python@v6
+      with:
+        python-version: "3.12"
+
+    - name: Install pinned benchmark environment
+      # Install from the pinned ``[benchmarks]`` extra (not the lockfile)
+      # so Dependabot can auto-detect pyproject.toml and propose bumps
+      # to top-level deps. Each bump → one attributed CodSpeed delta.
+      # The full transitive lockfile is reserved for cross-version
+      # ``sweep`` reproducibility, where machine variance matters more.
+      run: |
+        python -m pip install uv
+        uv pip install --system -e ".[dev,benchmarks]"
+
+    - name: Run benchmarks under CodSpeed
+      uses: CodSpeedHQ/action@v3
+      with:
+        token: ${{ secrets.CODSPEED_TOKEN }}
+        run: |
+          pytest benchmarks/ --quick --codspeed
diff --git a/benchmarks/conftest.py b/benchmarks/conftest.py
index de416167..fea6ba6b 100644
--- a/benchmarks/conftest.py
+++ b/benchmarks/conftest.py
@@ -25,6 +25,22 @@ def pytest_addoption(parser):
     )
 
 
+def pytest_collection_modifyitems(config, items):
+    """
+    Drop PyPSA end-to-end tests under ``--quick``.
+
+    The PyPSA carbon-management network is ~30s by itself; CodSpeed under
+    cachegrind would make it minutes. ``--quick`` is for sub-30s sweeps,
+    so the end-to-end module doesn't belong there.
+    """
+    if not config.getoption("--quick"):
+        return
+    skip = pytest.mark.skip(reason="--quick: pypsa end-to-end skipped")
+    for item in items:
+        if "test_pypsa_carbon_management" in item.nodeid:
+            item.add_marker(skip)
+
+
 def maybe_skip(request: pytest.FixtureRequest, spec: ModelSpec, size: int) -> None:
     """
     Apply size-tier skips and ``spec.requires`` importorskips.
diff --git a/pyproject.toml b/pyproject.toml
index ff114e10..38d39b8a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -96,11 +96,26 @@ dev = [
 # ``highspy`` follows the project-wide ``!=1.14.0`` exclusion (see the
 # ``solvers`` extra).
 benchmarks = [
+    # Perf-relevant deps measured directly by the suite. Individual
+    # Dependabot PRs → CodSpeed attributes deltas to specific bumps.
+    "highspy==1.13.1",
+    "netcdf4==1.7.4",
+    # Perf-sensitive runtime deps. Pinned here (not in
+    # ``[project.dependencies]``) so downstream linopy consumers keep their
+    # loose resolve while the benchmark environment is fixed.
+    "numpy==1.26.4",
+    "scipy==1.16.3",
+    "xarray==2025.9.0",
+    "pandas==2.3.3",
+    "polars==1.35.2",
+    "dask==2025.11.0",
+    # Measurement scaffolding + CLI/notebook tooling. Pinned for
+    # reproducibility but grouped in dependabot.yml so bumps batch into
+    # one PR instead of cluttering review.
     "pytest==9.0.3",
     "pytest-benchmark==5.2.3",
     "pytest-memray==1.8.0",
-    "highspy==1.13.1",
-    "netcdf4==1.7.4",
+    "pytest-codspeed==5.0.3",
     "nbconvert==7.17.1",
     "typer==0.26.2",
     "plotly==6.7.0",

From 0e6ec41e9c232dd47c18b64eaa336e6351aea611 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 22:27:49 +0200
Subject: [PATCH 45/68] benchmarks: drop lockfile, relocate walkthrough,
 Jupytext --build flow
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Drop ``benchmarks/requirements.lock`` and the ``--use-lock/--no-use-lock``
  toggle on ``sweep`` and ``memory sweep``. The ``[benchmarks]`` extra in
  pyproject already pins every measurement-relevant direct dep
  (numpy/scipy/xarray/pandas/polars/dask/highspy/…) — uv resolves the same
  pyproject input deterministically into each per-version venv, so "same
  deps, only linopy varies" comes for free without maintaining a separate
  lockfile.
- Relocate the walkthrough out of ``benchmarks/notebooks/`` to
  ``benchmarks/walkthrough.md`` (next to README), and delete the now-empty
  notebooks/ directory along with the obsolete ``registry_usage.ipynb``.
- Replace ``jupyter nbconvert`` with Jupytext for the walkthrough. The
  ``notebook`` subcommand now executes the ``.md`` directly; ``--build``
  regenerates a gitignored sibling ``walkthrough.ipynb`` for editor-agnostic
  viewing (one-way conversion — no bidirectional pairing, so PyCharm/VSCode
  work the same as JupyterLab). Add ``jupytext==1.17.4`` to the
  ``[benchmarks]`` extra.
- Condense ``benchmarks/README.md`` from 128 → 45 lines: scope, install,
  pinning (no-lockfile) rationale, and walkthrough launch only. Phase
  coverage, CLI surfaces, metric rationale, memory commands, and the
  extending guide now live in the walkthrough.
- ``list --details`` and ``show`` now use ``typer.secho`` for dim headers /
  cyan spec names / dim attribute labels (auto-strips when piped — ``list |
  grep`` stays clean).
- Drop the "microbenchmark" framing where it overreached: the suite has
  millisecond-to-second-scale tests, not microbenchmarks. Rephrase the
  scatter-quadrant prose ("cheap tests with big ratio swings — noise, not
  real change") in plotting.py, cli.py, and the README metric rationale.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .gitignore                                |   6 +
 benchmarks/README.md                      | 117 +---
 benchmarks/cli.py                         | 188 +++---
 benchmarks/notebooks/registry_usage.ipynb | 729 ----------------------
 benchmarks/plotting.py                    |  15 +-
 benchmarks/requirements.lock              | 513 ---------------
 pyproject.toml                            |   1 +
 7 files changed, 150 insertions(+), 1419 deletions(-)
 delete mode 100644 benchmarks/notebooks/registry_usage.ipynb
 delete mode 100644 benchmarks/requirements.lock

diff --git a/.gitignore b/.gitignore
index 654b686d..45c1fb7b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -45,6 +45,12 @@ benchmark/scripts/__pycache__
 benchmark/scripts/benchmarks-pypsa-eur/__pycache__
 benchmark/scripts/leftovers/
 
+# Benchmarks (internal suite in benchmarks/) — the .md walkthrough is
+# canonical; ``python -m benchmarks notebook --build`` regenerates the
+# .ipynb sibling as a throwaway viewing/running artifact.
+benchmarks/walkthrough.ipynb
+benchmarks/.ipynb_checkpoints/
+
 # IDE
 .idea/
 
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 721b1321..320bbcb3 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -1,111 +1,44 @@
 # Internal Performance Benchmarks
 
-This suite benchmarks the **linopy part end-to-end** across four phases:
+End-to-end performance tracking for `linopy` — build → solver handoff
+→ netCDF (de)serialization → fixed PyPSA model. Solver algorithm
+runtime is out of scope.
 
-1. **Build** — construct the linopy model.
-2. **Solver handoff** — convert a built model into solver-consumable form
-   (in-memory matrices, LP file, native solver instance).
-3. **netCDF serialization / deserialization** — `to_netcdf` / `read_netcdf`.
-4. **End-to-end** — a fixed real-world PyPSA model all the way to a solver
-   instance.
+**The walkthrough is load-bearing.** Phase coverage, CLI introspection,
+the two-snapshot regression workflow with inline Plotly views, and
+how to extend the suite live in [`walkthrough.md`](walkthrough.md).
+This README only covers install and how to open the walkthrough.
 
-Solver algorithm runtime is intentionally out of scope.
+> `benchmark/` (singular) is the legacy external-framework suite.
+> `benchmarks/` (plural) is this internal suite.
 
-| Phase                  | Test file                           | Measures                                                                  |
-| ---------------------- | ----------------------------------- | ------------------------------------------------------------------------- |
-| Build                  | `test_build.py`                     | constructing variables / expressions / constraints / objective            |
-| Solver handoff         | `test_matrices.py`                  | `A`, `b`, `c`, bounds, labels, `Q` for QP                                 |
-| Solver handoff         | `test_lp_write.py`                  | `model.to_file(...)` — LP / MPS serialization                             |
-| Solver handoff         | `test_solver_handoff.py`            | `lp.io.to_highspy` / `to_gurobipy` / `to_mosek` / `to_xpress`             |
-| netCDF (de)serialization | `test_netcdf.py`                  | `to_netcdf` and `read_netcdf` round-trip                                  |
-| End-to-end (PyPSA)     | `test_pypsa_carbon_management.py`   | Fixed real-world pypsa network through `network.optimize.create_model` and on to highspy; sweeps `freeze_constraints` and `set_names`. |
-
-The netCDF benchmarks reuse the same file path across pytest-benchmark
-iterations, so reads run hot-cache by design — what we want to track is
-the (de)serialization code in `linopy` / `xarray`, not disk hardware.
-
-> **Note:** `benchmark/` (singular) is for external framework comparisons.
-> `benchmarks/` (plural) is only for internal linopy performance tracking.
-
-## Setup
-
-Two install paths:
+## Install
 
 ```bash
-# Development / casual benchmark runs — loose constraints from pyproject
 uv sync --extra dev --extra benchmarks
 source .venv/bin/activate
-
-# Stable measurement environment — fully resolved lockfile
-uv pip install -r benchmarks/requirements.lock
-uv pip install -e .            # current linopy
-# — or —
-uv pip install linopy==0.5.0   # cross-version sweep target
 ```
 
-`pypsa` is an **optional** benchmark dep — the `pypsa_scigrid` registry
-spec and `test_pypsa_carbon_management.py` skip gracefully without it.
-Install separately when you want them:
-
-```bash
-uv pip install pypsa
-```
-
-The lockfile excludes linopy itself so the same lockfile works for both
-current-tip regression runs and `sweep` against older releases. Absolute
-benchmark numbers are still machine-dependent (CPU, cache, memory
-bandwidth) — what the lockfile gives you is consistency over time on the
-same machine, so deltas reflect linopy changes, not a numpy upgrade.
-
-Regenerate after bumping the `[benchmarks]` pins in `pyproject.toml`:
-
-```bash
-uv pip compile pyproject.toml --extra benchmarks --extra dev --extra solvers \
-  --no-emit-package linopy \
-  -o benchmarks/requirements.lock
-```
+`pypsa` is optional — `pypsa_scigrid` and
+`test_pypsa_carbon_management.py` skip gracefully without it. Install
+when you need them: `uv pip install pypsa`.
 
-## Run
+The `[benchmarks]` extra in `pyproject.toml` pins every direct dep that
+affects measurement (`numpy`, `scipy`, `xarray`, `pandas`, `polars`,
+`dask`, etc.). `sweep` installs these into each per-version venv, so
+"same deps, only linopy varies" comes for free without a separate
+lockfile — bump the pins in pyproject and the next sweep picks them up.
 
-Everything is exposed through one typer CLI. Its `--help` is the source of
-truth — no command menu duplicated here:
+## Open the walkthrough
 
 ```bash
-python -m benchmarks --help
-python -m benchmarks <command> --help
+python -m benchmarks notebook --build       # (re)generate walkthrough.ipynb
+jupyter lab benchmarks/walkthrough.ipynb    # ...or PyCharm / VSCode
 ```
 
-Three size tiers, configured per spec via `quick_threshold` / `long_threshold`:
-
-| Mode         | Sizes included            | Typical use                              |
-| ------------ | ------------------------- | ---------------------------------------- |
-| `smoke`      | `size <= quick_threshold` | CI smoke (~18 s), fast local sanity      |
-| `run`        | `size <= long_threshold`  | Default regression timing (~45 s)        |
-| `run --long` | all sizes                 | Full sweep — the slow stuff (~2 min)     |
+The `.md` is the source of truth; the `.ipynb` is a disposable,
+gitignored build artifact. Edit the `.md`, re-run `--build`, re-open.
+Same workflow in any editor.
 
-Pytest still works directly for power users (`pytest benchmarks/ ...`).
-
-## Walkthrough
-
-[`notebooks/registry_usage.ipynb`](notebooks/registry_usage.ipynb) is the
-canonical walkthrough: import the registry, look up / iterate / filter
-specs, build a model, parametrize your own pytest test off the registry,
-spot-profile memory. GitHub renders it inline; CI executes it on every PR
+CI executes the walkthrough end-to-end on every PR
 (`python -m benchmarks notebook`) so the examples can't silently rot.
-
-Open it locally with JupyterLab launched from the repo root:
-
-```bash
-jupyter lab benchmarks/notebooks/registry_usage.ipynb
-```
-
-## Metrics
-
-- **Time** — pytest-benchmark median runtime (IQR for stability). Snapshots
-  are JSON; pass `--json <path>` to `run` to save one, then diff against a
-  baseline.
-- **Memory** — peak allocations (MiB) via `memray.Tracker`, measured per
-  `(phase, spec, size)` across all phases. The model is built *outside* the
-  tracked region so the peak reflects only the phase work, not model
-  construction. Use `memory save` (optionally `--phase` to scope) and
-  `memory compare`.
diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index 8505ecfc..bed4ee14 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -54,9 +54,11 @@ def _benchmarks_extra_pins() -> list[str]:
     """
     Return the pins from ``pyproject.toml``'s ``[benchmarks]`` extra.
 
-    Used by both sweeps as the ``--no-use-lock`` fallback so the
-    pin-bump path stays single-source: edit the extra in pyproject and
-    both ``sweep`` and ``memory sweep`` pick up the change.
+    Both ``sweep`` and ``memory sweep`` install these into each
+    per-version venv. Direct pins are kept in pyproject as the single
+    source of truth — bump them there and both sweeps pick up the
+    change. Transitive deps resolve fresh per venv; uv's deterministic
+    resolution gives identical results across versions within one sweep.
     """
     import tomllib
 
@@ -106,10 +108,16 @@ def list_(
     ]
     name_w = max(len(r[0]) for r in rows)
     feat_w = max(len(r[1]) for r in rows)
-    typer.echo(f"{'name':<{name_w}}  {'features':<{feat_w}}  sizes")
-    typer.echo("-" * (name_w + feat_w + 20))
+    # ``secho`` strips colour automatically when stdout isn't a TTY, so
+    # piping ``list --details | grep`` still gets plain text.
+    typer.secho(
+        f"{'name':<{name_w}}  {'features':<{feat_w}}  sizes",
+        dim=True,
+    )
+    typer.secho("-" * (name_w + feat_w + 20), dim=True)
     for name, feats, sizes in rows:
-        typer.echo(f"{name:<{name_w}}  {feats:<{feat_w}}  {sizes}")
+        typer.secho(f"{name:<{name_w}}", fg=typer.colors.CYAN, nl=False)
+        typer.echo(f"  {feats:<{feat_w}}  {sizes}")
 
 
 @app.command()
@@ -130,13 +138,20 @@ def show(
         typer.echo(f"available: {', '.join(sorted(REGISTRY))}", err=True)
         raise typer.Exit(code=2) from exc
     typer.echo(repr(spec))
-    typer.echo(f"  sizes:           {spec.sizes}")
-    typer.echo(f"  features:        {sorted(spec.features)}")
-    typer.echo(f"  phases:          {sorted(spec.phases)}")
-    typer.echo(f"  quick_threshold: {spec.quick_threshold}")
-    typer.echo(f"  long_threshold:  {spec.long_threshold}")
+
+    def _row(label: str, value: object) -> None:
+        # Dim the label so the eye lands on the value first; ``secho``
+        # auto-strips colour when stdout isn't a TTY.
+        typer.secho(f"  {label:<17}", dim=True, nl=False)
+        typer.echo(value)
+
+    _row("sizes:", spec.sizes)
+    _row("features:", sorted(spec.features))
+    _row("phases:", sorted(spec.phases))
+    _row("quick_threshold:", spec.quick_threshold)
+    _row("long_threshold:", spec.long_threshold)
     if spec.requires:
-        typer.echo(f"  requires:        {list(spec.requires)}")
+        _row("requires:", list(spec.requires))
 
 
 @app.command("filter")
@@ -276,37 +291,79 @@ def run(
 
 
 @app.command()
-def notebook() -> None:
+def notebook(
+    build: Annotated[
+        bool,
+        typer.Option(
+            "--build",
+            help=(
+                "Regenerate ``walkthrough.ipynb`` from the ``.md`` source. "
+                "One-way build — the ``.ipynb`` is a throwaway artifact for "
+                "opening in any editor (JupyterLab, PyCharm, VSCode), the "
+                "``.md`` stays canonical. Re-run after editing the ``.md``. "
+                "The ``.ipynb`` is gitignored."
+            ),
+        ),
+    ] = False,
+) -> None:
     """
-    Execute the registry-usage notebook end-to-end.
+    Execute the walkthrough notebook end-to-end (default) or rebuild the
+    ``.ipynb`` artifact for interactive viewing (``--build``).
+
+    The walkthrough is a Jupytext MyST markdown file
+    (``benchmarks/walkthrough.md``) — diffs cleanly in git, runs as a
+    notebook in Jupyter. The ``.md`` is the source of truth; the sibling
+    ``.ipynb`` is generated output. Edit the ``.md``, re-run ``--build``,
+    open the ``.ipynb`` in your editor of choice.
 
-    Used by CI to catch doc rot — if any cell raises, the workflow fails.
-    The executed copy is written to a tempdir and discarded, so the
-    in-tree notebook stays output-free (nbstripout doesn't have to chase
-    a populated file).
+    CI calls this with no flags to catch doc rot; the executed copy goes
+    to a tempdir and is discarded so the source file stays output-free.
     """
-    nb = Path("benchmarks/notebooks/registry_usage.ipynb")
+    nb = Path("benchmarks/walkthrough.md")
     if not nb.exists():
-        typer.secho(f"notebook not found: {nb}", fg=typer.colors.RED, err=True)
+        typer.secho(f"walkthrough not found: {nb}", fg=typer.colors.RED, err=True)
         raise typer.Exit(code=1)
+
+    if build:
+        # ``--to ipynb`` is a one-way conversion (no ``formats`` metadata
+        # written into the .md). The generated .ipynb is editor-agnostic;
+        # contributors regenerate it after editing the .md.
+        cmd = [
+            sys.executable,
+            "-m",
+            "jupytext",
+            "--to",
+            "ipynb",
+            str(nb),
+        ]
+        typer.secho(f"$ {' '.join(cmd)}", fg=typer.colors.BRIGHT_BLACK)
+        result = subprocess.run(cmd, check=False)
+        if result.returncode != 0:
+            raise typer.Exit(code=result.returncode)
+        ipynb = nb.with_suffix(".ipynb")
+        typer.secho(f"built: {ipynb}  (regenerable from {nb})", fg=typer.colors.GREEN)
+        typer.echo(f"Open it:  jupyter lab {ipynb}    # or PyCharm / VSCode / …")
+        return
+
     with tempfile.TemporaryDirectory() as tmp:
+        # Jupytext sets the kernel cwd to the output directory (the
+        # tempdir here), so forward the repo root via
+        # ``LINOPY_REPO_ROOT`` for the walkthrough's first cell to find
+        # ``benchmarks/``.
+        env = {**os.environ, "LINOPY_REPO_ROOT": str(Path.cwd().resolve())}
         cmd = [
             sys.executable,
             "-m",
-            "jupyter",
-            "nbconvert",
+            "jupytext",
             "--to",
             "notebook",
             "--execute",
-            "--ExecutePreprocessor.timeout=300",
-            "--output-dir",
-            tmp,
             "--output",
-            "executed.ipynb",
+            str(Path(tmp) / "executed.ipynb"),
             str(nb),
         ]
         typer.secho(f"$ {' '.join(cmd)}", fg=typer.colors.BRIGHT_BLACK)
-        result = subprocess.run(cmd, check=False)
+        result = subprocess.run(cmd, env=env, check=False)
     if result.returncode != 0:
         raise typer.Exit(code=result.returncode)
 
@@ -366,13 +423,6 @@ def sweep(
             help="Arbitrary pytest ``-k`` expression (AND-ed with ``--model``).",
         ),
     ] = None,
-    use_lock: Annotated[
-        bool,
-        typer.Option(
-            "--use-lock/--no-use-lock",
-            help="Install ``benchmarks/requirements.lock`` in each venv.",
-        ),
-    ] = True,
     rounds: Annotated[
         int | None,
         typer.Option(
@@ -431,16 +481,6 @@ def sweep(
         raise typer.Exit(code=2)
 
     repo_root = Path.cwd()
-    lockfile = repo_root / "benchmarks" / "requirements.lock"
-    if use_lock and not lockfile.exists():
-        typer.secho(
-            f"--use-lock set but {lockfile} is missing — "
-            "regenerate it via ``uv pip compile`` or pass ``--no-use-lock``.",
-            fg=typer.colors.RED,
-            err=True,
-        )
-        raise typer.Exit(code=2)
-
     output_dir.mkdir(parents=True, exist_ok=True)
 
     failed: list[str] = []
@@ -466,13 +506,19 @@ def sweep(
             vpy = _venv_python(venv)
             spec = _linopy_install_spec(version)
 
-            # 2. Single install pass: infra (lockfile or pinned subset) + linopy.
-            install_args = ["uv", "pip", "install", "--python", str(vpy)]
-            if use_lock:
-                install_args += ["-r", str(lockfile)]
-            else:
-                install_args += _benchmarks_extra_pins()
-            install_args.append(spec)
+            # 2. Single install pass: pinned infra (from pyproject) + linopy.
+            #    Direct pins in [benchmarks] are sufficient for sweep
+            #    reproducibility — uv resolves the same input
+            #    deterministically into each per-version venv.
+            install_args = [
+                "uv",
+                "pip",
+                "install",
+                "--python",
+                str(vpy),
+                *_benchmarks_extra_pins(),
+                spec,
+            ]
             r = subprocess.run(install_args, check=False)
             if r.returncode != 0:
                 typer.secho(f"install failed: {version}", fg=typer.colors.RED, err=True)
@@ -618,7 +664,7 @@ def compare(ctx: typer.Context) -> None:
     # grouped by parametrize group, which is unreadable for two-snapshot diffs.
     # ``--group-by=fullname`` puts each test's (baseline, candidate) rows in
     # their own mini-table; ``--columns=min,iqr`` shows the lowest observed
-    # time (closest to "true" cost for microbenchmarks) plus the spread.
+    # time (approximates the no-noise floor) plus the spread.
     # Each default is only applied if the user didn't override it.
     if not any(a.startswith("--columns") for a in extra):
         extra.insert(0, "--columns=min,iqr")
@@ -721,9 +767,9 @@ def plot(
     - **scatter** (2 snapshots) — exploratory two-axis plot: baseline
       cost on log-x, ratio on y, absolute Δ encoded in colour. Tests
       in the top-right are the real regressions (slow tests that got
-      slower); top-left = noisy microbenchmarks; bottom-right =
-      already-slow-but-unchanged. Resolves the absolute-vs-relative
-      tension visually.
+      slower); top-left = cheap tests with big ratio swings (noise,
+      not real change); bottom-right = already-slow-but-unchanged.
+      Resolves the absolute-vs-relative tension visually.
     - **sweep** (3+ snapshots) — heatmap of ratio relative to the first
       snapshot, rows = tests, columns = snapshot labels.
     - **scaling** (1 snapshot) — log-log time vs ``n`` for
@@ -893,13 +939,6 @@ def memory_sweep_cmd(
             help="min-of-N peak per measurement (default 1).",
         ),
     ] = 1,
-    use_lock: Annotated[
-        bool,
-        typer.Option(
-            "--use-lock/--no-use-lock",
-            help="Install ``benchmarks/requirements.lock`` in each venv.",
-        ),
-    ] = True,
 ) -> None:
     """
     Sweep peak-memory measurements across several linopy versions.
@@ -934,16 +973,6 @@ def memory_sweep_cmd(
         raise typer.Exit(code=2)
 
     repo_root = Path.cwd()
-    lockfile = repo_root / "benchmarks" / "requirements.lock"
-    if use_lock and not lockfile.exists():
-        typer.secho(
-            f"--use-lock set but {lockfile} is missing — "
-            "regenerate it via ``uv pip compile`` or pass ``--no-use-lock``.",
-            fg=typer.colors.RED,
-            err=True,
-        )
-        raise typer.Exit(code=2)
-
     output_dir.mkdir(parents=True, exist_ok=True)
 
     failed: list[str] = []
@@ -968,12 +997,15 @@ def memory_sweep_cmd(
             vpy = _venv_python(venv)
             spec = _linopy_install_spec(version)
 
-            install_args = ["uv", "pip", "install", "--python", str(vpy)]
-            if use_lock:
-                install_args += ["-r", str(lockfile)]
-            else:
-                install_args += _benchmarks_extra_pins()
-            install_args.append(spec)
+            install_args = [
+                "uv",
+                "pip",
+                "install",
+                "--python",
+                str(vpy),
+                *_benchmarks_extra_pins(),
+                spec,
+            ]
             r = subprocess.run(install_args, check=False)
             if r.returncode != 0:
                 typer.secho(f"install failed: {version}", fg=typer.colors.RED, err=True)
diff --git a/benchmarks/notebooks/registry_usage.ipynb b/benchmarks/notebooks/registry_usage.ipynb
deleted file mode 100644
index f68f35f4..00000000
--- a/benchmarks/notebooks/registry_usage.ipynb
+++ /dev/null
@@ -1,729 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "0",
-   "metadata": {},
-   "source": [
-    "# Linopy benchmark suite — guide\n",
-    "\n",
-    "This notebook is the canonical documentation for the internal benchmark\n",
-    "suite under `benchmarks/`. CI executes it end-to-end on every PR via\n",
-    "`python -m benchmarks notebook`, so anything written here stays runnable.\n",
-    "\n",
-    "Two complementary surfaces:\n",
-    "\n",
-    "- **README** (`benchmarks/README.md`) — install / size-tier reference / metrics.\n",
-    "- **CLI help** — `python -m benchmarks --help` is the source of truth for\n",
-    "  command flags. This notebook embeds that output live (see the *Running*\n",
-    "  section) so it stays in sync with the actual implementation.\n",
-    "\n",
-    "What this notebook covers: the suite's architecture (registry × phases),\n",
-    "how to use the registry from your own code, how to run / interpret\n",
-    "benchmarks, and how to extend the suite."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "1",
-   "metadata": {},
-   "source": [
-    "## Architecture\n",
-    "\n",
-    "Two halves:\n",
-    "\n",
-    "1. **The model registry** (`benchmarks/registry.py`) — every benchmark\n",
-    "   model is a `ModelSpec` declaring how to build it, its sizes, the\n",
-    "   features it exercises, the phases it can drive, and the size tiers.\n",
-    "   Models self-register at import.\n",
-    "\n",
-    "2. **The phase tests** (`test_build.py`, `test_matrices.py`, …) — one\n",
-    "   pytest file per phase. Each iterates the registry via\n",
-    "   `iter_params(phase)` so adding a model to the registry automatically\n",
-    "   extends every applicable phase test.\n",
-    "\n",
-    "A typer CLI (`benchmarks/cli.py`) wraps pytest plus introspection and\n",
-    "memory snapshots."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "2",
-   "metadata": {},
-   "source": [
-    "## What each phase measures\n",
-    "\n",
-    "| Phase             | Test file                         | Measures                                                     |\n",
-    "| ----------------- | --------------------------------- | ------------------------------------------------------------ |\n",
-    "| `build`           | `test_build.py`                   | constructing variables / expressions / constraints / objective |\n",
-    "| `matrices`        | `test_matrices.py`                | `A`, `b`, `c`, bounds, labels, `Q` for QP                    |\n",
-    "| `lp_write`        | `test_lp_write.py`                | `model.to_file(...)` — LP / MPS serialization                |\n",
-    "| `netcdf`          | `test_netcdf.py`                  | `to_netcdf` and `read_netcdf` round-trip                     |\n",
-    "| `solver_handoff`  | `test_solver_handoff.py`          | `lp.io.to_highspy` / `to_gurobipy` / `to_mosek` / `to_xpress` |\n",
-    "| (PyPSA scenario)  | `test_pypsa_carbon_management.py` | `set_names` / `freeze_constraints` variants — *optional, needs `pypsa`* |\n",
-    "\n",
-    "Out of scope: solver algorithm runtime (i.e. `Solver.solve()`),\n",
-    "cross-solver ranking, nonlinear suites.\n",
-    "\n",
-    "`pypsa` is an optional benchmark dependency — install it (`pip install pypsa`)\n",
-    "if you want the `pypsa_scigrid` registry spec and the carbon-management\n",
-    "scenario to run; otherwise both skip gracefully."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "3",
-   "metadata": {},
-   "source": [
-    "## The registry\n",
-    "\n",
-    "### Import\n",
-    "\n",
-    "Single entry point: `from benchmarks import REGISTRY` plus whichever\n",
-    "feature / phase tags you need for filtering. The cell below also defines\n",
-    "a `show_help(...)` helper used later to embed live `--help` output."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# The benchmark suite isn't shipped in the linopy wheel — it lives in-tree.\n",
-    "# Walk up from cwd to find the repo root and put it on sys.path so the\n",
-    "# import resolves whether jupyter was launched from the repo root, the\n",
-    "# notebooks directory, or anywhere in between.\n",
-    "import os\n",
-    "import subprocess\n",
-    "import sys\n",
-    "from pathlib import Path\n",
-    "\n",
-    "_p = Path.cwd()\n",
-    "while _p != _p.parent:\n",
-    "    if (_p / \"benchmarks\" / \"registry.py\").exists():\n",
-    "        if str(_p) not in sys.path:\n",
-    "            sys.path.insert(0, str(_p))\n",
-    "        break\n",
-    "    _p = _p.parent\n",
-    "\n",
-    "from benchmarks import (  # noqa: E402\n",
-    "    INTEGER,\n",
-    "    QUADRATIC,\n",
-    "    REGISTRY,\n",
-    "    TO_GUROBIPY,\n",
-    "    filter_by,\n",
-    "    get,\n",
-    ")\n",
-    "\n",
-    "\n",
-    "def show_help(*subcommand: str) -> None:\n",
-    "    \"\"\"\n",
-    "    Shell out to ``python -m benchmarks ... --help`` and print the output.\n",
-    "\n",
-    "    Subprocesses don't inherit ``sys.path`` so we forward the repo root via\n",
-    "    PYTHONPATH. ``NO_COLOR=1`` makes rich emit plain text suitable for the\n",
-    "    notebook's text-output mechanism.\n",
-    "    \"\"\"\n",
-    "    cmd = [sys.executable, \"-m\", \"benchmarks\", *subcommand, \"--help\"]\n",
-    "    env = {**os.environ, \"PYTHONPATH\": str(_p), \"NO_COLOR\": \"1\"}\n",
-    "    result = subprocess.run(\n",
-    "        cmd,\n",
-    "        capture_output=True,\n",
-    "        text=True,\n",
-    "        env=env,\n",
-    "        check=True,\n",
-    "    )\n",
-    "    print(result.stdout)\n",
-    "\n",
-    "\n",
-    "print(f\"{len(REGISTRY)} models registered: {sorted(REGISTRY)}\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "5",
-   "metadata": {},
-   "source": [
-    "### Look up by name\n",
-    "\n",
-    "`REGISTRY[name]` returns a `ModelSpec` (frozen dataclass). Evaluating it\n",
-    "in Jupyter renders an attribute table via `_repr_html_`; `__repr__` gives\n",
-    "a one-line summary in scripts and `pytest -v` output."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "spec = REGISTRY[\"basic\"]\n",
-    "spec"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "7",
-   "metadata": {},
-   "source": [
-    "`.build(size)` constructs and returns a `linopy.Model`. Models pick up\n",
-    "their own repr from linopy:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "spec.build(50)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "9",
-   "metadata": {},
-   "source": [
-    "`get(\"name\")` is a functional equivalent to `REGISTRY[name]` — handy when\n",
-    "you don't want to import `REGISTRY` directly."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "10",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "assert get(\"basic\") is REGISTRY[\"basic\"]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "11",
-   "metadata": {},
-   "source": [
-    "### Iterate\n",
-    "\n",
-    "`REGISTRY.values()` yields every spec. Useful for sweeping your own\n",
-    "regression logic or any operation that should hold across every model."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "12",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "list(REGISTRY.values())"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "13",
-   "metadata": {},
-   "source": [
-    "### Filter\n",
-    "\n",
-    "`filter_by(has_feature=...)` and `filter_by(has_phase=...)` narrow to\n",
-    "specs that declare a given feature or phase. Tags exported from\n",
-    "`benchmarks`: `CONTINUOUS`, `BINARY`, `INTEGER`, `QUADRATIC`, `SOS`,\n",
-    "`PIECEWISE`, `MASKED`, plus `BUILD`, `MATRICES`, `LP_WRITE`, `NETCDF`,\n",
-    "`SOLVER_BUILD`, `TO_HIGHSPY`, `TO_GUROBIPY`, `TO_MOSEK`, `TO_XPRESS`."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "14",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "filter_by(has_feature=QUADRATIC)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "15",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "filter_by(has_feature=INTEGER)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "16",
-   "metadata": {},
-   "source": [
-    "Useful for solver-specific tests — find every spec that declares the\n",
-    "Gurobi handoff phase (i.e. claims Gurobi can ingest it):"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "17",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "filter_by(has_phase=TO_GUROBIPY)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "18",
-   "metadata": {},
-   "source": [
-    "## Reuse patterns\n",
-    "\n",
-    "### Parametrize your own pytest\n",
-    "\n",
-    "The same pattern the suite uses internally — `iter_params(phase)`\n",
-    "flattens `(spec, size)` pairs for one phase, and `param_ids(...)` builds\n",
-    "stable pytest IDs:\n",
-    "\n",
-    "```python\n",
-    "import pytest\n",
-    "from benchmarks import BUILD, iter_params, param_ids\n",
-    "\n",
-    "_PARAMS = iter_params(BUILD)\n",
-    "\n",
-    "@pytest.mark.parametrize(\"spec,size\", _PARAMS, ids=param_ids(_PARAMS))\n",
-    "def test_my_invariant(spec, size):\n",
-    "    m = spec.build(size)\n",
-    "    # ... assertion that should hold for every model\n",
-    "```"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "19",
-   "metadata": {},
-   "source": [
-    "### Spot-profile memory\n",
-    "\n",
-    "`tracemalloc` is a fast, in-process spot check. For real measurement\n",
-    "(peak RSS, separate process per benchmark) use\n",
-    "`python -m benchmarks memory save / compare`, which routes through\n",
-    "pytest-memray."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "20",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import tracemalloc  # noqa: E402\n",
-    "\n",
-    "tracemalloc.start()\n",
-    "m = REGISTRY[\"sparse_network\"].build(100)\n",
-    "_current, peak = tracemalloc.get_traced_memory()\n",
-    "tracemalloc.stop()\n",
-    "\n",
-    "print(f\"sparse_network n=100: peak allocation ≈ {peak / 1e6:.1f} MB\")\n",
-    "print(\n",
-    "    f\"  {m.variables.nvars} scalar variables, {m.constraints.ncons} scalar constraints\"\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "21",
-   "metadata": {},
-   "source": [
-    "## Running\n",
-    "\n",
-    "All commands are subcommands of `python -m benchmarks`. The CLI is\n",
-    "self-documenting; the cells below embed its `--help` output live. If you\n",
-    "change a flag in `benchmarks/cli.py`, re-running this notebook updates\n",
-    "the documentation automatically.\n",
-    "\n",
-    "Three size tiers gate cost. Each spec declares its own thresholds:\n",
-    "\n",
-    "| Mode         | Sizes included            | Wall-clock |\n",
-    "| ------------ | ------------------------- | ---------- |\n",
-    "| `smoke`      | `size <= quick_threshold` | ~18 s      |\n",
-    "| `run`        | `size <= long_threshold`  | ~45 s      |\n",
-    "| `run --long` | all sizes                 | ~2 min     |\n",
-    "\n",
-    "Top-level command menu:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "22",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "show_help()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "23",
-   "metadata": {},
-   "source": [
-    "### Timing snapshots\n",
-    "\n",
-    "`run` is the main timing entry point. Its flags:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "24",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "show_help(\"run\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "25",
-   "metadata": {},
-   "source": [
-    "Use `--json` to save pytest-benchmark's output for later diffing — the\n",
-    "JSON includes per-test min / median / IQR over multiple iterations:\n",
-    "\n",
-    "```bash\n",
-    "# baseline (e.g. on master)\n",
-    "python -m benchmarks run --json .benchmarks/master.json\n",
-    "\n",
-    "# candidate (e.g. on your branch)\n",
-    "python -m benchmarks run --json .benchmarks/my-feature.json\n",
-    "\n",
-    "# pytest-benchmark ships its own diff tool:\n",
-    "pytest-benchmark compare .benchmarks/master.json .benchmarks/my-feature.json\n",
-    "```\n",
-    "\n",
-    "IQR is the metric to trust on short runs — it stays stable across noise\n",
-    "in a way that min / mean can't."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "26",
-   "metadata": {},
-   "source": [
-    "### Memory snapshots\n",
-    "\n",
-    "`memory save` runs the build phase under pytest-memray in a separate\n",
-    "process per benchmark (so peak RSS doesn't accumulate across tests) and\n",
-    "writes JSON. `memory compare` diffs two snapshots:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "27",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "show_help(\"memory\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "28",
-   "metadata": {},
-   "source": [
-    "```bash\n",
-    "python -m benchmarks memory save master\n",
-    "python -m benchmarks memory save \"$(git rev-parse --short HEAD)\"\n",
-    "python -m benchmarks memory compare master \"$(git rev-parse --short HEAD)\"\n",
-    "```\n",
-    "\n",
-    "Memory is build-only because later phases include build allocations and\n",
-    "attribution becomes unreliable."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "29",
-   "metadata": {},
-   "source": [
-    "### Cross-version sweep\n",
-    "\n",
-    "`sweep` bootstraps perf history against published linopy releases. For\n",
-    "each version it builds a fresh uv venv, installs the lockfile + that\n",
-    "linopy, runs the suite, and saves a JSON snapshot."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "30",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "show_help(\"sweep\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "31",
-   "metadata": {},
-   "source": [
-    "```bash\n",
-    "python -m benchmarks sweep 0.5.0 0.6.0 0.7.0 \\\n",
-    "    --output-dir .benchmarks/sweep\n",
-    "```\n",
-    "\n",
-    "The current (repo-tip) benchmark code runs against each linopy version,\n",
-    "so the measurement layer is constant. Specs whose APIs aren't present in\n",
-    "older linopy (currently `sos` and `piecewise`) skip themselves gracefully\n",
-    "via the `_API_AVAILABLE` gate at registration time."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "32",
-   "metadata": {},
-   "source": [
-    "## Plotting — end-to-end demo\n",
-    "\n",
-    "The CLI `plot` subcommand wraps the same functions that\n",
-    "`benchmarks.plotting` exposes — call them directly here to render\n",
-    "inline. Below we generate two real `--quick` snapshots (≈ 20 s each in\n",
-    "CI), then walk the two two-snapshot views.\n",
-    "\n",
-    "The diff between the two runs is just measurement noise — that's\n",
-    "expected. On a real PR you'd compare master against your branch."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "33",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import tempfile  # noqa: E402\n",
-    "\n",
-    "from tqdm.auto import tqdm  # noqa: E402\n",
-    "\n",
-    "# Focused run: --quick + just the build phase. ~10 s per snapshot, two\n",
-    "# runs ≈ 20 s. tqdm shows live progress as each subprocess finishes.\n",
-    "_plot_tmp = Path(tempfile.mkdtemp(prefix=\"benchmarks-plot-demo-\"))\n",
-    "_env = {**os.environ, \"PYTHONPATH\": str(_p)}\n",
-    "\n",
-    "\n",
-    "def _run_benchmark(label: str) -> Path:\n",
-    "    snap = _plot_tmp / f\"{label}.json\"\n",
-    "    subprocess.run(\n",
-    "        [\n",
-    "            sys.executable,\n",
-    "            \"-m\",\n",
-    "            \"benchmarks\",\n",
-    "            \"run\",\n",
-    "            \"--quick\",\n",
-    "            \"--phase\",\n",
-    "            \"build\",\n",
-    "            \"--json\",\n",
-    "            str(snap),\n",
-    "        ],\n",
-    "        env=_env,\n",
-    "        cwd=str(_p),  # subprocesses inherit notebook cwd; pin to repo root\n",
-    "        check=True,\n",
-    "        capture_output=True,\n",
-    "    )\n",
-    "    return snap\n",
-    "\n",
-    "\n",
-    "labels = (\"baseline\", \"candidate\")\n",
-    "snaps = {\n",
-    "    label: _run_benchmark(label)\n",
-    "    for label in tqdm(labels, desc=\"benchmark runs\", unit=\"run\")\n",
-    "}\n",
-    "baseline_snap, candidate_snap = snaps[\"baseline\"], snaps[\"candidate\"]\n",
-    "print(f\"baseline:  {baseline_snap.stat().st_size // 1024} KB\")\n",
-    "print(f\"candidate: {candidate_snap.stat().st_size // 1024} KB\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "34",
-   "metadata": {},
-   "source": [
-    "### Rendering via the CLI — scatter view\n",
-    "\n",
-    "`python -m benchmarks plot` writes HTML to disk by default. The helper\n",
-    "below shells out to that exact command and inlines the generated file\n",
-    "via `IPython.display.HTML` — same CLI path users hit from the terminal,\n",
-    "just rendered in the notebook. Pass the arguments as a single string,\n",
-    "exactly how you'd type them on the command line.\n",
-    "\n",
-    "The scatter view (`--view scatter`) is the recommended exploratory plot\n",
-    "for regression hunting: x = baseline cost on a log axis, y = ratio,\n",
-    "colour = absolute Δ. Top-right = slow tests that got slower (the \"fix\n",
-    "this\" zone). Top-left = noisy microbenchmarks. Bottom-right =\n",
-    "already-slow tests that didn't move."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "35",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import shlex  # noqa: E402\n",
-    "\n",
-    "from IPython.display import HTML  # noqa: E402\n",
-    "\n",
-    "\n",
-    "def bench(cli: str) -> HTML | None:\n",
-    "    \"\"\"\n",
-    "    Run a ``python -m benchmarks ...`` command exactly as you'd type it.\n",
-    "\n",
-    "    The leading ``python -m benchmarks`` (or just ``benchmarks``) is\n",
-    "    optional — strip whichever you prefer. For ``plot`` subcommands the\n",
-    "    output is captured into a tempdir and returned as inlinable\n",
-    "    ``HTML``; other subcommands print their stdout and return ``None``::\n",
-    "\n",
-    "        bench(f\"python -m benchmarks plot --view scatter {baseline_snap} {candidate_snap}\")\n",
-    "    \"\"\"\n",
-    "    cli = cli.removeprefix(\"python -m benchmarks \").removeprefix(\"benchmarks \")\n",
-    "    args = shlex.split(cli)\n",
-    "    cmd = [sys.executable, \"-m\", \"benchmarks\", *args]\n",
-    "\n",
-    "    if args and args[0] == \"plot\" and not (set(args) & {\"-o\", \"--output\"}):\n",
-    "        # Inject -o so we can read the result back for inline rendering.\n",
-    "        out = _plot_tmp / \"out.html\"\n",
-    "        cmd += [\"-o\", str(out)]\n",
-    "        subprocess.run(cmd, env=_env, cwd=str(_p), check=True, capture_output=True)\n",
-    "        return HTML(out.read_text())\n",
-    "\n",
-    "    result = subprocess.run(\n",
-    "        cmd, env=_env, cwd=str(_p), check=True, capture_output=True, text=True\n",
-    "    )\n",
-    "    print(result.stdout)\n",
-    "    return None\n",
-    "\n",
-    "\n",
-    "bench(f\"python -m benchmarks plot --view scatter {baseline_snap} {candidate_snap}\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "36",
-   "metadata": {},
-   "source": [
-    "### Compare view — bar chart alternative\n",
-    "\n",
-    "Same `cli_plot` helper, different `--view`. The compare bar chart is\n",
-    "useful when you want a sorted list of per-test deltas (in seconds by\n",
-    "default; pass `--sort relative` for percent)."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "37",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "bench(f\"python -m benchmarks plot --view compare {baseline_snap} {candidate_snap}\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "38",
-   "metadata": {},
-   "source": [
-    "### Faceting — one subplot per phase\n",
-    "\n",
-    "`--facets phase` splits the chart so all `test_build[*]` tests sit in\n",
-    "one subplot, all `test_lp_write[*]` in another, etc. Best for \"did\n",
-    "something move across this whole phase?\" Works on both `--view compare`\n",
-    "and `--view scatter`; `--facets model` does the same split keyed by the\n",
-    "parametrize model name. Shared axis labels and a tidy 2-column wrap\n",
-    "keep the layout legible."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "39",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "bench(\n",
-    "    f\"python -m benchmarks plot --view scatter --facets phase \"\n",
-    "    f\"{baseline_snap} {candidate_snap}\"\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "40",
-   "metadata": {},
-   "source": [
-    "## Extending\n",
-    "\n",
-    "### Adding a new model\n",
-    "\n",
-    "1. Drop `benchmarks/models/<name>.py` with a `build_<name>(size) -> Model`.\n",
-    "2. Build a `ModelSpec` and call `register(...)` at module scope. Declare\n",
-    "   realistic `quick_threshold` / `long_threshold` so the smoke stays fast.\n",
-    "3. Add an import in `benchmarks/models/__init__.py` so registration fires.\n",
-    "\n",
-    "Every phase test picks the spec up automatically through\n",
-    "`iter_params(phase)`."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "41",
-   "metadata": {},
-   "source": [
-    "### Regenerating the lockfile\n",
-    "\n",
-    "After bumping pins in `pyproject.toml`'s `[benchmarks]` extra, regenerate\n",
-    "`benchmarks/requirements.lock`:\n",
-    "\n",
-    "```bash\n",
-    "uv pip compile pyproject.toml \\\n",
-    "    --extra benchmarks --extra dev --extra solvers \\\n",
-    "    --no-emit-package linopy \\\n",
-    "    -o benchmarks/requirements.lock\n",
-    "```\n",
-    "\n",
-    "The `--no-emit-package linopy` exclusion is critical — without it, the\n",
-    "lockfile pins linopy itself and `sweep` can't vary it."
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.11"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
index fa93078f..45c2b010 100644
--- a/benchmarks/plotting.py
+++ b/benchmarks/plotting.py
@@ -10,9 +10,9 @@
   size-parametrized tests, faceted by phase.
 
 All three accept a ``metric`` argument selecting which pytest-benchmark
-stat drives the plot. Default is ``min`` — for microbenchmarks the
-lowest observed time is closest to the "true" cost (noise can only slow
-things down). ``median`` is more robust to a single weirdly-fast warmup
+stat drives the plot. Default is ``min`` — the fastest observed sample
+approximates the no-noise floor (GC, scheduling, cache thrash can only
+add time). ``median`` is more robust to a single weirdly-fast warmup
 round; ``mean`` and ``max`` are also accepted.
 
 plotly is imported lazily by the dispatcher so the rest of the benchmark
@@ -342,10 +342,11 @@ def plot_scatter(
     Designed as the single best exploratory plot for regression hunting
     across tests of wildly different magnitudes: a point lights up as
     "fix this" only if it sits in the top-right corner — slow tests
-    that got slower. Top-left (big ratio, tiny absolute) reads as
-    microbenchmark noise; bottom-right (big absolute, tiny ratio) is
-    already-slow-but-unchanged. The combined position resolves the
-    tension that pure relative or pure absolute sort each blind-spot.
+    that got slower. Top-left (big ratio, tiny absolute) is a cheap
+    test with noisy ratio swings — not a real change. Bottom-right (big
+    absolute, tiny ratio) is already-slow-but-unchanged. The combined
+    position covers the blind spots that a purely relative or a purely
+    absolute sort would each have on its own.
 
     The first snapshot is the baseline. With 2 snapshots, a static
     scatter is drawn; with 3+, every subsequent snapshot becomes an
diff --git a/benchmarks/requirements.lock b/benchmarks/requirements.lock
deleted file mode 100644
index 72fbea79..00000000
--- a/benchmarks/requirements.lock
+++ /dev/null
@@ -1,513 +0,0 @@
-# This file was autogenerated by uv via the following command:
-#    uv pip compile pyproject.toml --extra benchmarks --extra dev --extra solvers --no-emit-package linopy -o benchmarks/requirements.lock
-annotated-doc==0.0.4
-    # via typer
-anyio==4.13.0
-    # via
-    #   httpx
-    #   jupyter-server
-appnope==0.1.4
-    # via ipykernel
-argon2-cffi==25.1.0
-    # via jupyter-server
-argon2-cffi-bindings==25.1.0
-    # via argon2-cffi
-arrow==1.4.0
-    # via isoduration
-ast-serialize==0.5.0
-    # via mypy
-asttokens==3.0.1
-    # via stack-data
-async-lru==2.3.0
-    # via jupyterlab
-attrs==26.1.0
-    # via
-    #   jsonschema
-    #   referencing
-babel==2.18.0
-    # via jupyterlab-server
-bcrypt==5.0.0
-    # via paramiko
-beautifulsoup4==4.14.3
-    # via nbconvert
-bleach==6.3.0
-    # via nbconvert
-bottleneck==1.6.0
-    # via linopy (pyproject.toml)
-certifi==2026.5.20
-    # via
-    #   httpcore
-    #   httpx
-    #   netcdf4
-    #   requests
-cffi==2.0.0
-    # via
-    #   argon2-cffi-bindings
-    #   cryptography
-    #   pynacl
-cfgv==3.5.0
-    # via pre-commit
-cftime==1.6.5
-    # via netcdf4
-charset-normalizer==3.4.7
-    # via requests
-click==8.4.1
-    # via dask
-cloudpickle==3.1.2
-    # via dask
-comm==0.2.3
-    # via
-    #   ipykernel
-    #   ipywidgets
-coptpy==8.0.4
-    # via linopy (pyproject.toml)
-coverage==7.14.1
-    # via pytest-cov
-cryptography==48.0.0
-    # via
-    #   paramiko
-    #   types-paramiko
-dask==2026.3.0
-    # via linopy (pyproject.toml)
-debugpy==1.8.20
-    # via ipykernel
-decorator==5.3.1
-    # via ipython
-defusedxml==0.7.1
-    # via nbconvert
-deprecation==2.1.0
-    # via linopy (pyproject.toml)
-distlib==0.4.0
-    # via virtualenv
-executing==2.2.1
-    # via stack-data
-fastjsonschema==2.21.2
-    # via nbformat
-filelock==3.29.0
-    # via
-    #   python-discovery
-    #   virtualenv
-fqdn==1.5.1
-    # via jsonschema
-fsspec==2026.4.0
-    # via dask
-gurobipy==13.0.2
-    # via linopy (pyproject.toml)
-h11==0.16.0
-    # via httpcore
-highspy==1.13.1
-    # via linopy (pyproject.toml)
-httpcore==1.0.9
-    # via httpx
-httpx==0.28.1
-    # via jupyterlab
-identify==2.6.19
-    # via pre-commit
-idna==3.16
-    # via
-    #   anyio
-    #   httpx
-    #   jsonschema
-    #   requests
-importlib-metadata==9.0.0
-    # via dask
-iniconfig==2.3.0
-    # via pytest
-invoke==3.0.3
-    # via paramiko
-ipykernel==7.2.0
-    # via
-    #   jupyter
-    #   jupyter-console
-    #   jupyterlab
-ipython==9.13.0
-    # via
-    #   ipykernel
-    #   ipywidgets
-    #   jupyter-console
-ipython-pygments-lexers==1.1.1
-    # via ipython
-ipywidgets==8.1.8
-    # via jupyter
-isoduration==20.11.0
-    # via jsonschema
-jedi==0.20.0
-    # via ipython
-jinja2==3.1.6
-    # via
-    #   jupyter-server
-    #   jupyterlab
-    #   jupyterlab-server
-    #   memray
-    #   nbconvert
-json5==0.14.0
-    # via jupyterlab-server
-jsonpointer==3.1.1
-    # via jsonschema
-jsonschema==4.26.0
-    # via
-    #   jupyter-events
-    #   jupyterlab-server
-    #   nbformat
-jsonschema-specifications==2025.9.1
-    # via jsonschema
-jupyter==1.1.1
-    # via linopy (pyproject.toml)
-jupyter-client==8.8.0
-    # via
-    #   ipykernel
-    #   jupyter-console
-    #   jupyter-server
-    #   nbclient
-jupyter-console==6.6.3
-    # via jupyter
-jupyter-core==5.9.1
-    # via
-    #   ipykernel
-    #   jupyter-client
-    #   jupyter-console
-    #   jupyter-server
-    #   jupyterlab
-    #   nbclient
-    #   nbconvert
-    #   nbformat
-jupyter-events==0.12.1
-    # via jupyter-server
-jupyter-lsp==2.3.1
-    # via jupyterlab
-jupyter-server==2.18.2
-    # via
-    #   jupyter-lsp
-    #   jupyterlab
-    #   jupyterlab-server
-    #   notebook
-    #   notebook-shim
-jupyter-server-terminals==0.5.4
-    # via jupyter-server
-jupyterlab==4.5.7
-    # via
-    #   jupyter
-    #   notebook
-jupyterlab-pygments==0.3.0
-    # via nbconvert
-jupyterlab-server==2.28.0
-    # via
-    #   jupyterlab
-    #   notebook
-jupyterlab-widgets==3.0.16
-    # via ipywidgets
-knitro==15.1.0
-    # via linopy (pyproject.toml)
-lark==1.3.1
-    # via rfc3987-syntax
-librt==0.11.0
-    # via mypy
-linkify-it-py==2.1.0
-    # via markdown-it-py
-locket==1.0.0
-    # via partd
-markdown-it-py==4.2.0
-    # via
-    #   mdit-py-plugins
-    #   rich
-    #   textual
-markupsafe==3.0.3
-    # via
-    #   jinja2
-    #   nbconvert
-matplotlib-inline==0.2.2
-    # via
-    #   ipykernel
-    #   ipython
-mdit-py-plugins==0.6.1
-    # via textual
-mdurl==0.1.2
-    # via markdown-it-py
-memray==1.19.3
-    # via pytest-memray
-mindoptpy==2.3.0
-    # via linopy (pyproject.toml)
-mistune==3.2.1
-    # via nbconvert
-mosek==11.2.0
-    # via linopy (pyproject.toml)
-mypy==2.1.0
-    # via linopy (pyproject.toml)
-mypy-extensions==1.1.0
-    # via mypy
-narwhals==2.21.2
-    # via plotly
-nbclient==0.10.4
-    # via nbconvert
-nbconvert==7.17.1
-    # via
-    #   linopy (pyproject.toml)
-    #   jupyter
-    #   jupyter-server
-nbformat==5.10.4
-    # via
-    #   jupyter-server
-    #   nbclient
-    #   nbconvert
-nest-asyncio==1.6.0
-    # via ipykernel
-netcdf4==1.7.4
-    # via linopy (pyproject.toml)
-nodeenv==1.10.0
-    # via pre-commit
-notebook==7.5.6
-    # via jupyter
-notebook-shim==0.2.4
-    # via
-    #   jupyterlab
-    #   notebook
-numexpr==2.14.1
-    # via linopy (pyproject.toml)
-numpy==2.4.6
-    # via
-    #   linopy (pyproject.toml)
-    #   bottleneck
-    #   cftime
-    #   highspy
-    #   mindoptpy
-    #   mosek
-    #   netcdf4
-    #   numexpr
-    #   pandas
-    #   scipy
-    #   xarray
-overrides==7.7.0
-    # via jupyter-server
-packaging==26.2
-    # via
-    #   linopy (pyproject.toml)
-    #   dask
-    #   deprecation
-    #   ipykernel
-    #   jupyter-events
-    #   jupyter-server
-    #   jupyterlab
-    #   jupyterlab-server
-    #   nbconvert
-    #   plotly
-    #   pytest
-    #   xarray
-pandas==3.0.3
-    # via xarray
-pandocfilters==1.5.1
-    # via nbconvert
-paramiko==5.0.0
-    # via linopy (pyproject.toml)
-parso==0.8.7
-    # via jedi
-partd==1.4.2
-    # via dask
-pathspec==1.1.1
-    # via mypy
-pexpect==4.9.0
-    # via ipython
-platformdirs==4.10.0
-    # via
-    #   jupyter-core
-    #   python-discovery
-    #   textual
-    #   virtualenv
-plotly==6.7.0
-    # via linopy (pyproject.toml)
-pluggy==1.6.0
-    # via
-    #   pytest
-    #   pytest-cov
-polars==1.41.1
-    # via linopy (pyproject.toml)
-polars-runtime-32==1.41.1
-    # via polars
-pre-commit==4.6.0
-    # via linopy (pyproject.toml)
-prometheus-client==0.25.0
-    # via jupyter-server
-prompt-toolkit==3.0.52
-    # via
-    #   ipython
-    #   jupyter-console
-psutil==7.2.2
-    # via
-    #   ipykernel
-    #   ipython
-ptyprocess==0.7.0
-    # via
-    #   pexpect
-    #   terminado
-pure-eval==0.2.3
-    # via stack-data
-py-cpuinfo==9.0.0
-    # via pytest-benchmark
-pycparser==3.0
-    # via cffi
-pygments==2.20.0
-    # via
-    #   ipython
-    #   ipython-pygments-lexers
-    #   jupyter-console
-    #   nbconvert
-    #   pytest
-    #   rich
-    #   textual
-pynacl==1.6.2
-    # via paramiko
-pytest==9.0.3
-    # via
-    #   linopy (pyproject.toml)
-    #   pytest-benchmark
-    #   pytest-cov
-    #   pytest-memray
-pytest-benchmark==5.2.3
-    # via linopy (pyproject.toml)
-pytest-cov==7.1.0
-    # via linopy (pyproject.toml)
-pytest-memray==1.8.0
-    # via linopy (pyproject.toml)
-python-dateutil==2.9.0.post0
-    # via
-    #   arrow
-    #   jupyter-client
-    #   pandas
-python-discovery==1.4.0
-    # via virtualenv
-python-json-logger==4.1.0
-    # via jupyter-events
-pyyaml==6.0.3
-    # via
-    #   dask
-    #   jupyter-events
-    #   pre-commit
-pyzmq==27.1.0
-    # via
-    #   ipykernel
-    #   jupyter-client
-    #   jupyter-console
-    #   jupyter-server
-referencing==0.37.0
-    # via
-    #   jsonschema
-    #   jsonschema-specifications
-    #   jupyter-events
-requests==2.34.2
-    # via jupyterlab-server
-rfc3339-validator==0.1.4
-    # via
-    #   jsonschema
-    #   jupyter-events
-rfc3986-validator==0.1.1
-    # via
-    #   jsonschema
-    #   jupyter-events
-rfc3987-syntax==1.1.0
-    # via jsonschema
-rich==15.0.0
-    # via
-    #   memray
-    #   textual
-    #   typer
-rpds-py==2026.5.1
-    # via
-    #   jsonschema
-    #   referencing
-scipy==1.17.1
-    # via
-    #   linopy (pyproject.toml)
-    #   mindoptpy
-send2trash==2.1.0
-    # via jupyter-server
-setuptools==82.0.1
-    # via jupyterlab
-shellingham==1.5.4
-    # via typer
-six==1.17.0
-    # via
-    #   python-dateutil
-    #   rfc3339-validator
-soupsieve==2.8.4
-    # via beautifulsoup4
-stack-data==0.6.3
-    # via ipython
-terminado==0.18.1
-    # via
-    #   jupyter-server
-    #   jupyter-server-terminals
-textual==8.2.7
-    # via memray
-tinycss2==1.4.0
-    # via bleach
-toolz==1.1.0
-    # via
-    #   linopy (pyproject.toml)
-    #   dask
-    #   partd
-tornado==6.5.6
-    # via
-    #   ipykernel
-    #   jupyter-client
-    #   jupyter-server
-    #   jupyterlab
-    #   notebook
-    #   terminado
-tqdm==4.67.3
-    # via linopy (pyproject.toml)
-traitlets==5.15.0
-    # via
-    #   ipykernel
-    #   ipython
-    #   ipywidgets
-    #   jupyter-client
-    #   jupyter-console
-    #   jupyter-core
-    #   jupyter-events
-    #   jupyter-server
-    #   jupyterlab
-    #   matplotlib-inline
-    #   nbclient
-    #   nbconvert
-    #   nbformat
-typer==0.26.2
-    # via linopy (pyproject.toml)
-types-paramiko==4.0.0.20260518
-    # via linopy (pyproject.toml)
-types-requests==2.33.0.20260518
-    # via linopy (pyproject.toml)
-typing-extensions==4.15.0
-    # via
-    #   anyio
-    #   beautifulsoup4
-    #   ipython
-    #   mypy
-    #   referencing
-    #   textual
-tzdata==2026.2
-    # via arrow
-uc-micro-py==2.0.0
-    # via linkify-it-py
-uri-template==1.3.0
-    # via jsonschema
-urllib3==2.7.0
-    # via
-    #   requests
-    #   types-requests
-virtualenv==21.4.1
-    # via pre-commit
-wcwidth==0.7.0
-    # via prompt-toolkit
-webcolors==25.10.0
-    # via jsonschema
-webencodings==0.5.1
-    # via
-    #   bleach
-    #   tinycss2
-websocket-client==1.9.0
-    # via jupyter-server
-widgetsnbextension==4.0.15
-    # via ipywidgets
-xarray==2026.4.0
-    # via linopy (pyproject.toml)
-zipp==4.1.0
-    # via importlib-metadata
diff --git a/pyproject.toml b/pyproject.toml
index 38d39b8a..2d79bec0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -116,6 +116,7 @@ benchmarks = [
     "pytest-benchmark==5.2.3",
     "pytest-memray==1.8.0",
     "pytest-codspeed==5.0.3",
+    "jupytext==1.17.4",
     "nbconvert==7.17.1",
     "typer==0.26.2",
     "plotly==6.7.0",

From 3981cad1af415c5e3646ec2cde250b274bbeab26 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 22:28:00 +0200
Subject: [PATCH 46/68] benchmarks: add CLI walkthrough as Jupytext MyST
 notebook
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

End-to-end walkthrough of the benchmarks CLI: registry introspection
(``list --details``, ``show``, ``filter``), a two-snapshot regression
workflow (``run --quick --phase build`` → ``compare`` table → ``plot
--view scatter`` / ``--view compare`` rendered inline via
``IPython.display.HTML``), peak-RSS snapshots (``memory save`` / ``memory
compare``), an "other CLI surfaces" reference table, and the "add a new
model" three-step recipe.

The file is the load-bearing documentation for the suite — README only
covers install and how to open it. CI executes it on every PR via
``python -m benchmarks notebook`` so the examples can't silently rot.
Contributors regenerate the gitignored ``walkthrough.ipynb`` sibling via
``python -m benchmarks notebook --build`` and open it in JupyterLab,
PyCharm, or VSCode.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/walkthrough.md | 230 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 230 insertions(+)
 create mode 100644 benchmarks/walkthrough.md

diff --git a/benchmarks/walkthrough.md b/benchmarks/walkthrough.md
new file mode 100644
index 00000000..1e9301e2
--- /dev/null
+++ b/benchmarks/walkthrough.md
@@ -0,0 +1,230 @@
+---
+jupytext:
+  text_representation:
+    extension: .md
+    format_name: myst
+    format_version: 0.13
+    jupytext_version: 1.19.3
+kernelspec:
+  display_name: Python 3 (ipykernel)
+  language: python
+  name: python3
+---
+
+# Linopy benchmarks — CLI walkthrough
+
+> ⚠️ **This file is the source. Don't edit the `.ipynb` directly.**
+> Run `python -m benchmarks notebook --build` to (re)generate
+> `walkthrough.ipynb` from this `.md`, then open the `.ipynb` in
+> JupyterLab / PyCharm / VSCode to view and run cells. To change the
+> walkthrough's content, edit the `.md`, then re-run `--build`. The
+> `.ipynb` is gitignored.
+
+Internal performance tracking for `linopy`. This notebook shows the
+typer CLI working end-to-end: introspect what's registered, run a
+timing snapshot, diff two snapshots, render the comparison views
+inline.
+
+What this notebook deliberately doesn't duplicate:
+
+- **Install + size tiers** → [`benchmarks/README.md`](README.md)
+- **Every CLI flag** → `python -m benchmarks --help` (rich-rendered);
+  `--help` on any subcommand drills in.
+
+## What's measured
+
+| Phase            | Test file                         | Measures                                                       |
+| ---------------- | --------------------------------- | -------------------------------------------------------------- |
+| `build`          | `test_build.py`                   | constructing variables / expressions / constraints / objective |
+| `matrices`       | `test_matrices.py`                | `A`, `b`, `c`, bounds, labels, `Q` for QP                      |
+| `lp_write`       | `test_lp_write.py`                | `model.to_file(...)` — LP / MPS serialization                  |
+| `netcdf`         | `test_netcdf.py`                  | `to_netcdf` / `read_netcdf` round-trip                         |
+| `solver_handoff` | `test_solver_handoff.py`          | `lp.io.to_highspy` / `to_gurobipy` / `to_mosek` / `to_xpress`  |
+| end-to-end       | `test_pypsa_carbon_management.py` | fixed PyPSA model → highspy; sweeps `freeze_constraints`       |
+
+Solver algorithm runtime is intentionally out of scope.
+
+## Setup
+
+Locate the repo so the shell cells below can run `python -m benchmarks`
+regardless of where Jupyter was launched, and pick a tempdir for the
+snapshot/plot files we'll produce.
+
+```{code-cell} ipython3
+import os
+import sys
+import tempfile
+from pathlib import Path
+
+# CI sets LINOPY_REPO_ROOT; locally we walk up from cwd.
+_root = os.environ.get("LINOPY_REPO_ROOT") or next(
+    (
+        str(p) for p in [Path.cwd().resolve(), *Path.cwd().resolve().parents]
+        if (p / "benchmarks" / "registry.py").exists()
+    ),
+    None,
+)
+if _root is None:
+    raise RuntimeError(
+        "Could not locate linopy repo root. Set LINOPY_REPO_ROOT or launch "
+        "Jupyter from somewhere inside the repo."
+    )
+
+# Subshells launched by ``!``-cells inherit cwd, env, and PYTHONPATH.
+os.chdir(_root)
+os.environ["PYTHONPATH"] = f"{_root}:{os.environ.get('PYTHONPATH', '')}"
+# Rich/click disable colour when stdout isn't a TTY (and the ``!`` pipe
+# isn't); ``FORCE_COLOR`` overrides that so typer's ``--help`` panels
+# render with colour in the notebook output.
+os.environ["FORCE_COLOR"] = "1"
+
+_tmp = Path(tempfile.mkdtemp(prefix="bench-walkthrough-"))
+baseline = _tmp / "baseline.json"
+candidate = _tmp / "candidate.json"
+scatter_html = _tmp / "scatter.html"
+compare_html = _tmp / "compare.html"
+
+print(f"repo root: {_root}")
+print(f"tempdir:   {_tmp}")
+```
+
+## Introspect the registry
+
+`list` enumerates registered specs. `--details` shows the feature tags
+and size range each spec covers, so you can pick a focused target.
+
+```{code-cell} ipython3
+!python -m benchmarks list --details
+```
+
+`show <name>` drills into one spec — every attribute the registry
+exposes, including which phases it's eligible for and the
+`quick_threshold` / `long_threshold` gating its sizes.
+
+```{code-cell} ipython3
+!python -m benchmarks show basic
+```
+
+`filter` narrows by feature tag (`quadratic`, `integer`, `sos`, …) or
+phase tag — useful when you only care about a subset of the suite.
+
+```{code-cell} ipython3
+!python -m benchmarks filter --feature quadratic
+```
+
+## Run a timing snapshot
+
+`run` is the main timing entry point. Below we run twice with
+`--quick --phase build` (~10 s each) to get a baseline / candidate
+pair we can diff. On a real PR you'd run once on `master` and once on
+your branch.
+
+```{code-cell} ipython3
+!python -m benchmarks run --quick --phase build --json {baseline}
+```
+
+```{code-cell} ipython3
+!python -m benchmarks run --quick --phase build --json {candidate}
+```
+
+The diff between two `--quick` runs of the same code is just
+measurement noise — that's expected. On a real PR the numbers below
+would actually move.
+
+## Diff snapshots
+
+### Text table — `compare`
+
+`compare` wraps `pytest-benchmark compare` with opinionated defaults:
+group by full test name, sort by `min`, show min + IQR. One mini-table
+per test with the baseline + candidate rows and a relative-speedup
+factor flagging the slower one. Scales to 30+ tests, just long output.
+
+```{code-cell} ipython3
+!python -m benchmarks compare {baseline} {candidate}
+```
+
+### Scatter view — exploratory plot
+
+x = baseline cost on a log axis, y = ratio (candidate / baseline),
+colour = absolute Δ. **Top-right = slow tests that got slower** —
+the "fix this" zone. Top-left = cheap tests with big ratio swings
+(noise, not real change). Bottom-right = already-slow tests that
+didn't move. Plotting both resolves the absolute-vs-relative tension:
+either axis alone has a blind spot.
+
+```{code-cell} ipython3
+!python -m benchmarks plot --view scatter {baseline} {candidate} -o {scatter_html}
+
+from IPython.display import HTML
+HTML(scatter_html.read_text())
+```
+
+### Compare view — sorted-Δ bar chart
+
+The "did this PR regress anything, ranked by impact" picture. Bars
+sorted by absolute time delta by default (`--sort relative` switches
+to percent). Diverging colour around zero.
+
+```{code-cell} ipython3
+!python -m benchmarks plot --view compare {baseline} {candidate} -o {compare_html}
+HTML(compare_html.read_text())
+```
+
+## Memory snapshots
+
+`memory save <label>` runs benchmarks under `memray.Tracker` and
+writes peak allocations (MiB) per `(phase, spec, size)` to
+`.benchmarks/memory/<label>.json`. The model is built **outside** the
+tracked region so peak reflects only the phase work, not model
+construction.
+
+```{code-cell} ipython3
+!python -m benchmarks memory save baseline_mem --quick --phase build
+```
+
+```{code-cell} ipython3
+!python -m benchmarks memory save candidate_mem --quick --phase build
+```
+
+`memory compare` prints a per-test table of the two labels with
+percent change — same shape as the timing `compare`, different
+metric. Tests present in only one snapshot show `—` for the missing
+column.
+
+```{code-cell} ipython3
+!python -m benchmarks memory compare baseline_mem candidate_mem
+```
+
+For cross-version memory tracking (analogous to `sweep` for timing),
+use `memory sweep <v1> <v2> ...` — same per-version venv shape, same
+peak-allocation metric as `memory save`.
+
+## Other CLI surfaces
+
+| Command                            | Purpose                                                              |
+| ---------------------------------- | -------------------------------------------------------------------- |
+| `smoke`                            | CI smoke run — every model/phase at quickest size, no timings (~20s) |
+| `run --long`                       | Full sweep including heaviest sizes (knapsack 1M, basic 1600); slow  |
+| `sweep <v1> <v2> ...`              | Build fresh venv per linopy version and run the suite in each        |
+| `memory sweep <v1> <v2> ...`       | Same shape as `sweep`, but tracks peak allocations per version       |
+| `plot --view sweep <s1> <s2> ...`  | Heatmap of ratios across 3+ snapshots                                |
+| `plot --view scaling <snap>`       | Log-log time vs `n` for size-parametrized tests, faceted by phase    |
+| `notebook`                         | Re-execute this walkthrough end-to-end (what CI runs)                |
+
+Each has its own `--help` with all flags.
+
+## Extending the suite
+
+Add a new model:
+
+1. Drop `benchmarks/models/<name>.py` with a `build_<name>(size) -> linopy.Model`.
+2. Build a `ModelSpec`, call `register(...)` at module scope, declare
+   realistic `quick_threshold` / `long_threshold` so the smoke run
+   stays fast.
+3. Import it in `benchmarks/models/__init__.py` so registration fires
+   on first import.
+
+Every phase test whose phase is in the new spec's phase set picks it
+up automatically via `iter_params(phase)`. The first introspection
+section of this notebook will list your new spec on the next run.

From 59eadb348981f9f0c8c848cad97e334d7abe9691 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 22:29:00 +0200
Subject: [PATCH 47/68] benchmarks: bump pinned jupytext to 1.19.3 (matches
 installed)

The previous pin (1.17.4) was a guess; the resolved version in the
dev environment is 1.19.3. Align pyproject so per-version sweep venvs
install the same jupytext the local dev env uses.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 2d79bec0..9b67a22d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -116,7 +116,7 @@ benchmarks = [
     "pytest-benchmark==5.2.3",
     "pytest-memray==1.8.0",
     "pytest-codspeed==5.0.3",
-    "jupytext==1.17.4",
+    "jupytext==1.19.3",
     "nbconvert==7.17.1",
     "typer==0.26.2",
     "plotly==6.7.0",

From cbf517abbc372bfc388f8af2ea1bdc6e5c69f194 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 22:36:30 +0200
Subject: [PATCH 48/68] benchmarks: sweep --smoke for cross-version sanity
 checks
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a ``--smoke`` flag to ``sweep`` that runs the same pytest invocation
as the top-level ``smoke`` command in each per-version venv: every
model/phase fires once at the quickest size, no timings, ~10–20 s per
version. Useful before bumping a perf-sensitive pin like ``numpy`` to
confirm every linopy version we'd sweep against still installs, imports,
and exercises the suite cleanly.

Surfaces real binary-compat issues (e.g. a ``netcdf4`` wheel mismatched
against the pinned ``numpy``) that declared-constraint resolution can't
catch on its own.

The smoke pytest args are now a shared ``_SMOKE_PYTEST_ARGS`` constant
so the top-level command and ``sweep --smoke`` stay in sync — single
source for the definition of "smoke."

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli.py | 57 +++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 53 insertions(+), 4 deletions(-)

diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index bed4ee14..c7a74cf3 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -75,6 +75,11 @@ def _benchmarks_extra_pins() -> list[str]:
     "solver_handoff": "benchmarks/test_solver_handoff.py",
 }
 
+# pytest args that constitute a "smoke" run — quick sizes, no timings.
+# Shared between the top-level ``smoke`` command and ``sweep --smoke`` so
+# bumping the definition stays single-source.
+_SMOKE_PYTEST_ARGS = ["benchmarks/", "--quick", "--benchmark-disable", "-q"]
+
 
 # --- Introspection commands ------------------------------------------------
 
@@ -206,8 +211,7 @@ def smoke(ctx: typer.Context) -> None:
 
         python -m benchmarks smoke -k basic --tb=short
     """
-    args = ["benchmarks/", "--quick", "--benchmark-disable", "-q", *ctx.args]
-    _run_pytest(args)
+    _run_pytest([*_SMOKE_PYTEST_ARGS, *ctx.args])
 
 
 @app.command(
@@ -434,6 +438,20 @@ def sweep(
             ),
         ),
     ] = None,
+    smoke: Annotated[
+        bool,
+        typer.Option(
+            "--smoke",
+            help=(
+                "Run the smoke suite in each version's venv instead of the "
+                "full timing run. Same pytest invocation as the top-level "
+                "``smoke`` command — every model/phase fires once at the "
+                "quickest size, no timings, ~20 s per version. Useful before "
+                "bumping a perf-sensitive pin to check the combination is "
+                "viable across every linopy version you'd sweep against."
+            ),
+        ),
+    ] = False,
 ) -> None:
     """
     Run the benchmark suite against several linopy versions.
@@ -472,6 +490,15 @@ def sweep(
         )
         raise typer.Exit(code=2)
 
+    if smoke and (long or rounds is not None):
+        typer.secho(
+            "--smoke can't be combined with --long or --rounds "
+            "(no timings are recorded in smoke mode).",
+            fg=typer.colors.RED,
+            err=True,
+        )
+        raise typer.Exit(code=2)
+
     if shutil.which("uv") is None:
         typer.secho(
             "uv not found on PATH — install via https://docs.astral.sh/uv/",
@@ -481,7 +508,8 @@ def sweep(
         raise typer.Exit(code=2)
 
     repo_root = Path.cwd()
-    output_dir.mkdir(parents=True, exist_ok=True)
+    if not smoke:
+        output_dir.mkdir(parents=True, exist_ok=True)
 
     failed: list[str] = []
     for version in versions:
@@ -528,10 +556,31 @@ def sweep(
             # 3. Run the benchmarks. PYTHONPATH makes ``import benchmarks``
             #    resolve against the local checkout — the venv only needs to
             #    provide linopy + the test infra.
-            snapshot = (output_dir / f"linopy-{version}.json").resolve()
             env = os.environ.copy()
             env["PYTHONPATH"] = str(repo_root)
 
+            if smoke:
+                # Smoke mode: reuse the same pytest args as the top-level
+                # ``smoke`` command. No JSON snapshot, return code is the
+                # signal.
+                pytest_cmd = [str(vpy), "-m", "pytest", *_SMOKE_PYTEST_ARGS]
+                k_parts = [p for p in (model, filter_expr) if p]
+                if k_parts:
+                    pytest_cmd.extend(["-k", " and ".join(k_parts)])
+                pytest_cmd.extend(ctx.args)
+
+                typer.secho(f"$ {' '.join(pytest_cmd)}", fg=typer.colors.BRIGHT_BLACK)
+                r = subprocess.run(pytest_cmd, env=env, check=False)
+                if r.returncode != 0:
+                    typer.secho(
+                        f"smoke failed: {version}", fg=typer.colors.RED, err=True
+                    )
+                    failed.append(version)
+                else:
+                    typer.secho(f"smoke ok: {version}", fg=typer.colors.GREEN)
+                continue
+
+            snapshot = (output_dir / f"linopy-{version}.json").resolve()
             test_target = (
                 _PHASE_TEST_FILE[phase] if phase is not None else "benchmarks/"
             )

From 4ba6fb41607f002f93528164fa971afc5a643417 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 22:39:53 +0200
Subject: [PATCH 49/68] benchmarks: small cleanups (dead __iter__, naming,
 stale comments)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Delete the module-level ``__iter__`` in registry.py — module-level
  ``__iter__`` is uncallable (you can't ``for x in module``), so it was
  dead code. Drops the unused ``Iterator`` import too.
- Rename ``memory.DEFAULT_PHASES`` → ``MEMORY_PHASES``. It collided with
  ``registry.DEFAULT_PHASES`` (frozenset of 9 phase tags) — same name,
  different shape, both imported elsewhere. Footgun for the next reader.
- Rewrite the stale comment above the ``[benchmarks]`` pyproject extra.
  It said "Not pinned here: numpy / scipy / pandas / xarray" directly
  above the lines pinning exactly those four, and referenced the
  ``requirements.lock`` we just deleted. Replace with the actual story:
  every measurement-relevant direct dep is pinned; sweep installs the
  same set into each per-version venv.
- Rename the CI smoke step from "Execute registry-usage notebook" to
  "Execute walkthrough notebook" to match the file the ``notebook``
  command now executes.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/benchmark-smoke.yml |  6 +++---
 benchmarks/cli.py                     | 12 ++++++------
 benchmarks/memory.py                  |  4 ++--
 benchmarks/registry.py                |  6 +-----
 pyproject.toml                        | 18 +++++++++---------
 5 files changed, 21 insertions(+), 25 deletions(-)

diff --git a/.github/workflows/benchmark-smoke.yml b/.github/workflows/benchmark-smoke.yml
index e6ea6c65..59f6462f 100644
--- a/.github/workflows/benchmark-smoke.yml
+++ b/.github/workflows/benchmark-smoke.yml
@@ -40,9 +40,9 @@ jobs:
       run: |
         python -m benchmarks smoke
 
-    - name: Execute registry-usage notebook
-      # Catches doc rot — the notebook is the canonical "how to use the
-      # registry" walkthrough and must stay runnable end-to-end.
+    - name: Execute walkthrough notebook
+      # Catches doc rot — walkthrough.md is the canonical CLI walkthrough
+      # for the suite and must stay runnable end-to-end.
       run: |
         python -m benchmarks notebook
 
diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index c7a74cf3..a1de558a 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -939,13 +939,13 @@ def memory_save_cmd(
     pytest-style test IDs so ``compare`` diffs cleanly across runs that
     selected different subsets.
     """
-    from benchmarks.memory import DEFAULT_PHASES
+    from benchmarks.memory import MEMORY_PHASES
 
     if phase:
-        unknown = [p for p in phase if p not in DEFAULT_PHASES]
+        unknown = [p for p in phase if p not in MEMORY_PHASES]
         if unknown:
             typer.secho(
-                f"unknown phase(s): {unknown}; valid options: {list(DEFAULT_PHASES)}",
+                f"unknown phase(s): {unknown}; valid options: {list(MEMORY_PHASES)}",
                 fg=typer.colors.RED,
                 err=True,
             )
@@ -1001,13 +1001,13 @@ def memory_sweep_cmd(
     ``--repeats 1`` (default) is usually plenty. Use ``--repeats 3``
     if you need <5%% regression detection.
     """
-    from benchmarks.memory import DEFAULT_PHASES
+    from benchmarks.memory import MEMORY_PHASES
 
     if phase:
-        unknown = [p for p in phase if p not in DEFAULT_PHASES]
+        unknown = [p for p in phase if p not in MEMORY_PHASES]
         if unknown:
             typer.secho(
-                f"unknown phase(s): {unknown}; valid options: {list(DEFAULT_PHASES)}",
+                f"unknown phase(s): {unknown}; valid options: {list(MEMORY_PHASES)}",
                 fg=typer.colors.RED,
                 err=True,
             )
diff --git a/benchmarks/memory.py b/benchmarks/memory.py
index f2ad3329..942806ff 100644
--- a/benchmarks/memory.py
+++ b/benchmarks/memory.py
@@ -44,7 +44,7 @@
     from benchmarks.registry import ModelSpec
 
 RESULTS_DIR = Path(".benchmarks/memory")
-DEFAULT_PHASES: tuple[str, ...] = (
+MEMORY_PHASES: tuple[str, ...] = (
     "build",
     "matrices",
     "lp_write",
@@ -268,7 +268,7 @@ def save(
     measurement; ``memray.Tracker`` only counts what's allocated inside its
     ``with`` block, but the subprocess boundary makes the isolation total.
     """
-    phases = list(phases) if phases else list(DEFAULT_PHASES)
+    phases = list(phases) if phases else list(MEMORY_PHASES)
 
     all_results: dict[str, float] = {}
     for phase in phases:
diff --git a/benchmarks/registry.py b/benchmarks/registry.py
index dbee2281..41543112 100644
--- a/benchmarks/registry.py
+++ b/benchmarks/registry.py
@@ -23,7 +23,7 @@
 
 from __future__ import annotations
 
-from collections.abc import Callable, Iterator
+from collections.abc import Callable
 from dataclasses import dataclass
 
 import linopy
@@ -198,7 +198,3 @@ def iter_params(phase: str) -> list[tuple[ModelSpec, int]]:
 
 def param_ids(params: list[tuple[ModelSpec, int]]) -> list[str]:
     return [f"{spec.name}-n={size}" for spec, size in params]
-
-
-def __iter__() -> Iterator[ModelSpec]:  # pragma: no cover - convenience
-    return iter(REGISTRY.values())
diff --git a/pyproject.toml b/pyproject.toml
index 9b67a22d..0bb0128c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -82,16 +82,16 @@ dev = [
     "highspy",
     "jupyter",
 ]
-# Test infrastructure pinned exactly so the measurement environment stays
-# stable over time on the same machine — deltas between two runs then
-# reflect linopy changes, not a pytest or pandas upgrade. Absolute numbers
-# are still machine-dependent (CPU / cache / memory bandwidth).
+# Every direct dep that affects measurement is pinned exactly so the
+# environment stays stable over time on the same machine — deltas
+# between two runs then reflect linopy changes, not a numpy/scipy/pytest
+# upgrade. Absolute numbers are still machine-dependent (CPU / cache /
+# memory bandwidth).
 #
-# Not pinned here: numpy / scipy / pandas / xarray. They also affect
-# measurements, but the full transitive set lives in
-# ``benchmarks/requirements.lock`` (regen via ``uv pip compile``). The
-# lockfile excludes linopy itself so ``sweep`` can install any linopy
-# version on top of a stable environment.
+# ``sweep`` installs these into each per-version venv, so the same pin
+# set drives every linopy version in a sweep call — only ``linopy``
+# varies. Transitive deps resolve fresh per venv; uv's deterministic
+# resolution gives identical results across versions within one sweep.
 #
 # ``highspy`` follows the project-wide ``!=1.14.0`` exclusion (see the
 # ``solvers`` extra).

From d86b111b9938b6e1c020d220b7346bf4090c6652 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 22:41:20 +0200
Subject: [PATCH 50/68] benchmarks: delete unused SOLVER_BUILD phase + collapse
 models re-exports
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two related dead-surface trims flagged in code review:

- ``SOLVER_BUILD`` was an aspirational phase tag ("generic
  Solver.from_name(..., io_api='direct')") that was declared, exported,
  listed in ``DEFAULT_PHASES``, and added to ``sos``'s phase set — but
  never wired into any test. ``test_solver_handoff.py`` only exercises
  ``TO_HIGHSPY``/``GUROBIPY``/``MOSEK``/``XPRESS``. Remove it from
  registry.py, ``__init__.py`` exports, and the sos spec. If we ever
  want the generic ``from_name(..., io_api='direct')`` path measured,
  it can come back as a real phase with a real test.

- ``benchmarks/models/__init__.py`` was re-exporting 20 names
  (``BASIC_SIZES``, ``build_basic``, …) that nothing outside the
  ``models`` package referenced. The documented access path is
  ``REGISTRY["<name>"]``; the only thing ``__init__.py`` needs to do
  is trigger each submodule's ``register(...)`` side-effect. Collapse
  to a single ``from benchmarks.models import basic, …`` import block.
  Adding a new model is now one new file plus one line in this block.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/__init__.py        |  2 --
 benchmarks/models/__init__.py | 65 ++++++++++-------------------------
 benchmarks/models/sos.py      |  2 --
 benchmarks/registry.py        |  3 --
 4 files changed, 19 insertions(+), 53 deletions(-)

diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
index 9c709d73..5e181b98 100644
--- a/benchmarks/__init__.py
+++ b/benchmarks/__init__.py
@@ -57,7 +57,6 @@ def load_long_df(snapshots, metric="min"):
     PIECEWISE,
     QUADRATIC,
     REGISTRY,
-    SOLVER_BUILD,
     SOS,
     TO_GUROBIPY,
     TO_HIGHSPY,
@@ -87,7 +86,6 @@ def load_long_df(snapshots, metric="min"):
     "PIECEWISE",
     "QUADRATIC",
     "REGISTRY",
-    "SOLVER_BUILD",
     "SOS",
     "TO_GUROBIPY",
     "TO_HIGHSPY",
diff --git a/benchmarks/models/__init__.py b/benchmarks/models/__init__.py
index 8e5b9ca2..a471b216 100644
--- a/benchmarks/models/__init__.py
+++ b/benchmarks/models/__init__.py
@@ -1,51 +1,24 @@
 """
 Model builders for benchmarks.
 
-Importing this package registers every model in :data:`benchmarks.registry.REGISTRY`.
-Each module exposes a ``build_<name>(size) -> linopy.Model`` callable and a
-module-level ``SPEC`` :class:`~benchmarks.registry.ModelSpec`.
+Importing this package triggers every submodule's ``register(...)`` call,
+populating :data:`benchmarks.registry.REGISTRY`. Each submodule exposes a
+``build_<name>(size) -> linopy.Model`` callable and a module-level ``SPEC``
+:class:`~benchmarks.registry.ModelSpec`. The documented access path is
+``REGISTRY["<name>"]``; submodule re-exports are intentionally not exposed
+here so that adding a new model is one new file plus one import below.
 """
 
-from benchmarks.models.basic import SIZES as BASIC_SIZES
-from benchmarks.models.basic import build_basic
-from benchmarks.models.expression_arithmetic import SIZES as EXPR_SIZES
-from benchmarks.models.expression_arithmetic import build_expression_arithmetic
-from benchmarks.models.knapsack import SIZES as KNAPSACK_SIZES
-from benchmarks.models.knapsack import build_knapsack
-from benchmarks.models.masked import SIZES as MASKED_SIZES
-from benchmarks.models.masked import build_masked
-from benchmarks.models.milp import SIZES as MILP_SIZES
-from benchmarks.models.milp import build_milp
-from benchmarks.models.piecewise import SIZES as PIECEWISE_SIZES
-from benchmarks.models.piecewise import build_piecewise
-from benchmarks.models.pypsa_scigrid import SIZES as PYPSA_SIZES
-from benchmarks.models.pypsa_scigrid import build_pypsa_scigrid
-from benchmarks.models.qp import SIZES as QP_SIZES
-from benchmarks.models.qp import build_qp
-from benchmarks.models.sos import SIZES as SOS_SIZES
-from benchmarks.models.sos import build_sos
-from benchmarks.models.sparse_network import SIZES as SPARSE_SIZES
-from benchmarks.models.sparse_network import build_sparse_network
-
-__all__ = [
-    "BASIC_SIZES",
-    "EXPR_SIZES",
-    "KNAPSACK_SIZES",
-    "MASKED_SIZES",
-    "MILP_SIZES",
-    "PIECEWISE_SIZES",
-    "PYPSA_SIZES",
-    "QP_SIZES",
-    "SOS_SIZES",
-    "SPARSE_SIZES",
-    "build_basic",
-    "build_expression_arithmetic",
-    "build_knapsack",
-    "build_masked",
-    "build_milp",
-    "build_piecewise",
-    "build_pypsa_scigrid",
-    "build_qp",
-    "build_sos",
-    "build_sparse_network",
-]
+# Side-effect imports — each module calls ``register(...)`` at import time.
+from benchmarks.models import (  # noqa: F401
+    basic,
+    expression_arithmetic,
+    knapsack,
+    masked,
+    milp,
+    piecewise,
+    pypsa_scigrid,
+    qp,
+    sos,
+    sparse_network,
+)
diff --git a/benchmarks/models/sos.py b/benchmarks/models/sos.py
index c5ac1d36..06ac22d4 100644
--- a/benchmarks/models/sos.py
+++ b/benchmarks/models/sos.py
@@ -31,7 +31,6 @@
     LP_WRITE,
     MATRICES,
     NETCDF,
-    SOLVER_BUILD,
     SOS,
     TO_GUROBIPY,
     TO_XPRESS,
@@ -87,7 +86,6 @@ def build_sos(n_gens: int) -> linopy.Model:
                     MATRICES,
                     LP_WRITE,
                     NETCDF,
-                    SOLVER_BUILD,
                     TO_GUROBIPY,
                     TO_XPRESS,
                 }
diff --git a/benchmarks/registry.py b/benchmarks/registry.py
index 41543112..b7735670 100644
--- a/benchmarks/registry.py
+++ b/benchmarks/registry.py
@@ -48,7 +48,6 @@
 MATRICES = "matrices"
 LP_WRITE = "lp_write"
 NETCDF = "netcdf"
-SOLVER_BUILD = "solver_build"  # generic Solver.from_name(..., io_api="direct")
 TO_HIGHSPY = "to_highspy"
 TO_GUROBIPY = "to_gurobipy"
 TO_MOSEK = "to_mosek"
@@ -60,7 +59,6 @@
         MATRICES,
         LP_WRITE,
         NETCDF,
-        SOLVER_BUILD,
         TO_HIGHSPY,
         TO_GUROBIPY,
         TO_MOSEK,
@@ -77,7 +75,6 @@
         MATRICES,
         LP_WRITE,
         NETCDF,
-        SOLVER_BUILD,
         TO_HIGHSPY,
         TO_GUROBIPY,
         TO_MOSEK,

From b153239a49459de458bbad22cc7b3a38c0f60a4a Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 22:46:03 +0200
Subject: [PATCH 51/68] benchmarks: share phase verbs via benchmarks/phases.py
 + guard the seam
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Extract the *what each phase does to a model* logic from the pytest test
files and ``memory.py`` into a single ``benchmarks/phases.py`` module.
Both drivers now import the same verbs — pytest wraps them in
``benchmark(...)``, memray wraps them in ``Tracker(...)`` — so the
measured operation is defined once.

The drift risk this targets is silent: if ``test_matrices`` grows a new
matrix attribute and ``memory.py``'s inline copy doesn't, the
timing/memory snapshots end up measuring different operations and
``plot`` shows non-overlapping sets, no error. Likewise the solver list
(currently 4 wrappers) was duplicated between ``test_solver_handoff.py``
and a hardcoded ``to_highspy`` in memory.

Touchpoints:

- ``benchmarks/phases.py`` (new): ``touch_matrices``, ``write_lp``
  (with ``progress=False`` pinned here), ``write_netcdf``,
  ``read_netcdf`` re-export, and a ``SOLVER_HANDOFFS`` tuple of
  ``(solver_name, registry_phase_tag, wrapper)``.
- ``test_matrices.py``: drops the local ``_access_matrices``.
- ``test_lp_write.py``: uses ``write_lp`` (pin lives in one place now).
- ``test_netcdf.py``: uses ``write_netcdf`` + ``read_netcdf`` from
  phases.
- ``test_solver_handoff.py``: ``_SOLVER_PHASES`` becomes
  ``SOLVER_HANDOFFS`` from phases; ``_make_params`` loop unchanged.
- ``memory.py``: inline matrices/lp_write/netcdf bodies replaced with
  ``touch_matrices`` / ``write_lp`` / ``write_netcdf`` / ``read_netcdf``
  from phases. The solver-handoff branch looks up ``"highs"`` by name
  in ``SOLVER_HANDOFFS`` rather than by position (``[0]``), so
  reordering the tuple cannot silently swap which solver gets measured.

The id seam — memory.py's hand-rolled ``f"...::test_X[name-n=size]"``
strings vs pytest's collected node ids — is intentionally not
abstracted (the netcdf double-emit and the ``highs-`` solver prefix make
a shared id generator more framework than it's worth). Instead, a new
``benchmarks/test_memory_id_alignment.py`` exercises both sides for one
cheap spec and asserts every memory-emitted id is in pytest's
collection. A test rename now fails this guard immediately.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/memory.py                   | 43 +++++----------
 benchmarks/phases.py                   | 68 ++++++++++++++++++++++++
 benchmarks/test_lp_write.py            |  3 +-
 benchmarks/test_matrices.py            | 18 +------
 benchmarks/test_memory_id_alignment.py | 72 ++++++++++++++++++++++++++
 benchmarks/test_netcdf.py              |  6 +--
 benchmarks/test_solver_handoff.py      | 20 ++-----
 7 files changed, 164 insertions(+), 66 deletions(-)
 create mode 100644 benchmarks/phases.py
 create mode 100644 benchmarks/test_memory_id_alignment.py

diff --git a/benchmarks/memory.py b/benchmarks/memory.py
index 942806ff..559eaa37 100644
--- a/benchmarks/memory.py
+++ b/benchmarks/memory.py
@@ -131,73 +131,58 @@ def _measurements(
     m = spec.build(size)
 
     if phase == "matrices":
-
-        def access() -> None:
-            mats = m.matrices
-            for attr in ("A", "b", "c", "lb", "ub", "sense", "vlabels", "clabels"):
-                getattr(mats, attr)
-            if m.is_quadratic:
-                mats.Q
+        from benchmarks.phases import touch_matrices
 
         yield (
             f"benchmarks/test_matrices.py::test_matrices[{name}-n={size}]",
-            access,
+            lambda: touch_matrices(m),
         )
 
     elif phase == "lp_write":
-        # ``to_file`` writes to disk; use a tempdir so we don't leak.
+        from benchmarks.phases import write_lp
+
         tmpdir = tempfile.TemporaryDirectory()
         lp_path = Path(tmpdir.name) / "m.lp"
-
-        def write_lp() -> None:
-            m.to_file(lp_path, progress=False)
-
         try:
             yield (
                 f"benchmarks/test_lp_write.py::test_lp_write[{name}-n={size}]",
-                write_lp,
+                lambda: write_lp(m, lp_path),
             )
         finally:
             tmpdir.cleanup()
 
     elif phase == "netcdf":
-        from linopy import read_netcdf
+        from benchmarks.phases import read_netcdf, write_netcdf
 
         tmpdir = tempfile.TemporaryDirectory()
         nc_path = Path(tmpdir.name) / "m.nc"
-
-        def write_nc() -> None:
-            m.to_netcdf(nc_path)
-
-        def read_nc() -> None:
-            read_netcdf(nc_path)
-
         try:
             yield (
                 f"benchmarks/test_netcdf.py::test_netcdf_write[{name}-n={size}]",
-                write_nc,
+                lambda: write_netcdf(m, nc_path),
             )
-            # ``write_nc`` was called by the caller as part of the
+            # ``write_netcdf`` was called by the caller as part of the
             # measurement, so ``nc_path`` now exists for the read.
             yield (
                 f"benchmarks/test_netcdf.py::test_netcdf_read[{name}-n={size}]",
-                read_nc,
+                lambda: read_netcdf(nc_path),
             )
         finally:
             tmpdir.cleanup()
 
     elif phase == "solver_handoff":
-        from linopy.io import to_highspy
+        from benchmarks.phases import SOLVER_HANDOFFS
 
-        def handoff() -> None:
-            to_highspy(m)
+        # Memory currently tracks only HiGHS — look it up by name so a
+        # reordering of SOLVER_HANDOFFS doesn't silently swap solvers.
+        highs = next(w for n, _, w in SOLVER_HANDOFFS if n == "highs")
 
         yield (
             (
                 f"benchmarks/test_solver_handoff.py::test_solver_handoff"
                 f"[highs-{name}-n={size}]"
             ),
-            handoff,
+            lambda: highs(m),
         )
 
     else:
diff --git a/benchmarks/phases.py b/benchmarks/phases.py
new file mode 100644
index 00000000..0761f49d
--- /dev/null
+++ b/benchmarks/phases.py
@@ -0,0 +1,68 @@
+"""
+Single source of truth for *what each benchmark phase does to a model*.
+
+Both drivers import these verbs:
+
+- the pytest ``test_<phase>.py`` files wrap them in ``benchmark(...)``;
+- ``memory.py`` wraps them in ``memray.Tracker(...)``.
+
+So the measured operation is defined once. Setup — building the model,
+creating scratch files — stays in the caller; only the verb itself
+lives here.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+from pathlib import Path
+
+import linopy
+import linopy.io as lio
+from benchmarks.registry import TO_GUROBIPY, TO_HIGHSPY, TO_MOSEK, TO_XPRESS
+from linopy import read_netcdf
+
+# Re-export so callers can ``from benchmarks.phases import read_netcdf``
+# alongside the wrappers.
+__all__ = [
+    "SOLVER_HANDOFFS",
+    "read_netcdf",
+    "touch_matrices",
+    "write_lp",
+    "write_netcdf",
+]
+
+
+def touch_matrices(m: linopy.Model) -> None:
+    """Force every matrix block to materialise — the thing we measure."""
+    mats = m.matrices
+    for attr in ("A", "b", "c", "lb", "ub", "sense", "vlabels", "clabels"):
+        getattr(mats, attr)
+    if m.is_quadratic:
+        mats.Q
+
+
+def write_lp(m: linopy.Model, path: Path) -> None:
+    """
+    Write the model as an LP file.
+
+    ``progress=False`` is pinned here so the benchmark stays uniform
+    across drivers — the progress bar's overhead would otherwise leak
+    into the measurement.
+    """
+    m.to_file(path, progress=False)
+
+
+def write_netcdf(m: linopy.Model, path: Path) -> None:
+    m.to_netcdf(path)
+
+
+# (solver_name, registry phase tag, wrapper) — consumed by the pytest
+# parametrization in ``test_solver_handoff.py`` and by ``memory.py``,
+# which looks up only the "highs" entry. Adding a solver here extends
+# the pytest parametrization; memory tracking stays HiGHS-only.
+SOLVER_HANDOFFS: tuple[tuple[str, str, Callable[[linopy.Model], object]], ...] = (
+    ("highs", TO_HIGHSPY, lio.to_highspy),
+    ("gurobi", TO_GUROBIPY, lio.to_gurobipy),
+    ("mosek", TO_MOSEK, lio.to_mosek),
+    ("xpress", TO_XPRESS, lio.to_xpress),
+)
diff --git a/benchmarks/test_lp_write.py b/benchmarks/test_lp_write.py
index ea3e04d7..2dec144d 100644
--- a/benchmarks/test_lp_write.py
+++ b/benchmarks/test_lp_write.py
@@ -5,6 +5,7 @@
 import pytest
 
 from benchmarks.conftest import maybe_skip
+from benchmarks.phases import write_lp
 from benchmarks.registry import LP_WRITE, iter_params, param_ids
 
 _PARAMS = iter_params(LP_WRITE)
@@ -15,4 +16,4 @@ def test_lp_write(benchmark, spec, size, request, tmp_path):
     maybe_skip(request, spec, size)
     m = spec.build(size)
     lp_file = tmp_path / "model.lp"
-    benchmark(m.to_file, lp_file, progress=False)
+    benchmark(write_lp, m, lp_file)
diff --git a/benchmarks/test_matrices.py b/benchmarks/test_matrices.py
index bd36a467..019f6dd8 100644
--- a/benchmarks/test_matrices.py
+++ b/benchmarks/test_matrices.py
@@ -5,28 +5,14 @@
 import pytest
 
 from benchmarks.conftest import maybe_skip
+from benchmarks.phases import touch_matrices
 from benchmarks.registry import MATRICES, iter_params, param_ids
 
 _PARAMS = iter_params(MATRICES)
 
 
-def _access_matrices(m):
-    """Touch every matrix property to force computation."""
-    matrices = m.matrices
-    _ = matrices.A
-    _ = matrices.b
-    _ = matrices.c
-    _ = matrices.lb
-    _ = matrices.ub
-    _ = matrices.sense
-    _ = matrices.vlabels
-    _ = matrices.clabels
-    if m.is_quadratic:
-        _ = matrices.Q  # exercise the QP path when present
-
-
 @pytest.mark.parametrize("spec,size", _PARAMS, ids=param_ids(_PARAMS))
 def test_matrices(benchmark, spec, size, request):
     maybe_skip(request, spec, size)
     m = spec.build(size)
-    benchmark(_access_matrices, m)
+    benchmark(touch_matrices, m)
diff --git a/benchmarks/test_memory_id_alignment.py b/benchmarks/test_memory_id_alignment.py
new file mode 100644
index 00000000..8478f174
--- /dev/null
+++ b/benchmarks/test_memory_id_alignment.py
@@ -0,0 +1,72 @@
+"""
+Guard test for the timing ↔ memory test-id seam.
+
+``memory.py`` hand-rolls f-strings to label each measurement with the
+same node id pytest-benchmark produces (e.g.
+``benchmarks/test_matrices.py::test_matrices[basic-n=10]``). If a
+benchmark test function gets renamed and the matching f-string in
+``memory.py`` isn't updated, ``plot`` would silently end up with
+non-overlapping timing and memory sets — no error, just missing data.
+
+This test exercises both sides once and asserts every memory-emitted
+id is present in pytest's collection.
+"""
+
+from __future__ import annotations
+
+import re
+import subprocess
+import sys
+from pathlib import Path
+
+from benchmarks.memory import MEMORY_PHASES, _measurements
+from benchmarks.registry import REGISTRY
+
+
+def _collect_benchmark_ids() -> set[str]:
+    """Return the set of node ids pytest collects under ``benchmarks/``."""
+    repo_root = Path(__file__).resolve().parents[1]
+    result = subprocess.run(
+        [
+            sys.executable,
+            "-m",
+            "pytest",
+            "benchmarks/",
+            "--collect-only",
+            "-q",
+            "--no-header",
+            "--co",
+        ],
+        capture_output=True,
+        text=True,
+        check=True,
+        cwd=repo_root,
+    )
+    # pytest -q --co emits one node id per line; trailing summary lines
+    # like "N tests collected" can be ignored.
+    return {
+        line.strip()
+        for line in result.stdout.splitlines()
+        if re.match(r"^benchmarks/.*::.*\[.*\]$", line.strip())
+    }
+
+
+def test_memory_node_ids_match_pytest_collection() -> None:
+    collected = _collect_benchmark_ids()
+    assert collected, "pytest collected zero benchmark node ids — sanity broken"
+
+    # ``basic`` at its smallest size is cheap and declares every default
+    # phase, so it exercises every node-id format ``_measurements`` emits.
+    spec = REGISTRY["basic"]
+    size = spec.sizes[0]
+
+    mem_ids: set[str] = set()
+    for phase in MEMORY_PHASES:
+        for test_id, _ in _measurements(phase, spec, size):
+            mem_ids.add(test_id)
+
+    missing = mem_ids - collected
+    assert not missing, (
+        "memory.py emits node ids that pytest doesn't collect "
+        "(test rename drift?):\n" + "\n".join(f"  {m}" for m in sorted(missing))
+    )
diff --git a/benchmarks/test_netcdf.py b/benchmarks/test_netcdf.py
index f26ae0fc..7b02c2bf 100644
--- a/benchmarks/test_netcdf.py
+++ b/benchmarks/test_netcdf.py
@@ -11,8 +11,8 @@
 import pytest
 
 from benchmarks.conftest import maybe_skip
+from benchmarks.phases import read_netcdf, write_netcdf
 from benchmarks.registry import NETCDF, iter_params, param_ids
-from linopy import read_netcdf
 
 _PARAMS = iter_params(NETCDF)
 
@@ -22,7 +22,7 @@ def test_netcdf_write(benchmark, spec, size, request, tmp_path):
     maybe_skip(request, spec, size)
     m = spec.build(size)
     out = tmp_path / "model.nc"
-    benchmark(m.to_netcdf, out)
+    benchmark(write_netcdf, m, out)
 
 
 @pytest.mark.parametrize("spec,size", _PARAMS, ids=param_ids(_PARAMS))
@@ -30,5 +30,5 @@ def test_netcdf_read(benchmark, spec, size, request, tmp_path):
     maybe_skip(request, spec, size)
     m = spec.build(size)
     out = tmp_path / "model.nc"
-    m.to_netcdf(out)
+    write_netcdf(m, out)
     benchmark(read_netcdf, out)
diff --git a/benchmarks/test_solver_handoff.py b/benchmarks/test_solver_handoff.py
index c7d649fe..c3432b67 100644
--- a/benchmarks/test_solver_handoff.py
+++ b/benchmarks/test_solver_handoff.py
@@ -16,29 +16,15 @@
 
 import pytest
 
-import linopy.io as lio
 from benchmarks.conftest import maybe_skip
-from benchmarks.registry import (
-    TO_GUROBIPY,
-    TO_HIGHSPY,
-    TO_MOSEK,
-    TO_XPRESS,
-    iter_params,
-)
+from benchmarks.phases import SOLVER_HANDOFFS
+from benchmarks.registry import iter_params
 from linopy.solvers import available_solvers
 
-# (solver_name, phase tag, wrapper function)
-_SOLVER_PHASES = [
-    ("highs", TO_HIGHSPY, lio.to_highspy),
-    ("gurobi", TO_GUROBIPY, lio.to_gurobipy),
-    ("mosek", TO_MOSEK, lio.to_mosek),
-    ("xpress", TO_XPRESS, lio.to_xpress),
-]
-
 
 def _make_params():
     out = []
-    for solver_name, phase, wrapper in _SOLVER_PHASES:
+    for solver_name, phase, wrapper in SOLVER_HANDOFFS:
         for spec, size in iter_params(phase):
             out.append(
                 pytest.param(

From 754e0ec49de83d87e82d6a1a073e6f01b0cc520b Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Thu, 28 May 2026 22:50:02 +0200
Subject: [PATCH 52/68] benchmarks: extract _provision_venvs helper to dedupe
 sweep plumbing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

``sweep`` and ``memory sweep`` had ~80 lines of near-identical per-version
plumbing: uv availability check, ``=== linopy X ===`` banner, tempdir +
uv venv creation, install pass with the same args, ``PYTHONPATH`` setup,
and parallel failure-reason printing. Two copies that would drift the
moment one side gained anything (a wheel-cache flag, a constraint pin, a
different stderr handler).

Extract a ``_provision_venvs(versions, tmp_prefix)`` generator that
yields one ``_ProvisionedVenv`` record per version. On success, the
record carries ``python`` + ``env``; on failure, ``failed_at`` names the
step that broke ("venv" or "install") and the caller skips its
per-version action. Each tempdir cleanup happens when the generator
advances, so ``break``-ing out of the caller's loop still tears down
cleanly via the generator close protocol.

After the extraction:

- ``sweep`` shrinks from ~135 lines of venv plumbing + action to ~70
  lines of just the action (smoke pytest invocation vs full
  ``--benchmark-only`` invocation + snapshot check).
- ``memory sweep`` shrinks similarly — only the ``memory save``
  invocation and the snapshot-relocation bookkeeping remain.
- Future sweep flavours get the venv plumbing for free.

No user-facing behaviour change; the failure messages and the banner
output are identical.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli.py | 382 +++++++++++++++++++++++-----------------------
 1 file changed, 188 insertions(+), 194 deletions(-)

diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index a1de558a..1acd476f 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -17,6 +17,8 @@
 import subprocess
 import sys
 import tempfile
+from collections.abc import Iterator
+from dataclasses import dataclass
 from pathlib import Path
 from typing import Annotated, Literal
 
@@ -391,6 +393,96 @@ def _venv_python(venv: Path) -> Path:
     )
 
 
+@dataclass(frozen=True)
+class _ProvisionedVenv:
+    """
+    One fresh per-version venv from :func:`_provision_venvs`.
+
+    On success, ``python`` and ``env`` are populated and ``failed_at``
+    is ``None``. On failure, ``failed_at`` names the step that failed
+    (``"venv"`` or ``"install"``); the caller skips its per-version
+    action and records the failure.
+    """
+
+    version: str
+    python: Path | None
+    env: dict[str, str] | None
+    failed_at: str | None
+
+
+def _provision_venvs(
+    versions: list[str], tmp_prefix: str
+) -> Iterator[_ProvisionedVenv]:
+    """
+    Yield one fresh per-version uv venv for each linopy version.
+
+    Used by both ``sweep`` and ``memory sweep`` so the venv plumbing
+    (uv venv → install ``[benchmarks]`` pins + the target linopy →
+    set ``PYTHONPATH``) lives in one place. The caller supplies the
+    tempdir prefix (so ``ps``/``lsof`` can distinguish concurrent
+    runs) and does whatever per-version action it needs.
+
+    Each version's tempdir is cleaned up when the generator advances
+    (or exits). If the caller breaks out early, the ``with`` teardown
+    runs once the abandoned generator is closed or garbage-collected.
+    """
+    if shutil.which("uv") is None:
+        typer.secho(
+            "uv not found on PATH — install via https://docs.astral.sh/uv/",
+            fg=typer.colors.RED,
+            err=True,
+        )
+        raise typer.Exit(code=2)
+
+    repo_root = Path.cwd()
+    for version in versions:
+        typer.secho(f"\n=== linopy {version} ===", fg=typer.colors.CYAN, bold=True)
+        with tempfile.TemporaryDirectory(prefix=tmp_prefix) as tmp:
+            venv = Path(tmp) / "venv"
+
+            r = subprocess.run(
+                ["uv", "venv", "--python", sys.executable, str(venv)],
+                check=False,
+            )
+            if r.returncode != 0:
+                typer.secho(
+                    f"venv creation failed: {version}",
+                    fg=typer.colors.RED,
+                    err=True,
+                )
+                yield _ProvisionedVenv(version, None, None, "venv")
+                continue
+
+            vpy = _venv_python(venv)
+            spec = _linopy_install_spec(version)
+
+            # Single install pass: pinned infra from pyproject + linopy.
+            # Direct pins in [benchmarks] are sufficient for sweep
+            # reproducibility — uv resolves the same input deterministically
+            # into each per-version venv.
+            install_args = [
+                "uv",
+                "pip",
+                "install",
+                "--python",
+                str(vpy),
+                *_benchmarks_extra_pins(),
+                spec,
+            ]
+            r = subprocess.run(install_args, check=False)
+            if r.returncode != 0:
+                typer.secho(f"install failed: {version}", fg=typer.colors.RED, err=True)
+                yield _ProvisionedVenv(version, None, None, "install")
+                continue
+
+            # PYTHONPATH makes ``import benchmarks`` resolve against the
+            # local checkout — the venv only provides linopy + test infra.
+            env = os.environ.copy()
+            env["PYTHONPATH"] = str(repo_root)
+
+            yield _ProvisionedVenv(version, vpy, env, None)
+
+
 @app.command(
     context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
 )
@@ -499,127 +591,74 @@ def sweep(
         )
         raise typer.Exit(code=2)
 
-    if shutil.which("uv") is None:
-        typer.secho(
-            "uv not found on PATH — install via https://docs.astral.sh/uv/",
-            fg=typer.colors.RED,
-            err=True,
-        )
-        raise typer.Exit(code=2)
-
-    repo_root = Path.cwd()
     if not smoke:
         output_dir.mkdir(parents=True, exist_ok=True)
 
     failed: list[str] = []
-    for version in versions:
-        typer.secho(f"\n=== linopy {version} ===", fg=typer.colors.CYAN, bold=True)
-        with tempfile.TemporaryDirectory(prefix="linopy-bench-") as tmp:
-            venv = Path(tmp) / "venv"
+    for prov in _provision_venvs(versions, "linopy-bench-"):
+        if prov.failed_at:
+            failed.append(prov.version)
+            continue
+
+        if smoke:
+            # Smoke mode: reuse the same pytest args as the top-level
+            # ``smoke`` command. No JSON snapshot, return code is the
+            # signal.
+            pytest_cmd = [str(prov.python), "-m", "pytest", *_SMOKE_PYTEST_ARGS]
+            k_parts = [p for p in (model, filter_expr) if p]
+            if k_parts:
+                pytest_cmd.extend(["-k", " and ".join(k_parts)])
+            pytest_cmd.extend(ctx.args)
 
-            # 1. uv venv — same interpreter that's driving the CLI.
-            r = subprocess.run(
-                ["uv", "venv", "--python", sys.executable, str(venv)],
-                check=False,
-            )
+            typer.secho(f"$ {' '.join(pytest_cmd)}", fg=typer.colors.BRIGHT_BLACK)
+            r = subprocess.run(pytest_cmd, env=prov.env, check=False)
             if r.returncode != 0:
                 typer.secho(
-                    f"venv creation failed: {version}",
-                    fg=typer.colors.RED,
-                    err=True,
+                    f"smoke failed: {prov.version}", fg=typer.colors.RED, err=True
                 )
-                failed.append(version)
-                continue
-
-            vpy = _venv_python(venv)
-            spec = _linopy_install_spec(version)
-
-            # 2. Single install pass: pinned infra (from pyproject) + linopy.
-            #    Direct pins in [benchmarks] are sufficient for sweep
-            #    reproducibility — uv resolves the same input
-            #    deterministically into each per-version venv.
-            install_args = [
-                "uv",
-                "pip",
-                "install",
-                "--python",
-                str(vpy),
-                *_benchmarks_extra_pins(),
-                spec,
-            ]
-            r = subprocess.run(install_args, check=False)
-            if r.returncode != 0:
-                typer.secho(f"install failed: {version}", fg=typer.colors.RED, err=True)
-                failed.append(version)
-                continue
-
-            # 3. Run the benchmarks. PYTHONPATH makes ``import benchmarks``
-            #    resolve against the local checkout — the venv only needs to
-            #    provide linopy + the test infra.
-            env = os.environ.copy()
-            env["PYTHONPATH"] = str(repo_root)
-
-            if smoke:
-                # Smoke mode: reuse the same pytest args as the top-level
-                # ``smoke`` command. No JSON snapshot, return code is the
-                # signal.
-                pytest_cmd = [str(vpy), "-m", "pytest", *_SMOKE_PYTEST_ARGS]
-                k_parts = [p for p in (model, filter_expr) if p]
-                if k_parts:
-                    pytest_cmd.extend(["-k", " and ".join(k_parts)])
-                pytest_cmd.extend(ctx.args)
-
-                typer.secho(f"$ {' '.join(pytest_cmd)}", fg=typer.colors.BRIGHT_BLACK)
-                r = subprocess.run(pytest_cmd, env=env, check=False)
-                if r.returncode != 0:
-                    typer.secho(
-                        f"smoke failed: {version}", fg=typer.colors.RED, err=True
-                    )
-                    failed.append(version)
-                else:
-                    typer.secho(f"smoke ok: {version}", fg=typer.colors.GREEN)
-                continue
+                failed.append(prov.version)
+            else:
+                typer.secho(f"smoke ok: {prov.version}", fg=typer.colors.GREEN)
+            continue
 
-            snapshot = (output_dir / f"linopy-{version}.json").resolve()
-            test_target = (
-                _PHASE_TEST_FILE[phase] if phase is not None else "benchmarks/"
+        snapshot = (output_dir / f"linopy-{prov.version}.json").resolve()
+        test_target = _PHASE_TEST_FILE[phase] if phase is not None else "benchmarks/"
+        pytest_cmd = [
+            str(prov.python),
+            "-m",
+            "pytest",
+            test_target,
+            "--benchmark-only",
+            "--benchmark-json",
+            str(snapshot),
+        ]
+        if quick:
+            pytest_cmd.append("--quick")
+        elif long:
+            pytest_cmd.append("--long")
+        if rounds is not None:
+            pytest_cmd.extend(
+                [f"--benchmark-min-rounds={rounds}", "--benchmark-max-time=0"]
             )
-            pytest_cmd = [
-                str(vpy),
-                "-m",
-                "pytest",
-                test_target,
-                "--benchmark-only",
-                "--benchmark-json",
-                str(snapshot),
-            ]
-            if quick:
-                pytest_cmd.append("--quick")
-            elif long:
-                pytest_cmd.append("--long")
-            if rounds is not None:
-                pytest_cmd.extend(
-                    [f"--benchmark-min-rounds={rounds}", "--benchmark-max-time=0"]
-                )
 
-            k_parts = [p for p in (model, filter_expr) if p]
-            if k_parts:
-                pytest_cmd.extend(["-k", " and ".join(k_parts)])
+        k_parts = [p for p in (model, filter_expr) if p]
+        if k_parts:
+            pytest_cmd.extend(["-k", " and ".join(k_parts)])
 
-            pytest_cmd.extend(ctx.args)
+        pytest_cmd.extend(ctx.args)
 
-            typer.secho(f"$ {' '.join(pytest_cmd)}", fg=typer.colors.BRIGHT_BLACK)
-            subprocess.run(pytest_cmd, env=env, check=False)
+        typer.secho(f"$ {' '.join(pytest_cmd)}", fg=typer.colors.BRIGHT_BLACK)
+        subprocess.run(pytest_cmd, env=prov.env, check=False)
 
-            if snapshot.exists():
-                typer.secho(f"saved {snapshot}", fg=typer.colors.GREEN)
-            else:
-                typer.secho(
-                    f"no snapshot produced for {version}",
-                    fg=typer.colors.RED,
-                    err=True,
-                )
-                failed.append(version)
+        if snapshot.exists():
+            typer.secho(f"saved {snapshot}", fg=typer.colors.GREEN)
+        else:
+            typer.secho(
+                f"no snapshot produced for {prov.version}",
+                fg=typer.colors.RED,
+                err=True,
+            )
+            failed.append(prov.version)
 
     if failed:
         typer.secho(f"\nFailed versions: {failed}", fg=typer.colors.RED, err=True)
@@ -1013,97 +1052,52 @@ def memory_sweep_cmd(
             )
             raise typer.Exit(code=2)
 
-    if shutil.which("uv") is None:
-        typer.secho(
-            "uv not found on PATH — install via https://docs.astral.sh/uv/",
-            fg=typer.colors.RED,
-            err=True,
-        )
-        raise typer.Exit(code=2)
-
-    repo_root = Path.cwd()
     output_dir.mkdir(parents=True, exist_ok=True)
+    repo_root = Path.cwd()
 
     failed: list[str] = []
-    for version in versions:
-        typer.secho(f"\n=== linopy {version} ===", fg=typer.colors.CYAN, bold=True)
-        with tempfile.TemporaryDirectory(prefix="linopy-mem-") as tmp:
-            venv = Path(tmp) / "venv"
-
-            r = subprocess.run(
-                ["uv", "venv", "--python", sys.executable, str(venv)],
-                check=False,
+    for prov in _provision_venvs(versions, "linopy-mem-"):
+        if prov.failed_at:
+            failed.append(prov.version)
+            continue
+
+        # ``memory save`` writes to ``.benchmarks/memory/<label>.json``
+        # under cwd; we run it with cwd pinned to repo root, then move
+        # the file if the user asked for a custom output dir.
+        label = f"linopy-{prov.version}"
+        mem_cmd = [
+            str(prov.python),
+            "-m",
+            "benchmarks",
+            "memory",
+            "save",
+            label,
+        ]
+        if quick:
+            mem_cmd.append("--quick")
+        for ph in phase or []:
+            mem_cmd.extend(["--phase", ph])
+        if repeats > 1:
+            mem_cmd.extend(["--repeats", str(repeats)])
+
+        typer.secho(f"$ {' '.join(mem_cmd)}", fg=typer.colors.BRIGHT_BLACK)
+        subprocess.run(mem_cmd, env=prov.env, cwd=str(repo_root), check=False)
+
+        default_path = repo_root / ".benchmarks" / "memory" / f"{label}.json"
+        target = output_dir / f"{label}.json"
+        if default_path.exists() and default_path.resolve() != target.resolve():
+            target.parent.mkdir(parents=True, exist_ok=True)
+            default_path.replace(target)
+
+        if target.exists():
+            typer.secho(f"saved {target}", fg=typer.colors.GREEN)
+        else:
+            typer.secho(
+                f"no snapshot produced for {prov.version}",
+                fg=typer.colors.RED,
+                err=True,
             )
-            if r.returncode != 0:
-                typer.secho(
-                    f"venv creation failed: {version}",
-                    fg=typer.colors.RED,
-                    err=True,
-                )
-                failed.append(version)
-                continue
-
-            vpy = _venv_python(venv)
-            spec = _linopy_install_spec(version)
-
-            install_args = [
-                "uv",
-                "pip",
-                "install",
-                "--python",
-                str(vpy),
-                *_benchmarks_extra_pins(),
-                spec,
-            ]
-            r = subprocess.run(install_args, check=False)
-            if r.returncode != 0:
-                typer.secho(f"install failed: {version}", fg=typer.colors.RED, err=True)
-                failed.append(version)
-                continue
-
-            env = os.environ.copy()
-            env["PYTHONPATH"] = str(repo_root)
-
-            # ``memory save`` writes to ``.benchmarks/memory/<label>.json``
-            # under cwd, so we cd back into the repo root via env and let
-            # ``output_dir`` resolve naturally.
-            label = f"linopy-{version}"
-            mem_cmd = [
-                str(vpy),
-                "-m",
-                "benchmarks",
-                "memory",
-                "save",
-                label,
-            ]
-            if quick:
-                mem_cmd.append("--quick")
-            for ph in phase or []:
-                mem_cmd.extend(["--phase", ph])
-            if repeats > 1:
-                mem_cmd.extend(["--repeats", str(repeats)])
-
-            typer.secho(f"$ {' '.join(mem_cmd)}", fg=typer.colors.BRIGHT_BLACK)
-            subprocess.run(mem_cmd, env=env, cwd=str(repo_root), check=False)
-
-            # memory.save writes to .benchmarks/memory/<label>.json relative
-            # to its cwd. Move it under output_dir if the user asked for a
-            # custom location.
-            default_path = repo_root / ".benchmarks" / "memory" / f"{label}.json"
-            target = output_dir / f"{label}.json"
-            if default_path.exists() and default_path.resolve() != target.resolve():
-                target.parent.mkdir(parents=True, exist_ok=True)
-                default_path.replace(target)
-
-            if target.exists():
-                typer.secho(f"saved {target}", fg=typer.colors.GREEN)
-            else:
-                typer.secho(
-                    f"no snapshot produced for {version}",
-                    fg=typer.colors.RED,
-                    err=True,
-                )
-                failed.append(version)
+            failed.append(prov.version)
 
     if failed:
         typer.secho(f"\nFailed versions: {failed}", fg=typer.colors.RED, err=True)

From 7d3e474ddc3fbbbf386534cb6ce0f4c09fdf854f Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Fri, 29 May 2026 08:22:01 +0200
Subject: [PATCH 53/68] =?UTF-8?q?benchmarks:=20bump=20pinned=20numpy=201.2?=
 =?UTF-8?q?6.4=20=E2=86=92=202.4.6?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous pin was on the last numpy 1.x release; 2.x has been the
default for ~2 years now and is what most linopy users actually run.
Pinning benchmarks to 1.26.4 meant the suite was measuring a code path
nobody hits anymore.

Verified safe via ``sweep --smoke`` across the realistic sweep set
(0.5.8, 0.6.0, 0.6.7, 0.7.0) — every linopy version installs, imports,
and exercises the suite (every model builds and every phase fires)
cleanly against numpy 2.4.6.

The pre-existing ``netcdf4`` binary-incompat warning (``numpy.ndarray
size changed``) is unchanged by this bump — it's a wheel-vs-ABI
mismatch from ``netcdf4==1.7.4`` that's present under both numpy 1.26.4
and 2.4.6, doesn't fail any test, and is a separate concern.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 0bb0128c..d792161d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -103,7 +103,7 @@ benchmarks = [
     # Perf-sensitive runtime deps. Pinned here (not in ``[project
     # .dependencies]``) so downstream linopy consumers keep their loose
     # resolve while the benchmark environment is fixed.
-    "numpy==1.26.4",
+    "numpy==2.4.6",
     "scipy==1.16.3",
     "xarray==2025.9.0",
     "pandas==2.3.3",

From 2621a7bc039c48a68d2c3ac52c86a25911bac066 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Fri, 29 May 2026 08:30:24 +0200
Subject: [PATCH 54/68] benchmarks: relax numpy pin to <2.0 for wider sweep
 coverage
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Empirical sweep --smoke verification:

- linopy <0.5.1 declares ``numpy<2.0`` in its metadata, so any
  numpy 2.x ``==`` pin in our ``[benchmarks]`` extra makes uv refuse
  to resolve those versions (install fails before any code runs).
- Relaxing to ``numpy<2.0`` lets the older versions install and run.
  Verified: 0.3.15, 0.4.0, 0.4.4, 0.5.0, 0.7.0 all pass ``sweep --smoke``
  under the relaxed pin.

uv resolves ``numpy<2.0`` to 1.26.4 on every current platform (the last
numpy 1.x release; numpy is done with the 1.x line), so the practical
reproducibility property of "every per-version venv gets the same numpy"
is preserved despite the looser-looking constraint.

Reverts the bump in 7d3e474. We'll go back to a 2.x ``==`` pin once
we drop pre-0.5.1 from sweep coverage — at which point ``sweep --smoke``
is the right tool to re-verify, same way it found this floor.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 pyproject.toml | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index d792161d..312b0198 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -103,7 +103,13 @@ benchmarks = [
     # Perf-sensitive runtime deps. Pinned here (not in ``[project
     # .dependencies]``) so downstream linopy consumers keep their loose
     # resolve while the benchmark environment is fixed.
-    "numpy==2.4.6",
+    #
+    # ``numpy`` is the one ``<X`` rather than ``==X`` pin: linopy <0.5.1
+    # declares ``numpy<2.0``, and we want ``sweep`` to cover the older
+    # versions too. uv resolves this to 1.26.4 (last numpy 1.x) on every
+    # current platform. Bump to a 2.x ``==`` pin once we drop pre-0.5.1
+    # from sweep coverage.
+    "numpy<2.0",
     "scipy==1.16.3",
     "xarray==2025.9.0",
     "pandas==2.3.3",

From 2656178087766efce5b7a70c99d63d8737505ea2 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Fri, 29 May 2026 08:34:34 +0200
Subject: [PATCH 55/68] benchmarks: pin numpy back to ==1.26.4 (last 1.x)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Honour the ``==`` pin convention used by every other measurement-relevant
dep in ``[benchmarks]``. ``numpy<2.0`` in the previous commit gave the
same practical result (uv resolves to 1.26.4) but broke the "every
direct dep is pinned exactly" property the surrounding pins rely on for
reproducibility.

Empirically verified ``sweep --smoke`` still covers the full 0.3.x →
0.7.0 range under the exact pin.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 pyproject.toml | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 312b0198..9a4b05c2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -104,12 +104,11 @@ benchmarks = [
     # .dependencies]``) so downstream linopy consumers keep their loose
     # resolve while the benchmark environment is fixed.
     #
-    # ``numpy`` is the one ``<X`` rather than ``==X`` pin: linopy <0.5.1
-    # declares ``numpy<2.0``, and we want ``sweep`` to cover the older
-    # versions too. uv resolves this to 1.26.4 (last numpy 1.x) on every
-    # current platform. Bump to a 2.x ``==`` pin once we drop pre-0.5.1
-    # from sweep coverage.
-    "numpy<2.0",
+    # ``numpy`` is held at the last 1.x release: linopy <0.5.1 declares
+    # ``numpy<2.0``, and we want ``sweep`` to cover the older versions.
+    # Bump to a 2.x release once we drop pre-0.5.1 from sweep coverage —
+    # ``sweep --smoke`` is the right tool to re-verify when that happens.
+    "numpy==1.26.4",
     "scipy==1.16.3",
     "xarray==2025.9.0",
     "pandas==2.3.3",

From e7f9c5b4a9c06ade342792aff630ef261e1f98aa Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Fri, 29 May 2026 08:56:41 +0200
Subject: [PATCH 56/68] benchmarks: fix sweep silently measuring dev linopy +
 getattr SOLVER_HANDOFFS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two coupled bugs that were making ``sweep`` produce meaningless
cross-version timings.

**Bug 1: silent linopy shadowing.** The callers of ``_provision_venvs``
ran the per-version pytest with the cwd inherited from the user's shell —
typically the repo root, which contains a ``linopy/`` package (the
one we're developing). Python prepends cwd to ``sys.path`` as
``''``, so ``import linopy`` resolved to the dev tree rather than the
venv's installed version. Every sweep run was measuring dev linopy
against itself; the per-version timings were noise on the same code.
(Previously the function also set ``PYTHONPATH=repo_root`` for
``import benchmarks``, which independently caused the same shadowing
even with a different cwd.)

Fix: create an isolated import root per version — a fresh tempdir
containing only a symlink ``benchmarks → repo_root/benchmarks``. The
sweep callers now run subprocesses with ``cwd=import_dir`` and no
``PYTHONPATH``. ``import benchmarks`` resolves via the symlink;
``import linopy`` falls through to site-packages → the requested
version. Added ``import_dir`` to ``_ProvisionedVenv`` and threaded it
through both ``sweep`` and ``memory sweep`` call sites (memory
discovery now looks under ``import_dir/.benchmarks/memory`` for the
``memory save`` output before moving it to ``output_dir``).

**Bug 2: SOLVER_HANDOFFS eagerly imports linopy.io.to_xpress, which
doesn't exist in any released linopy.** With shadowing in effect we
never noticed; after the isolation fix, even ``sweep --smoke 0.7.0``
fails collection because ``lio.to_xpress`` is an AttributeError.

Fix: build ``SOLVER_HANDOFFS`` via ``getattr(lio, name, None)`` and
filter out wrappers that aren't present in the installed linopy. The
tuple shape stays the same; older versions silently drop solvers
they don't support. ``memory.py``'s ``next(...)`` lookup of the
``"highs"`` entry now defaults to ``None``, and the solver_handoff
memory phase is skipped when the entry is missing rather than emitting
an unmatchable test id.

Consequence the user should expect: ``sweep --smoke`` against older
linopy versions now surfaces real install / runtime / API
incompatibilities rather than passing silently. Versions whose
metadata installs cleanly but whose code imports fail under our
pinned ``xarray`` / etc. will report ``smoke failed`` — that's the
correct signal.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli.py    | 59 ++++++++++++++++++++++++++++++++------------
 benchmarks/memory.py |  6 ++++-
 benchmarks/phases.py | 19 ++++++++++----
 3 files changed, 62 insertions(+), 22 deletions(-)

diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index 1acd476f..a6713ee6 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -398,8 +398,10 @@ class _ProvisionedVenv:
     """
     One fresh per-version venv from :func:`_provision_venvs`.
 
-    On success, ``python`` and ``env`` are populated and ``failed_at``
-    is ``None``. On failure, ``failed_at`` names the step that failed
+    On success, ``python``, ``env``, and ``import_dir`` are populated
+    and ``failed_at`` is ``None``. The caller MUST use ``import_dir``
+    as cwd for per-version subprocesses — see :func:`_provision_venvs`
+    for why. On failure, ``failed_at`` names the step that failed
     (``"venv"`` or ``"install"``); the caller skips its per-version
     action and records the failure.
     """
@@ -407,6 +409,7 @@ class _ProvisionedVenv:
     version: str
     python: Path | None
     env: dict[str, str] | None
+    import_dir: Path | None
     failed_at: str | None
 
 
@@ -418,9 +421,22 @@ def _provision_venvs(
 
     Used by both ``sweep`` and ``memory sweep`` so the venv plumbing
     (uv venv → install ``[benchmarks]`` pins + the target linopy →
-    set ``PYTHONPATH``) lives in one place. The caller supplies the
-    tempdir prefix (so ``ps``/``lsof`` can distinguish concurrent
-    runs) and does whatever per-version action it needs.
+    set up an isolated import root) lives in one place. The caller
+    supplies the tempdir prefix (so ``ps``/``lsof`` can distinguish
+    concurrent runs) and does whatever per-version action it needs.
+
+    **Isolation:** the repo root contains a ``linopy/`` package (the
+    one we're developing). Running the per-version pytest with the
+    repo root on ``sys.path`` — either via ``PYTHONPATH=repo`` or via
+    ``cwd=repo`` (Python prepends cwd as ``''``) — shadows the venv's
+    installed linopy with the dev tree. The whole sweep then measures
+    the dev linopy against itself instead of the requested version.
+    To avoid this, ``import_dir`` is a fresh tempdir per version that
+    contains a single symlink ``benchmarks → repo_root/benchmarks``.
+    Running subprocesses with ``cwd=import_dir`` and no ``PYTHONPATH``
+    makes ``import benchmarks`` resolve via the symlink while
+    ``import linopy`` falls through to the venv's site-packages — i.e.
+    the requested version.
 
     Each version's tempdir is cleaned up when the generator advances
     (or exits). The caller can break the loop early — Python's
@@ -450,7 +466,7 @@ def _provision_venvs(
                     fg=typer.colors.RED,
                     err=True,
                 )
-                yield _ProvisionedVenv(version, None, None, "venv")
+                yield _ProvisionedVenv(version, None, None, None, "venv")
                 continue
 
             vpy = _venv_python(venv)
@@ -472,15 +488,21 @@ def _provision_venvs(
             r = subprocess.run(install_args, check=False)
             if r.returncode != 0:
                 typer.secho(f"install failed: {version}", fg=typer.colors.RED, err=True)
-                yield _ProvisionedVenv(version, None, None, "install")
+                yield _ProvisionedVenv(version, None, None, None, "install")
                 continue
 
-            # PYTHONPATH makes ``import benchmarks`` resolve against the
-            # local checkout — the venv only provides linopy + test infra.
+            # Build the isolated import root described in the docstring.
+            import_dir = Path(tmp) / "iso"
+            import_dir.mkdir()
+            (import_dir / "benchmarks").symlink_to(repo_root / "benchmarks")
+
+            # No PYTHONPATH manipulation: the symlink + cwd=import_dir
+            # carries ``benchmarks`` without pulling the repo's
+            # ``linopy/`` into the import path.
             env = os.environ.copy()
-            env["PYTHONPATH"] = str(repo_root)
+            env.pop("PYTHONPATH", None)
 
-            yield _ProvisionedVenv(version, vpy, env, None)
+            yield _ProvisionedVenv(version, vpy, env, import_dir, None)
 
 
 @app.command(
@@ -611,7 +633,9 @@ def sweep(
             pytest_cmd.extend(ctx.args)
 
             typer.secho(f"$ {' '.join(pytest_cmd)}", fg=typer.colors.BRIGHT_BLACK)
-            r = subprocess.run(pytest_cmd, env=prov.env, check=False)
+            r = subprocess.run(
+                pytest_cmd, env=prov.env, cwd=str(prov.import_dir), check=False
+            )
             if r.returncode != 0:
                 typer.secho(
                     f"smoke failed: {prov.version}", fg=typer.colors.RED, err=True
@@ -648,7 +672,7 @@ def sweep(
         pytest_cmd.extend(ctx.args)
 
         typer.secho(f"$ {' '.join(pytest_cmd)}", fg=typer.colors.BRIGHT_BLACK)
-        subprocess.run(pytest_cmd, env=prov.env, check=False)
+        subprocess.run(pytest_cmd, env=prov.env, cwd=str(prov.import_dir), check=False)
 
         if snapshot.exists():
             typer.secho(f"saved {snapshot}", fg=typer.colors.GREEN)
@@ -1053,7 +1077,6 @@ def memory_sweep_cmd(
             raise typer.Exit(code=2)
 
     output_dir.mkdir(parents=True, exist_ok=True)
-    repo_root = Path.cwd()
 
     failed: list[str] = []
     for prov in _provision_venvs(versions, "linopy-mem-"):
@@ -1081,9 +1104,13 @@ def memory_sweep_cmd(
             mem_cmd.extend(["--repeats", str(repeats)])
 
         typer.secho(f"$ {' '.join(mem_cmd)}", fg=typer.colors.BRIGHT_BLACK)
-        subprocess.run(mem_cmd, env=prov.env, cwd=str(repo_root), check=False)
+        subprocess.run(mem_cmd, env=prov.env, cwd=str(prov.import_dir), check=False)
 
-        default_path = repo_root / ".benchmarks" / "memory" / f"{label}.json"
+        # ``memory save`` writes to ``.benchmarks/memory/<label>.json``
+        # relative to its cwd — here, the isolated import_dir. Move it
+        # under the user's chosen output_dir (resolves under repo_root
+        # by default).
+        default_path = prov.import_dir / ".benchmarks" / "memory" / f"{label}.json"
         target = output_dir / f"{label}.json"
         if default_path.exists() and default_path.resolve() != target.resolve():
             target.parent.mkdir(parents=True, exist_ok=True)
diff --git a/benchmarks/memory.py b/benchmarks/memory.py
index 559eaa37..c43b57a2 100644
--- a/benchmarks/memory.py
+++ b/benchmarks/memory.py
@@ -175,7 +175,11 @@ def _measurements(
 
         # Memory currently tracks only HiGHS — look it up by name so a
         # reordering of SOLVER_HANDOFFS doesn't silently swap solvers.
-        highs = next(w for n, _, w in SOLVER_HANDOFFS if n == "highs")
+        # Older linopy releases without ``to_highspy`` skip the phase
+        # silently rather than emitting an id with no possible match.
+        highs = next((w for n, _, w in SOLVER_HANDOFFS if n == "highs"), None)
+        if highs is None:
+            return
 
         yield (
             (
diff --git a/benchmarks/phases.py b/benchmarks/phases.py
index 0761f49d..f0d18313 100644
--- a/benchmarks/phases.py
+++ b/benchmarks/phases.py
@@ -60,9 +60,18 @@ def write_netcdf(m: linopy.Model, path: Path) -> None:
 # parametrization in ``test_solver_handoff.py`` and by ``memory.py``,
 # which looks up the "highs" entry. Adding a solver here automatically
 # extends both drivers.
-SOLVER_HANDOFFS: tuple[tuple[str, str, Callable[[linopy.Model], object]], ...] = (
-    ("highs", TO_HIGHSPY, lio.to_highspy),
-    ("gurobi", TO_GUROBIPY, lio.to_gurobipy),
-    ("mosek", TO_MOSEK, lio.to_mosek),
-    ("xpress", TO_XPRESS, lio.to_xpress),
+#
+# Each wrapper is fetched via ``getattr`` so the tuple silently drops
+# any solver wrapper missing from the installed ``linopy`` — necessary
+# for cross-version ``sweep`` runs against older releases (e.g.
+# ``to_xpress`` doesn't exist before linopy 0.7.1).
+SOLVER_HANDOFFS: tuple[tuple[str, str, Callable[[linopy.Model], object]], ...] = tuple(
+    (name, tag, wrapper)
+    for name, tag, wrapper in (
+        ("highs", TO_HIGHSPY, getattr(lio, "to_highspy", None)),
+        ("gurobi", TO_GUROBIPY, getattr(lio, "to_gurobipy", None)),
+        ("mosek", TO_MOSEK, getattr(lio, "to_mosek", None)),
+        ("xpress", TO_XPRESS, getattr(lio, "to_xpress", None)),
+    )
+    if wrapper is not None
 )

From b35fafedc45e4d63bd45dc403fbb3f5ee3431b29 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Fri, 29 May 2026 09:07:53 +0200
Subject: [PATCH 57/68] benchmarks: pin xarray to 2025.1.2 to extend sweep
 coverage to 0.4.4
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

xarray 2025.3.0 moved ``xarray.core.rolling`` to a different location.
linopy <=0.5.0 imports it directly, so the previous ``xarray==2025.9.0``
pin made any sweep against those releases fail at ``import linopy`` —
real signal that was masked while sweep was silently running the dev
linopy instead.

Pin to the last release before the rename (2025.1.2). Coverage now
extends down to 0.4.4 cleanly. The realistic floor is 0.4.4 — 0.4.0's
``to_file`` lacks the ``progress`` kwarg, and reaching back further
would need version-specific shims that aren't worth maintaining.

Verified: ``sweep --smoke 0.4.4 0.5.0 0.5.8 0.6.7 0.7.0 -k basic`` all
green; local ``smoke`` still passes on the dev linopy.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/phases.py | 4 +++-
 pyproject.toml       | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/benchmarks/phases.py b/benchmarks/phases.py
index f0d18313..6d111f74 100644
--- a/benchmarks/phases.py
+++ b/benchmarks/phases.py
@@ -47,7 +47,9 @@ def write_lp(m: linopy.Model, path: Path) -> None:
 
     ``progress=False`` is pinned here so the benchmark stays uniform
     across drivers — the progress bar's overhead would otherwise leak
-    into the measurement.
+    into the measurement. The sweep coverage floor is linopy 0.4.1,
+    when this kwarg was added; older versions raise ``TypeError`` and
+    are out of scope.
     """
     m.to_file(path, progress=False)
 
diff --git a/pyproject.toml b/pyproject.toml
index 9a4b05c2..49071402 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -110,7 +110,7 @@ benchmarks = [
     # ``sweep --smoke`` is the right tool to re-verify when that happens.
     "numpy==1.26.4",
     "scipy==1.16.3",
-    "xarray==2025.9.0",
+    "xarray==2025.1.2",
     "pandas==2.3.3",
     "polars==1.35.2",
     "dask==2025.11.0",

From 11f56d293728ba23a833675a14327226beaa9dd5 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Fri, 29 May 2026 09:15:25 +0200
Subject: [PATCH 58/68] benchmarks: shim write_lp for linopy <0.4.1, extending
 sweep floor to 0.2.0
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

linopy added the ``progress`` kwarg to ``Model.to_file`` in 0.4.1. The
suite's ``write_lp`` verb passes ``progress=False`` to keep the
progress-bar overhead out of the measurement, which means anything
older than 0.4.1 raised ``TypeError`` and failed sweep smoke.

Check once at import time (``inspect.signature``) whether the kwarg is
present; if not, fall back to the native call. The signature
inspection runs once when phases.py loads, so the hot path only pays
for a test of the cached module-level boolean.

Empirically extends sweep coverage from 0.4.4 down to 0.2.0 with no
other changes — roughly three years of historical releases now in
scope. 0.1.x has further API drift (``add_variables`` signature) and
0.0.x has pre-pyproject metadata that uv can't install, both out of
scope.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/phases.py | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/benchmarks/phases.py b/benchmarks/phases.py
index 6d111f74..58e7e67f 100644
--- a/benchmarks/phases.py
+++ b/benchmarks/phases.py
@@ -13,6 +13,7 @@
 
 from __future__ import annotations
 
+import inspect
 from collections.abc import Callable
 from pathlib import Path
 
@@ -21,6 +22,10 @@
 from benchmarks.registry import TO_GUROBIPY, TO_HIGHSPY, TO_MOSEK, TO_XPRESS
 from linopy import read_netcdf
 
+# linopy <0.4.1's ``to_file`` doesn't accept ``progress``. Check once
+# at import so the benchmark loop stays branchless on the hot path.
+_TO_FILE_HAS_PROGRESS = "progress" in inspect.signature(linopy.Model.to_file).parameters
+
 # Re-export so callers can ``from benchmarks.phases import read_netcdf``
 # alongside the wrappers.
 __all__ = [
@@ -45,13 +50,15 @@ def write_lp(m: linopy.Model, path: Path) -> None:
     """
     Write the model as an LP file.
 
-    ``progress=False`` is pinned here so the benchmark stays uniform
-    across drivers — the progress bar's overhead would otherwise leak
-    into the measurement. The sweep coverage floor is linopy 0.4.1,
-    when this kwarg was added; older versions raise ``TypeError`` and
-    are out of scope.
+    Where supported, ``progress=False`` is pinned here so the
+    benchmark stays uniform across drivers — the progress bar's
+    overhead would otherwise leak into the measurement. linopy <0.4.1
+    doesn't accept the kwarg; falls back to the native call.
     """
-    m.to_file(path, progress=False)
+    if _TO_FILE_HAS_PROGRESS:
+        m.to_file(path, progress=False)
+    else:
+        m.to_file(path)
 
 
 def write_netcdf(m: linopy.Model, path: Path) -> None:

From 3091c64b8765381294addd6fb6878c84bd8e8f75 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Fri, 29 May 2026 09:20:10 +0200
Subject: [PATCH 59/68] benchmarks: add --as-of <DATE> for
 cross-time-reproducible sweeps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Direct pins in ``[benchmarks]`` keep results reproducible *within* one
sweep call, but unpinned transitive deps can drift between sweep calls
days apart — a delta could then come from a new release of a
transitive dependency (numpy itself is pinned directly) rather than
the linopy change you wanted to attribute it to.

Add an ``--as-of <DATE>`` flag to both ``sweep`` and ``memory sweep``
that passes ``--exclude-newer`` to uv. The entire transitive
resolution is frozen to releases on or before the date; running the
same sweep set + the same ``--as-of`` value at any later point
reproduces the same dep tree (modulo PyPI yanking).

Plumbed through ``_provision_venvs(as_of=...)`` so both call sites
stay single-source. Default is unchanged — no ``--as-of`` ⇒ latest
resolution, matching prior behaviour.

Empirically verified: ``--as-of 2026-05-01`` correctly rejects the
install when ``pytest-codspeed==5.0.3`` (released later) is in the
pin set; ``--as-of 2026-05-29`` resolves cleanly and ``sweep --smoke``
passes.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli.py | 40 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 37 insertions(+), 3 deletions(-)

diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index a6713ee6..8c70388c 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -414,7 +414,7 @@ class _ProvisionedVenv:
 
 
 def _provision_venvs(
-    versions: list[str], tmp_prefix: str
+    versions: list[str], tmp_prefix: str, as_of: str | None = None
 ) -> Iterator[_ProvisionedVenv]:
     """
     Yield one fresh per-version uv venv for each linopy version.
@@ -441,6 +441,14 @@ def _provision_venvs(
     Each version's tempdir is cleaned up when the generator advances
     (or exits). The caller can break the loop early — Python's
     generator close protocol fires the ``with`` teardown.
+
+    **Cross-time reproducibility:** if ``as_of`` is a date string
+    (``YYYY-MM-DD`` or any ISO 8601 timestamp), passes
+    ``--exclude-newer`` to uv so the entire transitive resolution is
+    frozen to releases on or before that date. Pinning direct deps
+    alone (current default) keeps results reproducible *within* one
+    sweep call, but unpinned transitives can drift between sweep calls
+    days apart; ``as_of`` closes that gap.
     """
     if shutil.which("uv") is None:
         typer.secho(
@@ -482,6 +490,7 @@ def _provision_venvs(
                 "install",
                 "--python",
                 str(vpy),
+                *(["--exclude-newer", as_of] if as_of else []),
                 *_benchmarks_extra_pins(),
                 spec,
             ]
@@ -566,6 +575,20 @@ def sweep(
             ),
         ),
     ] = False,
+    as_of: Annotated[
+        str | None,
+        typer.Option(
+            "--as-of",
+            help=(
+                "Freeze every dep's resolution to releases on or before this "
+                "date (``YYYY-MM-DD`` or ISO 8601). Passes ``--exclude-newer`` "
+                "to uv. Use a consistent value across invocations for "
+                "cross-time-reproducible sweeps — direct pins alone keep "
+                "results stable within one call but transitives can drift "
+                "between calls."
+            ),
+        ),
+    ] = None,
 ) -> None:
     """
     Run the benchmark suite against several linopy versions.
@@ -617,7 +640,7 @@ def sweep(
         output_dir.mkdir(parents=True, exist_ok=True)
 
     failed: list[str] = []
-    for prov in _provision_venvs(versions, "linopy-bench-"):
+    for prov in _provision_venvs(versions, "linopy-bench-", as_of=as_of):
         if prov.failed_at:
             failed.append(prov.version)
             continue
@@ -1051,6 +1074,17 @@ def memory_sweep_cmd(
             help="min-of-N peak per measurement (default 1).",
         ),
     ] = 1,
+    as_of: Annotated[
+        str | None,
+        typer.Option(
+            "--as-of",
+            help=(
+                "Freeze every dep's resolution to releases on or before this "
+                "date (``YYYY-MM-DD`` or ISO 8601). Same semantics as "
+                "``sweep --as-of`` — see that command's help."
+            ),
+        ),
+    ] = None,
 ) -> None:
     """
     Sweep peak-memory measurements across several linopy versions.
@@ -1079,7 +1113,7 @@ def memory_sweep_cmd(
     output_dir.mkdir(parents=True, exist_ok=True)
 
     failed: list[str] = []
-    for prov in _provision_venvs(versions, "linopy-mem-"):
+    for prov in _provision_venvs(versions, "linopy-mem-", as_of=as_of):
         if prov.failed_at:
             failed.append(prov.version)
             continue

From e74ae1e0d21150e60069a3b4473ffd6f4af7cde7 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Fri, 29 May 2026 09:22:27 +0200
Subject: [PATCH 60/68] benchmarks: harden the sweep isolation seam (preflight
 + no bytecode)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two follow-ups to the symlink-isolation fix in e7f9c5b:

1. Preflight check per provisioned venv. After symlinking benchmarks/
   into the import_dir, run a tiny ``python -c`` that imports linopy
   from the cwd we're about to use and asserts that ``linopy.__file__``
   contains the venv's path. If a future change reintroduces the
   dev-linopy shadow (PYTHONPATH=repo, missing PYTHONDONTWRITEBYTECODE
   side-effect, pytest import-mode bump, …), this fails loudly with
   "isolation leak: linopy resolved to <path>, not the venv" rather
   than silently corrupting every snapshot in the sweep. The new
   ``failed_at`` value ``"isolation"`` lets callers record this the
   same way they already record venv/install failures.

2. ``PYTHONDONTWRITEBYTECODE=1`` in the subprocess env. The symlink
   resolves to the real benchmarks/ source tree, so every sweep
   subprocess would otherwise write fresh ``.pyc`` files into the
   user's working tree — harmless (Python is held constant so the
   bytecode is valid) but it mutates the checkout and would risk write
   contention if sweep ever becomes parallel. One env var keeps each
   run pure.

Verified: shadowing simulated by re-setting ``PYTHONPATH=repo`` is now
caught by the preflight with the expected assertion message; happy
path ``sweep 0.7.0 --smoke`` still passes; ``benchmarks/__pycache__``
is untouched after a sweep.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli.py | 36 +++++++++++++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)

diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index 8c70388c..c3ce8be9 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -507,9 +507,43 @@ def _provision_venvs(
 
             # No PYTHONPATH manipulation: the symlink + cwd=import_dir
             # carries ``benchmarks`` without pulling the repo's
-            # ``linopy/`` into the import path.
+            # ``linopy/`` into the import path. PYTHONDONTWRITEBYTECODE
+            # keeps the symlinked ``benchmarks/`` source tree clean of
+            # ``__pycache__`` writes from each per-version subprocess.
             env = os.environ.copy()
             env.pop("PYTHONPATH", None)
+            env["PYTHONDONTWRITEBYTECODE"] = "1"
+
+            # Preflight: confirm the venv's linopy is what gets imported
+            # under cwd=import_dir. If a future change reintroduces the
+            # dev-linopy shadow bug, this fails loudly here rather than
+            # silently corrupting every snapshot in the sweep.
+            preflight = subprocess.run(
+                [
+                    str(vpy),
+                    "-c",
+                    (
+                        "import linopy; "
+                        f"assert {str(venv)!r} in linopy.__file__, "
+                        "f'isolation leak: linopy resolved to "
+                        "{linopy.__file__}, not the venv'"
+                    ),
+                ],
+                cwd=str(import_dir),
+                env=env,
+                capture_output=True,
+                text=True,
+                check=False,
+            )
+            if preflight.returncode != 0:
+                typer.secho(
+                    f"isolation preflight failed: {version}",
+                    fg=typer.colors.RED,
+                    err=True,
+                )
+                typer.echo(preflight.stderr.strip(), err=True)
+                yield _ProvisionedVenv(version, None, None, None, "isolation")
+                continue
 
             yield _ProvisionedVenv(version, vpy, env, import_dir, None)
 

From 55612f53f68a10feb4d16ff4b85f6c41444eed2f Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Fri, 29 May 2026 09:40:57 +0200
Subject: [PATCH 61/68] benchmarks: copy harness into sweep venvs instead of
 symlinking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The per-version isolation root held a symlink to repo_root/benchmarks.
Replace it with a filtered copy so the sweep runs on Windows (no symlink
privilege needed) and no per-version subprocess — including its
__pycache__ writes — can touch the working tree.

Drops the now-redundant PYTHONDONTWRITEBYTECODE: with a copy, bytecode
lands in the throwaway tempdir, so the working tree is structurally
untouchable rather than protected by an env var. ignore_patterns skips
the executed notebook and cruft to keep the copy cheap.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli.py | 40 +++++++++++++++++++++++++---------------
 1 file changed, 25 insertions(+), 15 deletions(-)

diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index c3ce8be9..ebbf2218 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -402,8 +402,8 @@ class _ProvisionedVenv:
     and ``failed_at`` is ``None``. The caller MUST use ``import_dir``
     as cwd for per-version subprocesses — see :func:`_provision_venvs`
     for why. On failure, ``failed_at`` names the step that failed
-    (``"venv"`` or ``"install"``); the caller skips its per-version
-    action and records the failure.
+    (``"venv"``, ``"install"``, or ``"isolation"``); the caller skips
+    its per-version action and records the failure.
     """
 
     version: str
@@ -432,11 +432,14 @@ def _provision_venvs(
     installed linopy with the dev tree. The whole sweep then measures
     the dev linopy against itself instead of the requested version.
     To avoid this, ``import_dir`` is a fresh tempdir per version that
-    contains a single symlink ``benchmarks → repo_root/benchmarks``.
-    Running subprocesses with ``cwd=import_dir`` and no ``PYTHONPATH``
-    makes ``import benchmarks`` resolve via the symlink while
-    ``import linopy`` falls through to the venv's site-packages — i.e.
-    the requested version.
+    holds a filtered *copy* of ``benchmarks/`` and nothing else — a
+    copy rather than a symlink so the sweep runs on Windows without
+    symlink privileges and so no per-version subprocess (nor its
+    ``__pycache__`` writes) ever touches the working tree. Running
+    subprocesses with ``cwd=import_dir`` and no ``PYTHONPATH`` makes
+    ``import benchmarks`` resolve to that copy while ``import linopy``
+    falls through to the venv's site-packages — i.e. the requested
+    version. The preflight below asserts that resolution actually held.
 
     Each version's tempdir is cleaned up when the generator advances
     (or exits). The caller can break the loop early — Python's
@@ -500,19 +503,26 @@ def _provision_venvs(
                 yield _ProvisionedVenv(version, None, None, None, "install")
                 continue
 
-            # Build the isolated import root described in the docstring.
+            # Build the isolated import root described in the docstring:
+            # a filtered copy of ``benchmarks/`` and nothing else. The
+            # heavy, sweep-irrelevant artifacts (the executed notebook,
+            # bytecode caches, macOS cruft) are skipped to keep the
+            # per-version copy cheap.
             import_dir = Path(tmp) / "iso"
             import_dir.mkdir()
-            (import_dir / "benchmarks").symlink_to(repo_root / "benchmarks")
+            shutil.copytree(
+                repo_root / "benchmarks",
+                import_dir / "benchmarks",
+                ignore=shutil.ignore_patterns("__pycache__", "*.ipynb", ".DS_Store"),
+            )
 
-            # No PYTHONPATH manipulation: the symlink + cwd=import_dir
-            # carries ``benchmarks`` without pulling the repo's
-            # ``linopy/`` into the import path. PYTHONDONTWRITEBYTECODE
-            # keeps the symlinked ``benchmarks/`` source tree clean of
-            # ``__pycache__`` writes from each per-version subprocess.
+            # No PYTHONPATH manipulation: the copied ``benchmarks`` under
+            # cwd=import_dir carries the harness without pulling the
+            # repo's ``linopy/`` into the import path. Bytecode the
+            # subprocess writes lands in this throwaway copy, never the
+            # working tree, so no PYTHONDONTWRITEBYTECODE is needed.
             env = os.environ.copy()
             env.pop("PYTHONPATH", None)
-            env["PYTHONDONTWRITEBYTECODE"] = "1"
 
             # Preflight: confirm the venv's linopy is what gets imported
             # under cwd=import_dir. If a future change reintroduces the

From c031153d2c2ceca344b70f283875f917225baefe Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Fri, 29 May 2026 10:09:57 +0200
Subject: [PATCH 62/68] benchmarks: add ad-hoc `bench` helper for arbitrary
 callables
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`benchmarks.bench` times or memory-profiles any callable in-process on
the current tree — a registry builder, a phase verb on a hand-built
model, a one-off lambda — and returns a result that round-trips through
the existing snapshot/plot machinery (`load_long_df`). Three entry
points (`time`, `memory`, `compare`) plus `TimingResult` / `MemoryResult`
/ `ResultSet` with `to_snapshot` / `to_df` / rich Jupyter reprs.

To make the memray peak-measurer reusable, memory.py no longer raises at
import on Windows: the check moves into a `_require_memray()` called by
each measuring entry point, and `_measure_peak` is promoted to public
`measure_peak` (back-compat alias kept). bench reuses it for the memory
path.

Adds a "Benchmarking custom things" section to walkthrough.md (executes
end-to-end under the CI notebook run) and re-exports `bench` from the
package.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 benchmarks/__init__.py    |  18 +-
 benchmarks/bench.py       | 367 ++++++++++++++++++++++++++++++++++++++
 benchmarks/memory.py      |  39 +++-
 benchmarks/test_bench.py  | 107 +++++++++++
 benchmarks/walkthrough.md |  70 ++++++++
 5 files changed, 591 insertions(+), 10 deletions(-)
 create mode 100644 benchmarks/bench.py
 create mode 100644 benchmarks/test_bench.py

diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
index 5e181b98..f508540a 100644
--- a/benchmarks/__init__.py
+++ b/benchmarks/__init__.py
@@ -22,11 +22,24 @@
     qp_specs = filter_by(has_feature=QUADRATIC)
 """
 
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    import pandas as pd
+
+    from benchmarks.plotting import Metric
+
 # Importing the models package triggers each module's ``register(...)`` call.
-from benchmarks import models  # noqa: F401, E402
+from benchmarks import bench, models  # noqa: F401, E402
 
 
-def load_long_df(snapshots, metric="min"):
+def load_long_df(
+    snapshots: list[Path], metric: Metric = "min"
+) -> tuple[pd.DataFrame, str]:
     """
     Load one or more pytest-benchmark JSON snapshots into a tidy DataFrame.
 
@@ -91,6 +104,7 @@ def load_long_df(snapshots, metric="min"):
     "TO_HIGHSPY",
     "TO_MOSEK",
     "TO_XPRESS",
+    "bench",
     "filter_by",
     "get",
     "iter_params",
diff --git a/benchmarks/bench.py b/benchmarks/bench.py
new file mode 100644
index 00000000..cdee1eea
--- /dev/null
+++ b/benchmarks/bench.py
@@ -0,0 +1,367 @@
+"""
+Ad-hoc benchmarking of arbitrary callables on the *current* linopy tree.
+
+Where the pytest suite measures the fixed registry grid and ``sweep``
+measures across installed linopy versions, ``bench`` is for the
+interactive middle: time or memory-profile any callable — a registry
+builder, a phase verb applied to a model you built by hand, or a one-off
+lambda — get a result object back, and either inspect it as a DataFrame
+or drop it into a snapshot the existing ``plot`` / ``compare`` machinery
+already understands::
+
+    from benchmarks import bench, REGISTRY
+
+    r = bench.time(REGISTRY["basic"].build, 100)
+    r                                  # rich repr in a notebook
+    r.to_snapshot("a.json", model="basic", size=100, phase="build")
+
+    bench.compare({"v1": f1, "v2": f2}).to_snapshot("cmp.json")
+
+This plugs into the *output* side of the pipeline (snapshot JSON read by
+``plotting.load_long_df``), not into ``sweep``: a sweep runs pytest inside
+per-version venvs as subprocesses, so it can only measure importable
+registry models — an in-process callable can't cross that boundary. To
+sweep a custom model across versions, promote it to ``benchmarks/models/``.
+
+**Methodology.** Timing uses ``time.perf_counter`` with the same
+min-of-N convention as the rest of the suite (the fastest sample
+approximates the no-noise floor). It is *not* pytest-benchmark's
+calibrated timer, so absolute numbers are not interchangeable with suite
+snapshots — compare ``bench`` to ``bench`` and suite to suite.
+"""
+
+from __future__ import annotations
+
+import json
+from collections.abc import Callable, Sequence
+from dataclasses import dataclass, field
+from pathlib import Path
+from statistics import mean, median, stdev
+from time import perf_counter
+from typing import TYPE_CHECKING, Any, Literal
+
+if TYPE_CHECKING:
+    import pandas as pd
+
+__all__ = [
+    "MemoryResult",
+    "ResultSet",
+    "TimingResult",
+    "compare",
+    "memory",
+    "time",
+]
+
+# Floor / cap on the auto-tuned round count when ``rounds`` is unset.
+# The floor guarantees a meaningful min-of-N even for slow callables that
+# blow past ``min_time`` in one shot; the cap stops a microsecond callable
+# from spinning forever.
+_ROUND_FLOOR = 5
+_ROUND_CAP = 10_000
+
+
+def _fn_name(fn: Callable[..., object]) -> str:
+    """Best-effort label for a callable (``functools.partial`` has no name)."""
+    return getattr(fn, "__name__", None) or repr(fn)
+
+
+def _synth_id(
+    label: str, *, model: str | None, size: int | None, phase: str | None
+) -> str:
+    """
+    Build the snapshot test id for a result.
+
+    With all of ``model``/``size``/``phase`` supplied, synthesize
+    ``bench::{phase}[{model}-n={size}]`` — this parses cleanly into the
+    ``(phase, model, size)`` columns (so ``plot --view scaling`` works
+    across several sizes). With none supplied, fall back to ``label``
+    verbatim (lands in the ``"other"`` bucket — still fine for
+    ``compare``). A partial spec is ambiguous and rejected.
+    """
+    given = (model is not None, size is not None, phase is not None)
+    if all(given):
+        return f"bench::{phase}[{model}-n={size}]"
+    if any(given):
+        raise ValueError(
+            "model, size, and phase must be given together (or all omitted)"
+        )
+    return label
+
+
+def _row(test_id: str, value: float) -> dict[str, object]:
+    """One ``load_long_df``-shaped row for an in-process result."""
+    from benchmarks.plotting import _parse_test_id
+
+    phase, model, size = _parse_test_id(test_id)
+    return {
+        "snapshot": test_id,
+        "test_id": test_id,
+        "phase": phase,
+        "model": model,
+        "size": size,
+        "value": value,
+    }
+
+
+def _frame(rows: list[dict[str, object]]) -> pd.DataFrame:
+    """Build a DataFrame with the exact column set/dtype of ``load_long_df``."""
+    import pandas as pd
+
+    df = pd.DataFrame(
+        rows, columns=["snapshot", "test_id", "phase", "model", "size", "value"]
+    )
+    df["size"] = df["size"].astype("Int64")
+    return df
+
+
+# --- Result types ----------------------------------------------------------
+
+
+@dataclass(frozen=True)
+class TimingResult:
+    """One timed callable: per-round stats with ``min`` as the headline."""
+
+    label: str
+    stats: dict[str, float]
+    unit: Literal["s"] = "s"
+
+    def to_snapshot(
+        self,
+        path: str | Path,
+        *,
+        model: str | None = None,
+        size: int | None = None,
+        phase: str | None = None,
+    ) -> Path:
+        """Write a pytest-benchmark-shaped timing snapshot (seconds)."""
+        test_id = _synth_id(self.label, model=model, size=size, phase=phase)
+        data = {"benchmarks": [{"fullname": test_id, "stats": dict(self.stats)}]}
+        out = Path(path)
+        out.write_text(json.dumps(data, indent=2))
+        return out
+
+    def to_df(self) -> pd.DataFrame:
+        """``load_long_df``-shaped frame (one row, ``value`` = min seconds)."""
+        return _frame([_row(self.label, self.stats["min"])])
+
+    def __repr__(self) -> str:
+        return (
+            f"TimingResult({self.label!r}, min={self.stats['min']:.4g}s, "
+            f"rounds={int(self.stats['rounds'])})"
+        )
+
+    def _repr_html_(self) -> str:
+        rows = [
+            ("min", f"{self.stats['min']:.4g} s"),
+            ("median", f"{self.stats['median']:.4g} s"),
+            ("mean", f"{self.stats['mean']:.4g} s"),
+            ("max", f"{self.stats['max']:.4g} s"),
+            ("stddev", f"{self.stats['stddev']:.4g} s"),
+            ("rounds", int(self.stats["rounds"])),
+        ]
+        return _html_table("TimingResult", self.label, rows)
+
+
+@dataclass(frozen=True)
+class MemoryResult:
+    """One memory-profiled callable: peak RSS in MiB."""
+
+    label: str
+    peak_mib: float
+    unit: Literal["MiB"] = "MiB"
+
+    def to_snapshot(
+        self,
+        path: str | Path,
+        *,
+        model: str | None = None,
+        size: int | None = None,
+        phase: str | None = None,
+    ) -> Path:
+        """Write a memory.py-shaped snapshot (peak MiB)."""
+        test_id = _synth_id(self.label, model=model, size=size, phase=phase)
+        data = {"label": self.label, "peak_mib": {test_id: self.peak_mib}}
+        out = Path(path)
+        out.write_text(json.dumps(data, indent=2))
+        return out
+
+    def to_df(self) -> pd.DataFrame:
+        """``load_long_df``-shaped frame (one row, ``value`` = peak MiB)."""
+        return _frame([_row(self.label, self.peak_mib)])
+
+    def __repr__(self) -> str:
+        return f"MemoryResult({self.label!r}, peak={self.peak_mib:.1f} MiB)"
+
+    def _repr_html_(self) -> str:
+        return _html_table(
+            "MemoryResult", self.label, [("peak", f"{self.peak_mib:.1f} MiB")]
+        )
+
+
+@dataclass(frozen=True)
+class ResultSet:
+    """
+    Several results of one kind (all timing, or all memory).
+
+    ``to_snapshot`` writes every result into a single file keyed by its
+    label — the natural "compare these N variants" case. For
+    size-parametrized ``scaling`` plots, write each result individually
+    with ``model``/``size``/``phase`` instead.
+    """
+
+    results: list[TimingResult | MemoryResult] = field(default_factory=list)
+    unit: Literal["s", "MiB"] = "s"
+
+    def to_snapshot(self, path: str | Path) -> Path:
+        """Write all results into one snapshot, each keyed by its label."""
+        out = Path(path)
+        if self.unit == "s":
+            entries = [
+                {"fullname": r.label, "stats": dict(r.stats)}
+                for r in self.results
+                if isinstance(r, TimingResult)
+            ]
+            out.write_text(json.dumps({"benchmarks": entries}, indent=2))
+        else:
+            peaks = {
+                r.label: r.peak_mib for r in self.results if isinstance(r, MemoryResult)
+            }
+            out.write_text(
+                json.dumps({"label": "compare", "peak_mib": peaks}, indent=2)
+            )
+        return out
+
+    def to_df(self) -> pd.DataFrame:
+        """Concatenate the per-result frames (shares ``load_long_df`` columns)."""
+        import pandas as pd
+
+        return pd.concat([r.to_df() for r in self.results], ignore_index=True)
+
+    def __repr__(self) -> str:
+        labels = ", ".join(r.label for r in self.results)
+        return f"ResultSet(unit={self.unit!r}, [{labels}])"
+
+    def _repr_html_(self) -> str:
+        rows = [
+            (
+                r.label,
+                f"{r.stats['min']:.4g} s"
+                if isinstance(r, TimingResult)
+                else f"{r.peak_mib:.1f} MiB",
+            )
+            for r in self.results
+        ]
+        return _html_table("ResultSet", self.unit, rows)
+
+
+def _html_table(kind: str, header: str, rows: Sequence[tuple[str, object]]) -> str:
+    """Compact two-column Jupyter table, mirroring ``ModelSpec._repr_html_``."""
+    body = "".join(
+        f"<tr><th style='text-align:left;padding-right:1em'>{k}</th><td>{v}</td></tr>"
+        for k, v in rows
+    )
+    return (
+        f"<b>{kind}</b> <code>{header}</code>"
+        f"<table style='font-size:90%'>{body}</table>"
+    )
+
+
+# --- Entry points ----------------------------------------------------------
+
+
+def time(
+    fn: Callable[..., object],
+    /,
+    *args: object,
+    rounds: int | None = None,
+    warmup: int = 1,
+    min_time: float = 0.5,
+    label: str | None = None,
+    **kwargs: object,
+) -> TimingResult:
+    """
+    Time ``fn(*args, **kwargs)`` and return a :class:`TimingResult`.
+
+    After ``warmup`` untimed calls, run timed calls with
+    ``time.perf_counter``. With ``rounds`` set, run exactly that many;
+    otherwise auto-tune — keep going until cumulative timed wall-clock
+    reaches ``min_time`` (with a floor of 5 rounds and a hard cap). The
+    headline number is ``stats["min"]``.
+    """
+    call = lambda: fn(*args, **kwargs)  # noqa: E731
+
+    for _ in range(max(0, warmup)):
+        call()
+
+    samples: list[float] = []
+    if rounds is not None:
+        for _ in range(max(1, rounds)):
+            t0 = perf_counter()
+            call()
+            samples.append(perf_counter() - t0)
+    else:
+        total = 0.0
+        while True:
+            t0 = perf_counter()
+            call()
+            dt = perf_counter() - t0
+            samples.append(dt)
+            total += dt
+            if len(samples) >= _ROUND_FLOOR and total >= min_time:
+                break
+            if len(samples) >= _ROUND_CAP:
+                break
+
+    stats = {
+        "min": min(samples),
+        "max": max(samples),
+        "mean": mean(samples),
+        "median": median(samples),
+        "stddev": stdev(samples) if len(samples) > 1 else 0.0,
+        "rounds": float(len(samples)),
+    }
+    return TimingResult(label=label or _fn_name(fn), stats=stats)
+
+
+def memory(
+    fn: Callable[..., object],
+    /,
+    *args: object,
+    repeats: int = 1,
+    label: str | None = None,
+    **kwargs: object,
+) -> MemoryResult:
+    """
+    Peak-RSS profile ``fn(*args, **kwargs)`` and return a :class:`MemoryResult`.
+
+    Thin wrapper over :func:`benchmarks.memory.measure_peak`; ``repeats > 1``
+    keeps the minimum peak. Raises on Windows (no ``memray``).
+    """
+    from benchmarks.memory import measure_peak
+
+    peak = measure_peak(lambda: fn(*args, **kwargs), repeats=repeats)
+    return MemoryResult(label=label or _fn_name(fn), peak_mib=peak)
+
+
+def compare(
+    cases: dict[str, Callable[[], object]],
+    *,
+    kind: Literal["time", "memory"] = "time",
+    **opts: Any,
+) -> ResultSet:
+    """
+    Run each zero-arg callable in ``cases`` and collect a :class:`ResultSet`.
+
+    ``kind`` selects timing (default) or memory; ``opts`` are forwarded to
+    :func:`time` / :func:`memory` (e.g. ``rounds=``, ``repeats=``). The
+    dict key becomes each case's label.
+    """
+    if kind == "time":
+        results: list[TimingResult | MemoryResult] = [
+            time(fn, label=name, **opts) for name, fn in cases.items()
+        ]
+        return ResultSet(results=results, unit="s")
+    if kind == "memory":
+        results = [memory(fn, label=name, **opts) for name, fn in cases.items()]
+        return ResultSet(results=results, unit="MiB")
+    raise ValueError(f"kind must be 'time' or 'memory', got {kind!r}")
diff --git a/benchmarks/memory.py b/benchmarks/memory.py
index c43b57a2..6ce8db49 100644
--- a/benchmarks/memory.py
+++ b/benchmarks/memory.py
@@ -34,15 +34,27 @@
 from pathlib import Path
 from typing import TYPE_CHECKING
 
-if platform.system() == "Windows":
-    raise RuntimeError(
-        "memory measurement requires ``memray`` which is not available on "
-        "Windows. Run memory benchmarks on Linux or macOS."
-    )
-
 if TYPE_CHECKING:
     from benchmarks.registry import ModelSpec
 
+
+def _require_memray() -> None:
+    """
+    Raise if memory measurement isn't supported on this platform.
+
+    Called at the top of every entry point that actually measures
+    (:func:`measure_peak`, :func:`run_phase`, :func:`save`) rather than
+    at import time, so the module imports cleanly everywhere — notably
+    ``benchmarks.bench`` reuses :func:`measure_peak` and must import on
+    Windows. Only *measuring* fails there, with the original message.
+    """
+    if platform.system() == "Windows":
+        raise RuntimeError(
+            "memory measurement requires ``memray`` which is not available on "
+            "Windows. Run memory benchmarks on Linux or macOS."
+        )
+
+
 RESULTS_DIR = Path(".benchmarks/memory")
 MEMORY_PHASES: tuple[str, ...] = (
     "build",
@@ -72,7 +84,7 @@ def _phase_tag(phase: str) -> str:
     }[phase]
 
 
-def _measure_peak(action: Callable[[], object], repeats: int = 1) -> float:
+def measure_peak(action: Callable[[], object], repeats: int = 1) -> float:
     """
     Run ``action()`` under ``memray.Tracker`` and return peak MiB.
 
@@ -82,6 +94,8 @@ def _measure_peak(action: Callable[[], object], repeats: int = 1) -> float:
     file-system page cache for netcdf) so the min-of-N is the cleanest
     estimate of "the floor this code can hit".
     """
+    _require_memray()
+
     import memray
 
     peaks: list[float] = []
@@ -108,6 +122,11 @@ def _measure_peak(action: Callable[[], object], repeats: int = 1) -> float:
     return min(peaks)
 
 
+# Back-compat alias: ``_measure_peak`` was the private name before
+# ``benchmarks.bench`` needed to reuse it.
+_measure_peak = measure_peak
+
+
 def _measurements(
     phase: str, spec: ModelSpec, size: int
 ) -> Iterator[tuple[str, Callable[[], object]]]:
@@ -199,8 +218,10 @@ def run_phase(phase: str, quick: bool = False, repeats: int = 1) -> dict[str, fl
 
     Returns a ``{test_id: peak_mib}`` mapping. Invoked once per phase as a
     subprocess by :func:`save` for isolation. ``repeats`` is forwarded to
-    :func:`_measure_peak` so callers can dial up signal-to-noise.
+    :func:`measure_peak` so callers can dial up signal-to-noise.
     """
+    _require_memray()
+
     from benchmarks import REGISTRY
 
     tag = _phase_tag(phase)
@@ -257,6 +278,8 @@ def save(
     measurement; ``memray.Tracker`` only counts what's allocated inside its
     ``with`` block, but the subprocess boundary makes the isolation total.
     """
+    _require_memray()
+
     phases = list(phases) if phases else list(MEMORY_PHASES)
 
     all_results: dict[str, float] = {}
diff --git a/benchmarks/test_bench.py b/benchmarks/test_bench.py
new file mode 100644
index 00000000..8ac2b5f8
--- /dev/null
+++ b/benchmarks/test_bench.py
@@ -0,0 +1,107 @@
+"""
+Tests for the ad-hoc ``bench`` helper.
+
+The contract under test is the *seam*: a ``bench`` result must round-trip
+into ``plotting.load_long_df`` exactly like a real snapshot, and its
+in-process ``to_df`` must line up column-for-column with the loaded frame.
+These are the only non-obvious behaviours — the timing math itself is not
+asserted beyond "finite and positive", since wall-clock values aren't
+reproducible.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+import linopy
+from benchmarks import REGISTRY, bench
+from benchmarks.phases import touch_matrices
+from benchmarks.plotting import load_long_df
+
+
+def _tiny() -> int:
+    return sum(range(1000))
+
+
+def _alloc() -> int:
+    # Allocate ~16 MB so the memray peak is unambiguously above zero;
+    # ``_tiny`` allocates nothing measurable.
+    data = [0] * 2_000_000
+    return len(data)
+
+
+def test_timing_snapshot_round_trips_into_loader(tmp_path: Path) -> None:
+    """A synthesized id parses back into the (phase, model, size) columns."""
+    snap = tmp_path / "t.json"
+    bench.time(_tiny, rounds=3).to_snapshot(
+        snap, model="basic", size=100, phase="build"
+    )
+
+    df, unit = load_long_df([snap])
+    assert unit == "s"
+    assert len(df) == 1
+    row = df.iloc[0]
+    assert (row["phase"], row["model"], row["size"]) == ("build", "basic", 100)
+    assert row["value"] > 0
+
+
+def test_compare_writes_n_entries(tmp_path: Path) -> None:
+    """``compare`` collects N cases into one snapshot → N loadable rows."""
+    snap = tmp_path / "cmp.json"
+    rs = bench.compare({"a": _tiny, "b": _tiny, "c": _tiny}, kind="time", rounds=2)
+    rs.to_snapshot(snap)
+
+    df, unit = load_long_df([snap])
+    assert unit == "s"
+    assert len(df) == 3
+    assert set(df["test_id"]) == {"a", "b", "c"}
+
+
+def test_to_df_columns_match_loader(tmp_path: Path) -> None:
+    """In-process ``to_df`` shares the loader's exact column set/order."""
+    snap = tmp_path / "t.json"
+    result = bench.time(_tiny, rounds=2)
+    result.to_snapshot(snap, model="basic", size=10, phase="build")
+
+    loaded, _ = load_long_df([snap])
+    assert list(result.to_df().columns) == list(loaded.columns)
+
+
+def test_memory_path_round_trips(tmp_path: Path) -> None:
+    """Memory results carry MiB and round-trip through the loader."""
+    pytest.importorskip("memray")
+    snap = tmp_path / "m.json"
+    result = bench.memory(_alloc)
+    assert result.peak_mib > 0
+    result.to_snapshot(snap, model="basic", size=10, phase="build")
+
+    df, unit = load_long_df([snap])
+    assert unit == "MiB"
+    assert df.iloc[0]["value"] > 0
+
+
+def test_phase_verb_on_custom_model() -> None:
+    """The headline use case: a phase verb timed on a hand-built model."""
+    m = linopy.Model()
+    x = m.add_variables(lower=0, name="x")
+    m.add_constraints(x >= 1)
+    m.add_objective(x)
+
+    result = bench.time(touch_matrices, m, rounds=2)
+    assert result.stats["min"] > 0
+    assert result.stats["rounds"] == 2
+
+
+def test_registry_builder_times() -> None:
+    """A registry builder is a plain callable — no special-casing needed."""
+    result = bench.time(REGISTRY["basic"].build, 50, rounds=2)
+    assert result.stats["min"] > 0
+
+
+def test_partial_id_spec_rejected(tmp_path: Path) -> None:
+    """A half-given (model/size/phase) id is ambiguous and must error."""
+    result = bench.time(_tiny, rounds=1)
+    with pytest.raises(ValueError, match="given together"):
+        result.to_snapshot(tmp_path / "x.json", model="basic")
diff --git a/benchmarks/walkthrough.md b/benchmarks/walkthrough.md
index 1e9301e2..8821a1b5 100644
--- a/benchmarks/walkthrough.md
+++ b/benchmarks/walkthrough.md
@@ -200,6 +200,76 @@ For cross-version memory tracking (analogous to `sweep` for timing),
 use `memory sweep <v1> <v2> ...` — same per-version venv shape, peak
 RSS metric.
 
+## Benchmarking custom things — the `bench` API
+
+The CLI measures the fixed registry grid. When you want to time or
+memory-profile *something the registry doesn't have* — a builder called
+with odd arguments, a phase verb on a model you built by hand, a one-off
+lambda — reach for `benchmarks.bench`. It measures in-process on the
+**current** tree and hands back a result you can inspect or drop into a
+snapshot the `plot` / `compare` machinery already reads. (It can't feed
+`sweep`, which runs pytest in per-version subprocesses — promote a model
+to `benchmarks/models/` to sweep it.)
+
+`bench.time` times any callable with the suite's min-of-N convention. It
+is *not* pytest-benchmark's calibrated timer, so compare `bench` numbers
+only to other `bench` numbers:
+
+```{code-cell} ipython3
+from benchmarks import REGISTRY, bench
+
+bench.time(REGISTRY["basic"].build, 100, rounds=5)
+```
+
+Any callable works — including a phase verb applied to a model the
+registry has never heard of. `bench.memory` profiles peak RSS through
+the same `memray` path the `memory` command uses:
+
+```{code-cell} ipython3
+import linopy
+from benchmarks.phases import touch_matrices
+
+m = linopy.Model()
+x = m.add_variables(coords=[range(2000)], dims=["i"], name="x")
+m.add_constraints(x >= 1)
+m.add_objective(x.sum())
+
+bench.memory(touch_matrices, m)
+```
+
+`bench.compare` runs several callables and collects a `ResultSet`.
+`to_snapshot` writes it in the on-disk shape `load_long_df` reads — the
+seam every plot view sits on — so in-process results round-trip through
+the existing tooling without a detour:
+
+```{code-cell} ipython3
+from benchmarks import load_long_df
+
+rs = bench.compare(
+    {
+        "listcomp": lambda: [i * i for i in range(10_000)],
+        "map": lambda: list(map(lambda i: i * i, range(10_000))),
+    },
+    rounds=20,
+)
+
+bench_snap = _tmp / "bench.json"
+rs.to_snapshot(bench_snap)
+
+df, unit = load_long_df([bench_snap])
+print(f"unit: {unit}")
+df
+```
+
+Those label-keyed ids land in the `other` bucket. For a size-`scaling`
+plot, write each result with `model=` / `size=` / `phase=` so the id
+parses into those columns — `plot` then treats it like any suite
+snapshot:
+
+    bench.time(REGISTRY["basic"].build, 100).to_snapshot(
+        snap, model="basic", size=100, phase="build"
+    )
+
 ## Other CLI surfaces
 
 | Command                            | Purpose                                                              |

From 3df647c82ab2e31fca2c3af87d3674eede169f46 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Fri, 29 May 2026 10:10:13 +0200
Subject: [PATCH 63/68] benchmarks: make the suite mypy-clean
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Annotate the benchmark modules so `mypy benchmarks/*.py
benchmarks/models/*.py` passes (28 files, 0 errors). Covers the real
type gaps — `SPEC: ModelSpec | None` in the conditionally-registered sos
/ piecewise models, narrowing `prov.import_dir` past the `failed_at`
guard in `memory sweep`, and return/arg annotations on the plotting
helpers — plus `-> None` / fixture-arg annotations across the phase
tests and conftest.

benchmarks/* stays in the mypy `exclude`, so this isn't enforced in CI;
it just makes an explicit `mypy benchmarks/...` run come back clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli.py                          |  3 +++
 benchmarks/conftest.py                     |  6 +++--
 benchmarks/models/piecewise.py             |  1 +
 benchmarks/models/sos.py                   |  1 +
 benchmarks/plotting.py                     |  9 ++++---
 benchmarks/test_build.py                   | 11 ++++++--
 benchmarks/test_lp_write.py                | 13 ++++++++--
 benchmarks/test_matrices.py                | 11 ++++++--
 benchmarks/test_netcdf.py                  | 21 +++++++++++++---
 benchmarks/test_pypsa_carbon_management.py | 29 +++++++++++++++-------
 benchmarks/test_solver_handoff.py          | 17 ++++++++++---
 11 files changed, 95 insertions(+), 27 deletions(-)

diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index ebbf2218..623ee26f 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -1161,6 +1161,9 @@ def memory_sweep_cmd(
         if prov.failed_at:
             failed.append(prov.version)
             continue
+        # ``failed_at is None`` guarantees these are populated (see
+        # ``_ProvisionedVenv``); narrow for the type checker.
+        assert prov.python is not None and prov.import_dir is not None
 
         # ``memory save`` writes to ``.benchmarks/memory/<label>.json``
         # under cwd; we run it with cwd pinned to repo root, then move
diff --git a/benchmarks/conftest.py b/benchmarks/conftest.py
index fea6ba6b..abe56ac7 100644
--- a/benchmarks/conftest.py
+++ b/benchmarks/conftest.py
@@ -7,7 +7,7 @@
 from benchmarks.registry import ModelSpec
 
 
-def pytest_addoption(parser):
+def pytest_addoption(parser: pytest.Parser) -> None:
     parser.addoption(
         "--quick",
         action="store_true",
@@ -25,7 +25,9 @@ def pytest_addoption(parser):
     )
 
 
-def pytest_collection_modifyitems(config, items):
+def pytest_collection_modifyitems(
+    config: pytest.Config, items: list[pytest.Item]
+) -> None:
     """
     Drop PyPSA end-to-end tests under ``--quick``.
 
diff --git a/benchmarks/models/piecewise.py b/benchmarks/models/piecewise.py
index 0fb393bd..77157ba1 100644
--- a/benchmarks/models/piecewise.py
+++ b/benchmarks/models/piecewise.py
@@ -74,6 +74,7 @@ def build_piecewise(n_gens: int) -> linopy.Model:
 
 # ``add_piecewise_formulation`` is a recent (still-evolving) API. Skip
 # registration silently on older linopy so the rest of the suite stays usable.
+SPEC: ModelSpec | None
 if _API_AVAILABLE:
     SPEC = register(
         ModelSpec(
diff --git a/benchmarks/models/sos.py b/benchmarks/models/sos.py
index 06ac22d4..26ff2fb7 100644
--- a/benchmarks/models/sos.py
+++ b/benchmarks/models/sos.py
@@ -70,6 +70,7 @@ def build_sos(n_gens: int) -> linopy.Model:
 
 # ``add_sos_constraints`` is a recent API. On older linopy we silently skip
 # registering this model — the rest of the suite stays usable.
+SPEC: ModelSpec | None
 if _API_AVAILABLE:
     SPEC = register(
         ModelSpec(
diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
index 45c2b010..1b52e80f 100644
--- a/benchmarks/plotting.py
+++ b/benchmarks/plotting.py
@@ -28,6 +28,7 @@
 from typing import TYPE_CHECKING, Literal
 
 if TYPE_CHECKING:
+    import pandas as pd
     from plotly.graph_objects import Figure
 
 PlotView = Literal["compare", "scatter", "sweep", "scaling"]
@@ -83,7 +84,9 @@ def _parse_test_id(test_id: str) -> tuple[str, str, int | None]:
     return "other", "other", None
 
 
-def load_long_df(snapshots: list[Path], metric: Metric = "min"):
+def load_long_df(
+    snapshots: list[Path], metric: Metric = "min"
+) -> tuple[pd.DataFrame, str]:
     """
     Return ``(df, unit)`` — one row per ``(snapshot, test_id)`` pair.
 
@@ -126,7 +129,7 @@ def _axis_kwargs(unit: str) -> dict:
     return {"ticksuffix": f" {unit}"}
 
 
-def _hide_non_leftmost_yticks(fig, wrap: int) -> None:
+def _hide_non_leftmost_yticks(fig: Figure, wrap: int) -> None:
     """
     Hide y-axis tick labels on every facet except the leftmost column.
 
@@ -142,7 +145,7 @@ def _hide_non_leftmost_yticks(fig, wrap: int) -> None:
             yaxis.update(showticklabels=False)
 
 
-def _share_axis_labels(fig, y_label: str, x_label: str) -> None:
+def _share_axis_labels(fig: Figure, y_label: str, x_label: str) -> None:
     """
     Replace per-facet axis titles with one shared label per axis.
 
diff --git a/benchmarks/test_build.py b/benchmarks/test_build.py
index 98559b3c..5bb3430b 100644
--- a/benchmarks/test_build.py
+++ b/benchmarks/test_build.py
@@ -2,15 +2,22 @@
 
 from __future__ import annotations
 
+from collections.abc import Callable
+
 import pytest
 
 from benchmarks.conftest import maybe_skip
-from benchmarks.registry import BUILD, iter_params, param_ids
+from benchmarks.registry import BUILD, ModelSpec, iter_params, param_ids
 
 _PARAMS = iter_params(BUILD)
 
 
 @pytest.mark.parametrize("spec,size", _PARAMS, ids=param_ids(_PARAMS))
-def test_build(benchmark, spec, size, request):
+def test_build(
+    benchmark: Callable[..., object],
+    spec: ModelSpec,
+    size: int,
+    request: pytest.FixtureRequest,
+) -> None:
     maybe_skip(request, spec, size)
     benchmark(spec.build, size)
diff --git a/benchmarks/test_lp_write.py b/benchmarks/test_lp_write.py
index 2dec144d..f31c1284 100644
--- a/benchmarks/test_lp_write.py
+++ b/benchmarks/test_lp_write.py
@@ -2,17 +2,26 @@
 
 from __future__ import annotations
 
+from collections.abc import Callable
+from pathlib import Path
+
 import pytest
 
 from benchmarks.conftest import maybe_skip
 from benchmarks.phases import write_lp
-from benchmarks.registry import LP_WRITE, iter_params, param_ids
+from benchmarks.registry import LP_WRITE, ModelSpec, iter_params, param_ids
 
 _PARAMS = iter_params(LP_WRITE)
 
 
 @pytest.mark.parametrize("spec,size", _PARAMS, ids=param_ids(_PARAMS))
-def test_lp_write(benchmark, spec, size, request, tmp_path):
+def test_lp_write(
+    benchmark: Callable[..., object],
+    spec: ModelSpec,
+    size: int,
+    request: pytest.FixtureRequest,
+    tmp_path: Path,
+) -> None:
     maybe_skip(request, spec, size)
     m = spec.build(size)
     lp_file = tmp_path / "model.lp"
diff --git a/benchmarks/test_matrices.py b/benchmarks/test_matrices.py
index 019f6dd8..f985aec3 100644
--- a/benchmarks/test_matrices.py
+++ b/benchmarks/test_matrices.py
@@ -2,17 +2,24 @@
 
 from __future__ import annotations
 
+from collections.abc import Callable
+
 import pytest
 
 from benchmarks.conftest import maybe_skip
 from benchmarks.phases import touch_matrices
-from benchmarks.registry import MATRICES, iter_params, param_ids
+from benchmarks.registry import MATRICES, ModelSpec, iter_params, param_ids
 
 _PARAMS = iter_params(MATRICES)
 
 
 @pytest.mark.parametrize("spec,size", _PARAMS, ids=param_ids(_PARAMS))
-def test_matrices(benchmark, spec, size, request):
+def test_matrices(
+    benchmark: Callable[..., object],
+    spec: ModelSpec,
+    size: int,
+    request: pytest.FixtureRequest,
+) -> None:
     maybe_skip(request, spec, size)
     m = spec.build(size)
     benchmark(touch_matrices, m)
diff --git a/benchmarks/test_netcdf.py b/benchmarks/test_netcdf.py
index 7b02c2bf..a47203e0 100644
--- a/benchmarks/test_netcdf.py
+++ b/benchmarks/test_netcdf.py
@@ -8,17 +8,26 @@
 
 from __future__ import annotations
 
+from collections.abc import Callable
+from pathlib import Path
+
 import pytest
 
 from benchmarks.conftest import maybe_skip
 from benchmarks.phases import read_netcdf, write_netcdf
-from benchmarks.registry import NETCDF, iter_params, param_ids
+from benchmarks.registry import NETCDF, ModelSpec, iter_params, param_ids
 
 _PARAMS = iter_params(NETCDF)
 
 
 @pytest.mark.parametrize("spec,size", _PARAMS, ids=param_ids(_PARAMS))
-def test_netcdf_write(benchmark, spec, size, request, tmp_path):
+def test_netcdf_write(
+    benchmark: Callable[..., object],
+    spec: ModelSpec,
+    size: int,
+    request: pytest.FixtureRequest,
+    tmp_path: Path,
+) -> None:
     maybe_skip(request, spec, size)
     m = spec.build(size)
     out = tmp_path / "model.nc"
@@ -26,7 +35,13 @@ def test_netcdf_write(benchmark, spec, size, request, tmp_path):
 
 
 @pytest.mark.parametrize("spec,size", _PARAMS, ids=param_ids(_PARAMS))
-def test_netcdf_read(benchmark, spec, size, request, tmp_path):
+def test_netcdf_read(
+    benchmark: Callable[..., object],
+    spec: ModelSpec,
+    size: int,
+    request: pytest.FixtureRequest,
+    tmp_path: Path,
+) -> None:
     maybe_skip(request, spec, size)
     m = spec.build(size)
     out = tmp_path / "model.nc"
diff --git a/benchmarks/test_pypsa_carbon_management.py b/benchmarks/test_pypsa_carbon_management.py
index 7583de5a..a57763e2 100644
--- a/benchmarks/test_pypsa_carbon_management.py
+++ b/benchmarks/test_pypsa_carbon_management.py
@@ -1,3 +1,8 @@
+from __future__ import annotations
+
+from collections.abc import Callable
+from typing import Any
+
 import pytest
 
 import linopy as lp
@@ -8,39 +13,45 @@
 
 
 @pytest.fixture(scope="module")
-def network():
+def network() -> Any:
     return pypsa.examples.carbon_management()
 
 
-def test_create_model_frozen(benchmark, network):
+def test_create_model_frozen(benchmark: Callable[..., object], network: Any) -> None:
     benchmark(network.optimize.create_model, freeze_constraints=True)
 
 
-def test_create_model_mutable(benchmark, network):
+def test_create_model_mutable(benchmark: Callable[..., object], network: Any) -> None:
     benchmark(network.optimize.create_model, freeze_constraints=False)
 
 
 @pytest.fixture(scope="module")
-def model_frozen(network):
+def model_frozen(network: Any) -> Any:
     return network.optimize.create_model(freeze_constraints=True)
 
 
 @pytest.fixture(scope="module")
-def model_mutable(network):
+def model_mutable(network: Any) -> Any:
     return network.optimize.create_model(freeze_constraints=False)
 
 
-def test_to_highspy_frozen(benchmark, model_frozen):
+def test_to_highspy_frozen(benchmark: Callable[..., object], model_frozen: Any) -> None:
     benchmark(lp.io.to_highspy, model_frozen)
 
 
-def test_to_highspy_mutable(benchmark, model_mutable):
+def test_to_highspy_mutable(
+    benchmark: Callable[..., object], model_mutable: Any
+) -> None:
     benchmark(lp.io.to_highspy, model_mutable)
 
 
-def test_to_highspy_mutable_no_names(benchmark, model_mutable):
+def test_to_highspy_mutable_no_names(
+    benchmark: Callable[..., object], model_mutable: Any
+) -> None:
     benchmark(lp.io.to_highspy, model_mutable, set_names=False)
 
 
-def test_to_highspy_frozen_no_names(benchmark, model_frozen):
+def test_to_highspy_frozen_no_names(
+    benchmark: Callable[..., object], model_frozen: Any
+) -> None:
     benchmark(lp.io.to_highspy, model_frozen, set_names=False)
diff --git a/benchmarks/test_solver_handoff.py b/benchmarks/test_solver_handoff.py
index c3432b67..702d8a21 100644
--- a/benchmarks/test_solver_handoff.py
+++ b/benchmarks/test_solver_handoff.py
@@ -14,16 +14,18 @@
 
 from __future__ import annotations
 
+from collections.abc import Callable
+
 import pytest
 
 from benchmarks.conftest import maybe_skip
 from benchmarks.phases import SOLVER_HANDOFFS
-from benchmarks.registry import iter_params
+from benchmarks.registry import ModelSpec, iter_params
 from linopy.solvers import available_solvers
 
 
-def _make_params():
-    out = []
+def _make_params() -> list[object]:
+    out: list[object] = []
     for solver_name, phase, wrapper in SOLVER_HANDOFFS:
         for spec, size in iter_params(phase):
             out.append(
@@ -39,7 +41,14 @@ def _make_params():
 
 
 @pytest.mark.parametrize("solver_name,wrapper,spec,size", _make_params())
-def test_solver_handoff(benchmark, solver_name, wrapper, spec, size, request):
+def test_solver_handoff(
+    benchmark: Callable[..., object],
+    solver_name: str,
+    wrapper: Callable[..., object],
+    spec: ModelSpec,
+    size: int,
+    request: pytest.FixtureRequest,
+) -> None:
     if solver_name not in available_solvers:
         pytest.skip(f"{solver_name} not installed")
     maybe_skip(request, spec, size)

From c5f23ec62498990a7116472025ccb0799da8b866 Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Fri, 29 May 2026 11:17:19 +0200
Subject: [PATCH 64/68] benchmarks: extract snapshot.py + calibrate bench.time

Two related cleanups after noticing that bench had quietly re-derived
both the snapshot contract and a thinner version of pytest-benchmark.

Extract a dependency-free benchmarks/snapshot.py that owns the format:
the two on-disk JSON shapes (write_timing_snapshot / write_memory_snapshot
/ load_snapshot), the test-id grammar (parse_test_id / synth_test_id),
and load_long_df. plotting, bench, and memory now all depend on it
instead of bench reaching sideways into plotting's private _parse_test_id
and three writers hand-rolling the same JSON. plotting shrinks to "plotly
views over a long DataFrame"; memory.save and bench.to_snapshot share one
writer; __init__'s load_long_df re-export drops its plotly-pulling path.

Rebase bench.time on timeit.Timer.autorange: calibrate an inner
iteration count so timer resolution stops dominating fast callables (the
old one-call-per-round loop was unstable in exactly that regime, and min
is the headline stat), then sample per-iteration time across rounds.
Records stats["iterations"]. Still explicitly not interchangeable with
suite numbers.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 benchmarks/__init__.py   |   8 +-
 benchmarks/bench.py      | 141 +++++++++++++++-----------------
 benchmarks/memory.py     |   5 +-
 benchmarks/plotting.py   |  91 +--------------------
 benchmarks/snapshot.py   | 172 +++++++++++++++++++++++++++++++++++++++
 benchmarks/test_bench.py |   4 +-
 6 files changed, 248 insertions(+), 173 deletions(-)
 create mode 100644 benchmarks/snapshot.py

diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
index f508540a..2f476484 100644
--- a/benchmarks/__init__.py
+++ b/benchmarks/__init__.py
@@ -31,7 +31,7 @@
 
     import pandas as pd
 
-    from benchmarks.plotting import Metric
+    from benchmarks.snapshot import Metric
 
 # Importing the models package triggers each module's ``register(...)`` call.
 from benchmarks import bench, models  # noqa: F401, E402
@@ -41,16 +41,16 @@ def load_long_df(
     snapshots: list[Path], metric: Metric = "min"
 ) -> tuple[pd.DataFrame, str]:
     """
-    Load one or more pytest-benchmark JSON snapshots into a tidy DataFrame.
+    Load one or more benchmark JSON snapshots into a tidy DataFrame.
 
-    Thin re-export of :func:`benchmarks.plotting.load_long_df` so callers
+    Thin re-export of :func:`benchmarks.snapshot.load_long_df` so callers
     can do their own analysis without importing the plotting module
     (which pulls in plotly). Returns ``(df, unit)`` where ``df`` has one
     row per ``(snapshot, test_id)`` with columns ``snapshot, test_id,
     phase, model, size, value``, and ``unit`` is ``"s"`` (timing) or
     ``"MiB"`` (memory).
     """
-    from benchmarks.plotting import load_long_df as _impl
+    from benchmarks.snapshot import load_long_df as _impl
 
     return _impl(snapshots, metric)
 
diff --git a/benchmarks/bench.py b/benchmarks/bench.py
index cdee1eea..682523f2 100644
--- a/benchmarks/bench.py
+++ b/benchmarks/bench.py
@@ -18,28 +18,37 @@
     bench.compare({"v1": f1, "v2": f2}).to_snapshot("cmp.json")
 
 This plugs into the *output* side of the pipeline (snapshot JSON read by
-``plotting.load_long_df``), not into ``sweep``: a sweep runs pytest inside
+``snapshot.load_long_df``), not into ``sweep``: a sweep runs pytest inside
 per-version venvs as subprocesses, so it can only measure importable
 registry models — an in-process callable can't cross that boundary. To
 sweep a custom model across versions, promote it to ``benchmarks/models/``.
 
-**Methodology.** Timing uses ``time.perf_counter`` with the same
-min-of-N convention as the rest of the suite (the fastest sample
-approximates the no-noise floor). It is *not* pytest-benchmark's
-calibrated timer, so absolute numbers are not interchangeable with suite
-snapshots — compare ``bench`` to ``bench`` and suite to suite.
+**Methodology.** Timing is built on :class:`timeit.Timer`: an
+``autorange`` calibration picks the inner iteration count (so timer
+resolution doesn't dominate fast callables), then the per-iteration time
+is sampled across rounds with the suite's min-of-N convention (the
+fastest sample approximates the no-noise floor). It is *not*
+pytest-benchmark's calibrated timer, so absolute numbers are not
+interchangeable with suite snapshots — compare ``bench`` to ``bench`` and
+suite to suite.
 """
 
 from __future__ import annotations
 
-import json
 from collections.abc import Callable, Sequence
 from dataclasses import dataclass, field
 from pathlib import Path
 from statistics import mean, median, stdev
-from time import perf_counter
+from timeit import Timer
 from typing import TYPE_CHECKING, Any, Literal
 
+from benchmarks.snapshot import (
+    parse_test_id,
+    synth_test_id,
+    write_memory_snapshot,
+    write_timing_snapshot,
+)
+
 if TYPE_CHECKING:
     import pandas as pd
 
@@ -65,34 +74,9 @@ def _fn_name(fn: Callable[..., object]) -> str:
     return getattr(fn, "__name__", None) or repr(fn)
 
 
-def _synth_id(
-    label: str, *, model: str | None, size: int | None, phase: str | None
-) -> str:
-    """
-    Build the snapshot test id for a result.
-
-    With all of ``model``/``size``/``phase`` supplied, synthesize
-    ``bench::{phase}[{model}-n={size}]`` — this parses cleanly into the
-    ``(phase, model, size)`` columns (so ``plot --view scaling`` works
-    across several sizes). With none supplied, fall back to ``label``
-    verbatim (lands in the ``"other"`` bucket — still fine for
-    ``compare``). A partial spec is ambiguous and rejected.
-    """
-    given = (model is not None, size is not None, phase is not None)
-    if all(given):
-        return f"bench::{phase}[{model}-n={size}]"
-    if any(given):
-        raise ValueError(
-            "model, size, and phase must be given together (or all omitted)"
-        )
-    return label
-
-
 def _row(test_id: str, value: float) -> dict[str, object]:
     """One ``load_long_df``-shaped row for an in-process result."""
-    from benchmarks.plotting import _parse_test_id
-
-    phase, model, size = _parse_test_id(test_id)
+    phase, model, size = parse_test_id(test_id)
     return {
         "snapshot": test_id,
         "test_id": test_id,
@@ -134,11 +118,8 @@ def to_snapshot(
         phase: str | None = None,
     ) -> Path:
         """Write a pytest-benchmark-shaped timing snapshot (seconds)."""
-        test_id = _synth_id(self.label, model=model, size=size, phase=phase)
-        data = {"benchmarks": [{"fullname": test_id, "stats": dict(self.stats)}]}
-        out = Path(path)
-        out.write_text(json.dumps(data, indent=2))
-        return out
+        test_id = synth_test_id(self.label, model=model, size=size, phase=phase)
+        return write_timing_snapshot(path, [(test_id, dict(self.stats))])
 
     def to_df(self) -> pd.DataFrame:
         """``load_long_df``-shaped frame (one row, ``value`` = min seconds)."""
@@ -147,7 +128,7 @@ def to_df(self) -> pd.DataFrame:
     def __repr__(self) -> str:
         return (
             f"TimingResult({self.label!r}, min={self.stats['min']:.4g}s, "
-            f"rounds={int(self.stats['rounds'])})"
+            f"rounds={int(self.stats['rounds'])}x{int(self.stats.get('iterations', 1))})"
         )
 
     def _repr_html_(self) -> str:
@@ -158,6 +139,7 @@ def _repr_html_(self) -> str:
             ("max", f"{self.stats['max']:.4g} s"),
             ("stddev", f"{self.stats['stddev']:.4g} s"),
             ("rounds", int(self.stats["rounds"])),
+            ("iterations", int(self.stats.get("iterations", 1))),
         ]
         return _html_table("TimingResult", self.label, rows)
 
@@ -179,11 +161,8 @@ def to_snapshot(
         phase: str | None = None,
     ) -> Path:
         """Write a memory.py-shaped snapshot (peak MiB)."""
-        test_id = _synth_id(self.label, model=model, size=size, phase=phase)
-        data = {"label": self.label, "peak_mib": {test_id: self.peak_mib}}
-        out = Path(path)
-        out.write_text(json.dumps(data, indent=2))
-        return out
+        test_id = synth_test_id(self.label, model=model, size=size, phase=phase)
+        return write_memory_snapshot(path, self.label, {test_id: self.peak_mib})
 
     def to_df(self) -> pd.DataFrame:
         """``load_long_df``-shaped frame (one row, ``value`` = peak MiB)."""
@@ -214,22 +193,19 @@ class ResultSet:
 
     def to_snapshot(self, path: str | Path) -> Path:
         """Write all results into one snapshot, each keyed by its label."""
-        out = Path(path)
         if self.unit == "s":
-            entries = [
-                {"fullname": r.label, "stats": dict(r.stats)}
-                for r in self.results
-                if isinstance(r, TimingResult)
-            ]
-            out.write_text(json.dumps({"benchmarks": entries}, indent=2))
-        else:
-            peaks = {
-                r.label: r.peak_mib for r in self.results if isinstance(r, MemoryResult)
-            }
-            out.write_text(
-                json.dumps({"label": "compare", "peak_mib": peaks}, indent=2)
+            return write_timing_snapshot(
+                path,
+                [
+                    (r.label, dict(r.stats))
+                    for r in self.results
+                    if isinstance(r, TimingResult)
+                ],
             )
-        return out
+        peaks = {
+            r.label: r.peak_mib for r in self.results if isinstance(r, MemoryResult)
+        }
+        return write_memory_snapshot(path, "compare", peaks)
 
     def to_df(self) -> pd.DataFrame:
         """Concatenate the per-result frames (shares ``load_long_df`` columns)."""
@@ -282,31 +258,43 @@ def time(
     """
     Time ``fn(*args, **kwargs)`` and return a :class:`TimingResult`.
 
-    After ``warmup`` untimed calls, run timed calls with
-    ``time.perf_counter``. With ``rounds`` set, run exactly that many;
-    otherwise auto-tune — keep going until cumulative timed wall-clock
-    reaches ``min_time`` (with a floor of 5 rounds and a hard cap). The
-    headline number is ``stats["min"]``.
+    Built on :class:`timeit.Timer`: an ``autorange`` calibration first
+    picks the inner iteration count so timer resolution doesn't dominate
+    for fast callables (the bespoke "one call per round" loop this
+    replaced was unstable in exactly that regime). Each round then runs
+    that many calibrated iterations; the per-iteration time is the
+    sample. ``warmup`` rounds are discarded to prime caches.
+
+    With ``rounds`` set, run exactly that many rounds; otherwise
+    auto-tune — keep going until cumulative timed wall-clock reaches
+    ``min_time`` (floor of 5 rounds, hard cap). The headline number is
+    ``stats["min"]``; ``stats["iterations"]`` records the calibrated
+    inner count.
+
+    This is *not* pytest-benchmark's calibrated timer — ``bench`` numbers
+    are only comparable to other ``bench`` numbers, not to suite
+    snapshots.
     """
-    call = lambda: fn(*args, **kwargs)  # noqa: E731
+    timer = Timer(lambda: fn(*args, **kwargs))
+
+    # Calibrate inner iterations so a single round is long enough that
+    # ``perf_counter`` granularity is negligible (autorange ensures >= 0.2 s).
+    number, _ = timer.autorange()
 
     for _ in range(max(0, warmup)):
-        call()
+        timer.timeit(number)
 
-    samples: list[float] = []
+    samples: list[float] = []  # per-iteration seconds
     if rounds is not None:
-        for _ in range(max(1, rounds)):
-            t0 = perf_counter()
-            call()
-            samples.append(perf_counter() - t0)
+        samples = [
+            t / number for t in timer.repeat(repeat=max(1, rounds), number=number)
+        ]
     else:
         total = 0.0
         while True:
-            t0 = perf_counter()
-            call()
-            dt = perf_counter() - t0
-            samples.append(dt)
-            total += dt
+            t = timer.timeit(number)
+            samples.append(t / number)
+            total += t
             if len(samples) >= _ROUND_FLOOR and total >= min_time:
                 break
             if len(samples) >= _ROUND_CAP:
@@ -319,6 +307,7 @@ def time(
         "median": median(samples),
         "stddev": stdev(samples) if len(samples) > 1 else 0.0,
         "rounds": float(len(samples)),
+        "iterations": float(number),
     }
     return TimingResult(label=label or _fn_name(fn), stats=stats)
 
diff --git a/benchmarks/memory.py b/benchmarks/memory.py
index 6ce8db49..b3bf4cc6 100644
--- a/benchmarks/memory.py
+++ b/benchmarks/memory.py
@@ -34,6 +34,8 @@
 from pathlib import Path
 from typing import TYPE_CHECKING
 
+from benchmarks.snapshot import write_memory_snapshot
+
 if TYPE_CHECKING:
     from benchmarks.registry import ModelSpec
 
@@ -329,8 +331,7 @@ def save(
         sys.exit(1)
 
     RESULTS_DIR.mkdir(parents=True, exist_ok=True)
-    out_path = RESULTS_DIR / f"{label}.json"
-    out_path.write_text(json.dumps({"label": label, "peak_mib": all_results}, indent=2))
+    out_path = write_memory_snapshot(RESULTS_DIR / f"{label}.json", label, all_results)
     print(f"\nSaved {len(all_results)} measurements to {out_path}", file=sys.stderr)
     return out_path
 
diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
index 1b52e80f..9a107de2 100644
--- a/benchmarks/plotting.py
+++ b/benchmarks/plotting.py
@@ -21,106 +21,19 @@
 
 from __future__ import annotations
 
-import json
-import re
 from collections.abc import Callable
 from pathlib import Path
 from typing import TYPE_CHECKING, Literal
 
+from benchmarks.snapshot import Metric, load_long_df
+
 if TYPE_CHECKING:
-    import pandas as pd
     from plotly.graph_objects import Figure
 
 PlotView = Literal["compare", "scatter", "sweep", "scaling"]
-Metric = Literal["min", "median", "mean", "max"]
 SortMode = Literal["absolute", "relative"]
 FacetBy = Literal["phase", "model"]
 
-_SIZE_RE = re.compile(r"(.*)\[([^\[\]]+?)-n=(\d+)\]")
-
-
-def _load_snapshot(
-    path: Path, metric: Metric = "min"
-) -> tuple[str, dict[str, float], str]:
-    """
-    Return ``(label, {fullname: value}, unit)`` for one snapshot.
-
-    Auto-detects the JSON shape:
-
-    - pytest-benchmark timing (``{"benchmarks": [{"stats": {...}}]}``) →
-      ``value`` is ``stats[metric]`` in **seconds**.
-    - memory.py output (``{"peak_mib": {test_id: float}}``) → ``value`` is
-      the peak in **MiB**; ``metric`` is ignored.
-    """
-    data = json.loads(path.read_text())
-    if "peak_mib" in data:
-        return path.stem, dict(data["peak_mib"]), "MiB"
-    values = {bm["fullname"]: bm["stats"][metric] for bm in data["benchmarks"]}
-    return path.stem, values, "s"
-
-
-def _check_same_unit(snapshots: list[tuple[str, dict[str, float], str]]) -> str:
-    """Validate that every snapshot has the same unit, return it."""
-    units = {u for _, _, u in snapshots}
-    if len(units) > 1:
-        raise ValueError(
-            f"snapshots mix units {units}; can't compare timing and memory"
-        )
-    return next(iter(units))
-
-
-def _parse_test_id(test_id: str) -> tuple[str, str, int | None]:
-    """
-    Return ``(phase, model, size)`` for a pytest test id.
-
-    Falls back to ``("other", "other", None)`` for ids that don't match
-    the ``benchmarks/test_<phase>.py::test_<phase>[<model>-n=<size>]``
-    parametrize shape (e.g. ``test_pypsa_carbon_management``).
-    """
-    m = _SIZE_RE.match(test_id)
-    if m:
-        phase = m.group(1).split("::")[-1]
-        return phase, m.group(2), int(m.group(3))
-    return "other", "other", None
-
-
-def load_long_df(
-    snapshots: list[Path], metric: Metric = "min"
-) -> tuple[pd.DataFrame, str]:
-    """
-    Return ``(df, unit)`` — one row per ``(snapshot, test_id)`` pair.
-
-    Columns: ``snapshot``, ``test_id``, ``phase``, ``model``, ``size``
-    (``Int64``-nullable for the "other" bucket), ``value``. ``unit`` is
-    the shared unit string (``"s"`` for timing, ``"MiB"`` for memory)
-    — every loaded snapshot must agree.
-
-    Every plot view downstream pivots or filters this single frame so
-    test-id parsing, unit checking, and the "x snapshots, y tests"
-    matrix logic all live in one place.
-    """
-    import pandas as pd
-
-    raw = [_load_snapshot(p, metric) for p in snapshots]
-    unit = _check_same_unit(raw)
-    rows = []
-    for label, vals, _ in raw:
-        for test_id, value in vals.items():
-            phase, model, size = _parse_test_id(test_id)
-            rows.append(
-                {
-                    "snapshot": label,
-                    "test_id": test_id,
-                    "phase": phase,
-                    "model": model,
-                    "size": size,
-                    "value": value,
-                }
-            )
-    df = pd.DataFrame(rows)
-    df["size"] = df["size"].astype("Int64")
-    return df, unit
-
 
 def _axis_kwargs(unit: str) -> dict:
     """Return ``update_xaxes`` kwargs for a given unit."""
diff --git a/benchmarks/snapshot.py b/benchmarks/snapshot.py
new file mode 100644
index 00000000..59dddba6
--- /dev/null
+++ b/benchmarks/snapshot.py
@@ -0,0 +1,172 @@
+"""
+The benchmark snapshot contract — one owner for the on-disk JSON shapes,
+the test-id grammar, and the long-DataFrame loader.
+
+Dependency-free within the package (stdlib plus a lazily-imported
+pandas), so every writer (pytest-benchmark via file, :func:`memory.save`,
+:mod:`benchmarks.bench`) and every reader (:mod:`benchmarks.plotting`,
+:func:`memory.compare`) can sit on it without import cycles.
+
+Two snapshot shapes, auto-detected on load:
+
+- **timing** — ``{"benchmarks": [{"fullname": <id>, "stats": {"min":…,
+  "median":…, "mean":…, "max":…}}]}`` → value in **seconds** (the shape
+  pytest-benchmark writes).
+- **memory** — ``{"label": <str>, "peak_mib": {<id>: <float>}}`` → value
+  in **MiB**.
+
+Test ids follow ``…[<model>-n=<size>]``; :func:`parse_test_id` splits one
+into ``(phase, model, size)`` and :func:`synth_test_id` builds one.
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from pathlib import Path
+from typing import TYPE_CHECKING, Literal
+
+if TYPE_CHECKING:
+    import pandas as pd
+
+Metric = Literal["min", "median", "mean", "max"]
+
+_SIZE_RE = re.compile(r"(.*)\[([^\[\]]+?)-n=(\d+)\]")
+
+
+# --- test-id grammar -------------------------------------------------------
+
+
+def parse_test_id(test_id: str) -> tuple[str, str, int | None]:
+    """
+    Return ``(phase, model, size)`` for a pytest test id.
+
+    Falls back to ``("other", "other", None)`` for ids that don't match
+    the ``benchmarks/test_<phase>.py::test_<phase>[<model>-n=<size>]``
+    parametrize shape (e.g. ``test_pypsa_carbon_management``).
+    """
+    m = _SIZE_RE.match(test_id)
+    if m:
+        phase = m.group(1).split("::")[-1]
+        return phase, m.group(2), int(m.group(3))
+    return "other", "other", None
+
+
+def synth_test_id(
+    label: str, *, model: str | None, size: int | None, phase: str | None
+) -> str:
+    """
+    Build a snapshot test id from optional metadata.
+
+    With all of ``model``/``size``/``phase`` supplied, synthesize
+    ``bench::{phase}[{model}-n={size}]`` — this round-trips through
+    :func:`parse_test_id` into the three columns (so ``plot --view
+    scaling`` works across several sizes). With none supplied, fall back
+    to ``label`` verbatim (lands in the ``"other"`` bucket — still fine
+    for ``compare``). A partial spec is ambiguous and rejected.
+    """
+    given = (model is not None, size is not None, phase is not None)
+    if all(given):
+        return f"bench::{phase}[{model}-n={size}]"
+    if any(given):
+        raise ValueError(
+            "model, size, and phase must be given together (or all omitted)"
+        )
+    return label
+
+
+# --- writers ---------------------------------------------------------------
+
+
+def write_timing_snapshot(
+    path: str | Path, entries: list[tuple[str, dict[str, float]]]
+) -> Path:
+    """Write the pytest-benchmark timing shape (seconds) from ``(id, stats)``."""
+    data = {
+        "benchmarks": [
+            {"fullname": fullname, "stats": dict(stats)} for fullname, stats in entries
+        ]
+    }
+    out = Path(path)
+    out.write_text(json.dumps(data, indent=2))
+    return out
+
+
+def write_memory_snapshot(
+    path: str | Path, label: str, peaks: dict[str, float]
+) -> Path:
+    """Write the memory shape (``{id: peak_mib}``)."""
+    out = Path(path)
+    out.write_text(json.dumps({"label": label, "peak_mib": dict(peaks)}, indent=2))
+    return out
+
+
+# --- readers ---------------------------------------------------------------
+
+
+def load_snapshot(
+    path: Path, metric: Metric = "min"
+) -> tuple[str, dict[str, float], str]:
+    """
+    Return ``(label, {fullname: value}, unit)`` for one snapshot.
+
+    Auto-detects the JSON shape:
+
+    - timing (``{"benchmarks": [{"stats": {...}}]}``) → ``value`` is
+      ``stats[metric]`` in **seconds**.
+    - memory (``{"peak_mib": {id: float}}``) → ``value`` is the peak in
+      **MiB**; ``metric`` is ignored.
+    """
+    data = json.loads(path.read_text())
+    if "peak_mib" in data:
+        return path.stem, dict(data["peak_mib"]), "MiB"
+    values = {bm["fullname"]: bm["stats"][metric] for bm in data["benchmarks"]}
+    return path.stem, values, "s"
+
+
+def _check_same_unit(snapshots: list[tuple[str, dict[str, float], str]]) -> str:
+    """Validate that every snapshot has the same unit, return it."""
+    units = {u for _, _, u in snapshots}
+    if len(units) > 1:
+        raise ValueError(
+            f"snapshots mix units {units}; can't compare timing and memory"
+        )
+    return next(iter(units))
+
+
+def load_long_df(
+    snapshots: list[Path], metric: Metric = "min"
+) -> tuple[pd.DataFrame, str]:
+    """
+    Return ``(df, unit)`` — one row per ``(snapshot, test_id)`` pair.
+
+    Columns: ``snapshot``, ``test_id``, ``phase``, ``model``, ``size``
+    (``Int64``-nullable for the "other" bucket), ``value``. ``unit`` is
+    the shared unit string (``"s"`` for timing, ``"MiB"`` for memory)
+    — every loaded snapshot must agree.
+
+    Every plot view downstream pivots or filters this single frame so
+    test-id parsing, unit checking, and the "x snapshots, y tests"
+    matrix logic all live in one place.
+    """
+    import pandas as pd
+
+    raw = [load_snapshot(p, metric) for p in snapshots]
+    unit = _check_same_unit(raw)
+    rows = []
+    for label, vals, _ in raw:
+        for test_id, value in vals.items():
+            phase, model, size = parse_test_id(test_id)
+            rows.append(
+                {
+                    "snapshot": label,
+                    "test_id": test_id,
+                    "phase": phase,
+                    "model": model,
+                    "size": size,
+                    "value": value,
+                }
+            )
+    df = pd.DataFrame(rows)
+    df["size"] = df["size"].astype("Int64")
+    return df, unit
diff --git a/benchmarks/test_bench.py b/benchmarks/test_bench.py
index 8ac2b5f8..78c94739 100644
--- a/benchmarks/test_bench.py
+++ b/benchmarks/test_bench.py
@@ -2,7 +2,7 @@
 Tests for the ad-hoc ``bench`` helper.
 
 The contract under test is the *seam*: a ``bench`` result must round-trip
-into ``plotting.load_long_df`` exactly like a real snapshot, and its
+into ``snapshot.load_long_df`` exactly like a real snapshot, and its
 in-process ``to_df`` must line up column-for-column with the loaded frame.
 These are the only non-obvious behaviours — the timing math itself is not
 asserted beyond "finite and positive", since wall-clock values aren't
@@ -18,7 +18,7 @@
 import linopy
 from benchmarks import REGISTRY, bench
 from benchmarks.phases import touch_matrices
-from benchmarks.plotting import load_long_df
+from benchmarks.snapshot import load_long_df
 
 
 def _tiny() -> int:

From 2839145a0a109b7dbe2cc5faf6e9fc1eefd18eeb Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Fri, 29 May 2026 11:29:00 +0200
Subject: [PATCH 65/68] benchmarks: split sweep orchestration out of cli.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Move the ~340-line per-version provisioning block and both sweep bodies
into a new benchmarks/sweep.py (provision_venvs + run_sweep /
run_memory_sweep). cli.py's `sweep` and `memory sweep` commands become
thin shims that resolve their options (phase -> test file, smoke args)
and delegate. No behavior change — command set, flags, and help text are
identical; verified with a live one-version smoke sweep.

Per the plan's Item B but adapted in two ways:
- The shared-discovery helper goes into the existing snapshot.py as
  discover_snapshots() rather than a new snapshots.py — a sibling module
  one plural away from snapshot.py would be a nasty import footgun.
  _suggest_snapshots (typer-coupled presentation) stays in cli.py and
  calls it.
- run_memory_sweep moves too (not just run_sweep), so the provisioning
  generator stays private to sweep.py instead of being imported across a
  module boundary; all three memory subcommands are now thin.

cli.py: 1218 -> 882 lines (the remainder is command signatures + help).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 benchmarks/cli.py      | 401 +++-----------------------------------
 benchmarks/snapshot.py |  14 ++
 benchmarks/sweep.py    | 425 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 465 insertions(+), 375 deletions(-)
 create mode 100644 benchmarks/sweep.py

diff --git a/benchmarks/cli.py b/benchmarks/cli.py
index 623ee26f..9fde533f 100644
--- a/benchmarks/cli.py
+++ b/benchmarks/cli.py
@@ -12,13 +12,9 @@
 from __future__ import annotations
 
 import os
-import re
-import shutil
 import subprocess
 import sys
 import tempfile
-from collections.abc import Iterator
-from dataclasses import dataclass
 from pathlib import Path
 from typing import Annotated, Literal
 
@@ -32,6 +28,8 @@
 from benchmarks.memory import compare as memory_compare
 from benchmarks.memory import save as memory_save
 from benchmarks.plotting import FacetBy, Metric, PlotView, SortMode
+from benchmarks.snapshot import discover_snapshots
+from benchmarks.sweep import run_memory_sweep, run_sweep
 
 app = typer.Typer(
     help=(
@@ -52,23 +50,6 @@
 PhaseName = Literal["build", "matrices", "lp_write", "netcdf", "solver_handoff"]
 
 
-def _benchmarks_extra_pins() -> list[str]:
-    """
-    Return the pins from ``pyproject.toml``'s ``[benchmarks]`` extra.
-
-    Both ``sweep`` and ``memory sweep`` install these into each
-    per-version venv. Direct pins are kept in pyproject as the single
-    source of truth — bump them there and both sweeps pick up the
-    change. Transitive deps resolve fresh per venv; uv's deterministic
-    resolution gives identical results across versions within one sweep.
-    """
-    import tomllib
-
-    pyproject = Path(__file__).resolve().parents[1] / "pyproject.toml"
-    data = tomllib.loads(pyproject.read_text())
-    return list(data["project"]["optional-dependencies"]["benchmarks"])
-
-
 _PHASE_TEST_FILE: dict[PhaseName, str] = {
     "build": "benchmarks/test_build.py",
     "matrices": "benchmarks/test_matrices.py",
@@ -377,187 +358,6 @@ def notebook(
 # --- Sweep across linopy versions ------------------------------------------
 
 
-_PLAIN_VERSION_RE = re.compile(r"^\d+(\.\d+)*([a-z]+\d*)?$")
-
-
-def _linopy_install_spec(version: str) -> str:
-    """Turn ``0.4.0`` → ``linopy==0.4.0``, leave anything URL-y untouched."""
-    if _PLAIN_VERSION_RE.match(version):
-        return f"linopy=={version}"
-    return version
-
-
-def _venv_python(venv: Path) -> Path:
-    return (
-        venv / "Scripts" / "python.exe" if os.name == "nt" else venv / "bin" / "python"
-    )
-
-
-@dataclass(frozen=True)
-class _ProvisionedVenv:
-    """
-    One fresh per-version venv from :func:`_provision_venvs`.
-
-    On success, ``python``, ``env``, and ``import_dir`` are populated
-    and ``failed_at`` is ``None``. The caller MUST use ``import_dir``
-    as cwd for per-version subprocesses — see :func:`_provision_venvs`
-    for why. On failure, ``failed_at`` names the step that failed
-    (``"venv"``, ``"install"``, or ``"isolation"``); the caller skips
-    its per-version action and records the failure.
-    """
-
-    version: str
-    python: Path | None
-    env: dict[str, str] | None
-    import_dir: Path | None
-    failed_at: str | None
-
-
-def _provision_venvs(
-    versions: list[str], tmp_prefix: str, as_of: str | None = None
-) -> Iterator[_ProvisionedVenv]:
-    """
-    Yield one fresh per-version uv venv for each linopy version.
-
-    Used by both ``sweep`` and ``memory sweep`` so the venv plumbing
-    (uv venv → install ``[benchmarks]`` pins + the target linopy →
-    set up an isolated import root) lives in one place. The caller
-    supplies the tempdir prefix (so ``ps``/``lsof`` can distinguish
-    concurrent runs) and does whatever per-version action it needs.
-
-    **Isolation:** the repo root contains a ``linopy/`` package (the
-    one we're developing). Running the per-version pytest with the
-    repo root on ``sys.path`` — either via ``PYTHONPATH=repo`` or via
-    ``cwd=repo`` (Python prepends cwd as ``''``) — shadows the venv's
-    installed linopy with the dev tree. The whole sweep then measures
-    the dev linopy against itself instead of the requested version.
-    To avoid this, ``import_dir`` is a fresh tempdir per version that
-    holds a filtered *copy* of ``benchmarks/`` and nothing else — a
-    copy rather than a symlink so the sweep runs on Windows without
-    symlink privileges and so no per-version subprocess (nor its
-    ``__pycache__`` writes) ever touches the working tree. Running
-    subprocesses with ``cwd=import_dir`` and no ``PYTHONPATH`` makes
-    ``import benchmarks`` resolve to that copy while ``import linopy``
-    falls through to the venv's site-packages — i.e. the requested
-    version. The preflight below asserts that resolution actually held.
-
-    Each version's tempdir is cleaned up when the generator advances
-    (or exits). The caller can break the loop early — Python's
-    generator close protocol fires the ``with`` teardown.
-
-    **Cross-time reproducibility:** if ``as_of`` is a date string
-    (``YYYY-MM-DD`` or any ISO 8601 timestamp), passes
-    ``--exclude-newer`` to uv so the entire transitive resolution is
-    frozen to releases on or before that date. Pinning direct deps
-    alone (current default) keeps results reproducible *within* one
-    sweep call, but unpinned transitives can drift between sweep calls
-    days apart; ``as_of`` closes that gap.
-    """
-    if shutil.which("uv") is None:
-        typer.secho(
-            "uv not found on PATH — install via https://docs.astral.sh/uv/",
-            fg=typer.colors.RED,
-            err=True,
-        )
-        raise typer.Exit(code=2)
-
-    repo_root = Path.cwd()
-    for version in versions:
-        typer.secho(f"\n=== linopy {version} ===", fg=typer.colors.CYAN, bold=True)
-        with tempfile.TemporaryDirectory(prefix=tmp_prefix) as tmp:
-            venv = Path(tmp) / "venv"
-
-            r = subprocess.run(
-                ["uv", "venv", "--python", sys.executable, str(venv)],
-                check=False,
-            )
-            if r.returncode != 0:
-                typer.secho(
-                    f"venv creation failed: {version}",
-                    fg=typer.colors.RED,
-                    err=True,
-                )
-                yield _ProvisionedVenv(version, None, None, None, "venv")
-                continue
-
-            vpy = _venv_python(venv)
-            spec = _linopy_install_spec(version)
-
-            # Single install pass: pinned infra from pyproject + linopy.
-            # Direct pins in [benchmarks] are sufficient for sweep
-            # reproducibility — uv resolves the same input deterministically
-            # into each per-version venv.
-            install_args = [
-                "uv",
-                "pip",
-                "install",
-                "--python",
-                str(vpy),
-                *(["--exclude-newer", as_of] if as_of else []),
-                *_benchmarks_extra_pins(),
-                spec,
-            ]
-            r = subprocess.run(install_args, check=False)
-            if r.returncode != 0:
-                typer.secho(f"install failed: {version}", fg=typer.colors.RED, err=True)
-                yield _ProvisionedVenv(version, None, None, None, "install")
-                continue
-
-            # Build the isolated import root described in the docstring:
-            # a filtered copy of ``benchmarks/`` and nothing else. The
-            # heavy, sweep-irrelevant artifacts (the executed notebook,
-            # bytecode caches, macOS cruft) are skipped to keep the
-            # per-version copy cheap.
-            import_dir = Path(tmp) / "iso"
-            import_dir.mkdir()
-            shutil.copytree(
-                repo_root / "benchmarks",
-                import_dir / "benchmarks",
-                ignore=shutil.ignore_patterns("__pycache__", "*.ipynb", ".DS_Store"),
-            )
-
-            # No PYTHONPATH manipulation: the copied ``benchmarks`` under
-            # cwd=import_dir carries the harness without pulling the
-            # repo's ``linopy/`` into the import path. Bytecode the
-            # subprocess writes lands in this throwaway copy, never the
-            # working tree, so no PYTHONDONTWRITEBYTECODE is needed.
-            env = os.environ.copy()
-            env.pop("PYTHONPATH", None)
-
-            # Preflight: confirm the venv's linopy is what gets imported
-            # under cwd=import_dir. If a future change reintroduces the
-            # dev-linopy shadow bug, this fails loudly here rather than
-            # silently corrupting every snapshot in the sweep.
-            preflight = subprocess.run(
-                [
-                    str(vpy),
-                    "-c",
-                    (
-                        "import linopy; "
-                        f"assert {str(venv)!r} in linopy.__file__, "
-                        "f'isolation leak: linopy resolved to "
-                        "{linopy.__file__}, not the venv'"
-                    ),
-                ],
-                cwd=str(import_dir),
-                env=env,
-                capture_output=True,
-                text=True,
-                check=False,
-            )
-            if preflight.returncode != 0:
-                typer.secho(
-                    f"isolation preflight failed: {version}",
-                    fg=typer.colors.RED,
-                    err=True,
-                )
-                typer.echo(preflight.stderr.strip(), err=True)
-                yield _ProvisionedVenv(version, None, None, None, "isolation")
-                continue
-
-            yield _ProvisionedVenv(version, vpy, env, import_dir, None)
-
-
 @app.command(
     context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
 )
@@ -663,119 +463,30 @@ def sweep(
     Wall-clock: roughly 1-2 minutes per version (venv + install +
     benchmarks). uv's wheel cache makes repeated runs much faster.
     """
-    if quick and long:
-        typer.secho(
-            "--quick and --long are mutually exclusive",
-            fg=typer.colors.RED,
-            err=True,
-        )
-        raise typer.Exit(code=2)
-
-    if smoke and (long or rounds is not None):
-        typer.secho(
-            "--smoke can't be combined with --long or --rounds "
-            "(no timings are recorded in smoke mode).",
-            fg=typer.colors.RED,
-            err=True,
-        )
-        raise typer.Exit(code=2)
-
-    if not smoke:
-        output_dir.mkdir(parents=True, exist_ok=True)
-
-    failed: list[str] = []
-    for prov in _provision_venvs(versions, "linopy-bench-", as_of=as_of):
-        if prov.failed_at:
-            failed.append(prov.version)
-            continue
-
-        if smoke:
-            # Smoke mode: reuse the same pytest args as the top-level
-            # ``smoke`` command. No JSON snapshot, return code is the
-            # signal.
-            pytest_cmd = [str(prov.python), "-m", "pytest", *_SMOKE_PYTEST_ARGS]
-            k_parts = [p for p in (model, filter_expr) if p]
-            if k_parts:
-                pytest_cmd.extend(["-k", " and ".join(k_parts)])
-            pytest_cmd.extend(ctx.args)
-
-            typer.secho(f"$ {' '.join(pytest_cmd)}", fg=typer.colors.BRIGHT_BLACK)
-            r = subprocess.run(
-                pytest_cmd, env=prov.env, cwd=str(prov.import_dir), check=False
-            )
-            if r.returncode != 0:
-                typer.secho(
-                    f"smoke failed: {prov.version}", fg=typer.colors.RED, err=True
-                )
-                failed.append(prov.version)
-            else:
-                typer.secho(f"smoke ok: {prov.version}", fg=typer.colors.GREEN)
-            continue
-
-        snapshot = (output_dir / f"linopy-{prov.version}.json").resolve()
-        test_target = _PHASE_TEST_FILE[phase] if phase is not None else "benchmarks/"
-        pytest_cmd = [
-            str(prov.python),
-            "-m",
-            "pytest",
-            test_target,
-            "--benchmark-only",
-            "--benchmark-json",
-            str(snapshot),
-        ]
-        if quick:
-            pytest_cmd.append("--quick")
-        elif long:
-            pytest_cmd.append("--long")
-        if rounds is not None:
-            pytest_cmd.extend(
-                [f"--benchmark-min-rounds={rounds}", "--benchmark-max-time=0"]
-            )
-
-        k_parts = [p for p in (model, filter_expr) if p]
-        if k_parts:
-            pytest_cmd.extend(["-k", " and ".join(k_parts)])
-
-        pytest_cmd.extend(ctx.args)
-
-        typer.secho(f"$ {' '.join(pytest_cmd)}", fg=typer.colors.BRIGHT_BLACK)
-        subprocess.run(pytest_cmd, env=prov.env, cwd=str(prov.import_dir), check=False)
-
-        if snapshot.exists():
-            typer.secho(f"saved {snapshot}", fg=typer.colors.GREEN)
-        else:
-            typer.secho(
-                f"no snapshot produced for {prov.version}",
-                fg=typer.colors.RED,
-                err=True,
-            )
-            failed.append(prov.version)
-
-    if failed:
-        typer.secho(f"\nFailed versions: {failed}", fg=typer.colors.RED, err=True)
-        raise typer.Exit(code=1)
+    test_target = _PHASE_TEST_FILE[phase] if phase is not None else "benchmarks/"
+    run_sweep(
+        versions,
+        output_dir=output_dir,
+        test_target=test_target,
+        smoke_args=_SMOKE_PYTEST_ARGS,
+        long=long,
+        quick=quick,
+        rounds=rounds,
+        model=model,
+        filter_expr=filter_expr,
+        smoke=smoke,
+        as_of=as_of,
+        extra_args=ctx.args,
+    )
 
 
 # --- Compare timing snapshots ---------------------------------------------
 
 
-def _discover_snapshots() -> list[Path]:
-    """
-    Return JSON snapshot files under the canonical .benchmarks/ tree.
-
-    Paths are relative to cwd so they're easier to copy-paste back into
-    the CLI than the absolute form would be.
-    """
-    root = Path(".benchmarks")
-    if not root.exists():
-        return []
-    return sorted(root.rglob("*.json"))
-
-
 def _suggest_snapshots(reason: str) -> None:
     """Print an error + a hint listing whatever snapshots we can find."""
     typer.secho(reason, fg=typer.colors.RED, err=True)
-    found = _discover_snapshots()
+    found = discover_snapshots()
     if found:
         typer.echo("\nAvailable snapshots under .benchmarks/:", err=True)
         for p in found:
@@ -1142,74 +853,14 @@ def memory_sweep_cmd(
     ``--repeats 1`` (default) is usually plenty. Use ``--repeats 3``
     if you need <5%% regression detection.
     """
-    from benchmarks.memory import MEMORY_PHASES
-
-    if phase:
-        unknown = [p for p in phase if p not in MEMORY_PHASES]
-        if unknown:
-            typer.secho(
-                f"unknown phase(s): {unknown}; valid options: {list(MEMORY_PHASES)}",
-                fg=typer.colors.RED,
-                err=True,
-            )
-            raise typer.Exit(code=2)
-
-    output_dir.mkdir(parents=True, exist_ok=True)
-
-    failed: list[str] = []
-    for prov in _provision_venvs(versions, "linopy-mem-", as_of=as_of):
-        if prov.failed_at:
-            failed.append(prov.version)
-            continue
-        # ``failed_at is None`` guarantees these are populated (see
-        # ``_ProvisionedVenv``); narrow for the type checker.
-        assert prov.python is not None and prov.import_dir is not None
-
-        # ``memory save`` writes to ``.benchmarks/memory/<label>.json``
-        # under cwd; we run it with cwd pinned to repo root, then move
-        # the file if the user asked for a custom output dir.
-        label = f"linopy-{prov.version}"
-        mem_cmd = [
-            str(prov.python),
-            "-m",
-            "benchmarks",
-            "memory",
-            "save",
-            label,
-        ]
-        if quick:
-            mem_cmd.append("--quick")
-        for ph in phase or []:
-            mem_cmd.extend(["--phase", ph])
-        if repeats > 1:
-            mem_cmd.extend(["--repeats", str(repeats)])
-
-        typer.secho(f"$ {' '.join(mem_cmd)}", fg=typer.colors.BRIGHT_BLACK)
-        subprocess.run(mem_cmd, env=prov.env, cwd=str(prov.import_dir), check=False)
-
-        # ``memory save`` writes to ``.benchmarks/memory/<label>.json``
-        # relative to its cwd — here, the isolated import_dir. Move it
-        # under the user's chosen output_dir (resolves under repo_root
-        # by default).
-        default_path = prov.import_dir / ".benchmarks" / "memory" / f"{label}.json"
-        target = output_dir / f"{label}.json"
-        if default_path.exists() and default_path.resolve() != target.resolve():
-            target.parent.mkdir(parents=True, exist_ok=True)
-            default_path.replace(target)
-
-        if target.exists():
-            typer.secho(f"saved {target}", fg=typer.colors.GREEN)
-        else:
-            typer.secho(
-                f"no snapshot produced for {prov.version}",
-                fg=typer.colors.RED,
-                err=True,
-            )
-            failed.append(prov.version)
-
-    if failed:
-        typer.secho(f"\nFailed versions: {failed}", fg=typer.colors.RED, err=True)
-        raise typer.Exit(code=1)
+    run_memory_sweep(
+        versions,
+        output_dir=output_dir,
+        quick=quick,
+        phases=phase,
+        repeats=repeats,
+        as_of=as_of,
+    )
 
 
 @memory_app.command("compare")
diff --git a/benchmarks/snapshot.py b/benchmarks/snapshot.py
index 59dddba6..ab4ff95a 100644
--- a/benchmarks/snapshot.py
+++ b/benchmarks/snapshot.py
@@ -124,6 +124,20 @@ def load_snapshot(
     return path.stem, values, "s"
 
 
+def discover_snapshots() -> list[Path]:
+    """
+    Return JSON snapshot files under the canonical ``.benchmarks/`` tree.
+
+    Paths are relative to cwd so they're easier to copy-paste back into
+    the CLI than the absolute form would be. Used by ``compare`` / ``plot``
+    to suggest available snapshots when the user passes none.
+    """
+    root = Path(".benchmarks")
+    if not root.exists():
+        return []
+    return sorted(root.rglob("*.json"))
+
+
 def _check_same_unit(snapshots: list[tuple[str, dict[str, float], str]]) -> str:
     """Validate that every snapshot has the same unit, return it."""
     units = {u for _, _, u in snapshots}
diff --git a/benchmarks/sweep.py b/benchmarks/sweep.py
new file mode 100644
index 00000000..821c7cbc
--- /dev/null
+++ b/benchmarks/sweep.py
@@ -0,0 +1,425 @@
+"""
+Cross-version sweep orchestration — build a fresh per-version uv venv,
+install the pinned benchmark infra plus a target ``linopy``, and run the
+suite (timing) or ``memory save`` (peak RSS) inside it.
+
+The heavy provisioning loop and the two sweep bodies live here so
+``cli.py`` stays a thin layer of typer command shims. The CLI resolves
+its options (phase → test file, smoke args) and calls :func:`run_sweep`
+/ :func:`run_memory_sweep`; everything else — venv creation, isolation,
+the per-version subprocess — is internal to this module.
+"""
+
+from __future__ import annotations
+
+import os
+import re
+import shutil
+import subprocess
+import sys
+import tempfile
+from collections.abc import Iterator
+from dataclasses import dataclass
+from pathlib import Path
+
+import typer
+
+_PLAIN_VERSION_RE = re.compile(r"^\d+(\.\d+)*([a-z]+\d*)?$")
+
+
+def _benchmarks_extra_pins() -> list[str]:
+    """
+    Return the pins from ``pyproject.toml``'s ``[benchmarks]`` extra.
+
+    Both ``sweep`` and ``memory sweep`` install these into each
+    per-version venv. Direct pins are kept in pyproject as the single
+    source of truth — bump them there and both sweeps pick up the
+    change. Transitive deps resolve fresh per venv; uv's deterministic
+    resolution gives identical results across versions within one sweep.
+    """
+    import tomllib
+
+    pyproject = Path(__file__).resolve().parents[1] / "pyproject.toml"
+    data = tomllib.loads(pyproject.read_text())
+    return list(data["project"]["optional-dependencies"]["benchmarks"])
+
+
+def _linopy_install_spec(version: str) -> str:
+    """Turn ``0.4.0`` → ``linopy==0.4.0``, leave anything URL-y untouched."""
+    if _PLAIN_VERSION_RE.match(version):
+        return f"linopy=={version}"
+    return version
+
+
+def _venv_python(venv: Path) -> Path:
+    return (
+        venv / "Scripts" / "python.exe" if os.name == "nt" else venv / "bin" / "python"
+    )
+
+
+@dataclass(frozen=True)
+class _ProvisionedVenv:
+    """
+    One fresh per-version venv from :func:`_provision_venvs`.
+
+    On success, ``python``, ``env``, and ``import_dir`` are populated
+    and ``failed_at`` is ``None``. The caller MUST use ``import_dir``
+    as cwd for per-version subprocesses — see :func:`_provision_venvs`
+    for why. On failure, ``failed_at`` names the step that failed
+    (``"venv"``, ``"install"``, or ``"isolation"``); the caller skips
+    its per-version action and records the failure.
+    """
+
+    version: str
+    python: Path | None
+    env: dict[str, str] | None
+    import_dir: Path | None
+    failed_at: str | None
+
+
+def _provision_venvs(
+    versions: list[str], tmp_prefix: str, as_of: str | None = None
+) -> Iterator[_ProvisionedVenv]:
+    """
+    Yield one fresh per-version uv venv for each linopy version.
+
+    Used by both ``sweep`` and ``memory sweep`` so the venv plumbing
+    (uv venv → install ``[benchmarks]`` pins + the target linopy →
+    set up an isolated import root) lives in one place. The caller
+    supplies the tempdir prefix (so ``ps``/``lsof`` can distinguish
+    concurrent runs) and does whatever per-version action it needs.
+
+    **Isolation:** the repo root contains a ``linopy/`` package (the
+    one we're developing). Running the per-version pytest with the
+    repo root on ``sys.path`` — either via ``PYTHONPATH=repo`` or via
+    ``cwd=repo`` (Python prepends cwd as ``''``) — shadows the venv's
+    installed linopy with the dev tree. The whole sweep then measures
+    the dev linopy against itself instead of the requested version.
+    To avoid this, ``import_dir`` is a fresh tempdir per version that
+    holds a filtered *copy* of ``benchmarks/`` and nothing else — a
+    copy rather than a symlink so the sweep runs on Windows without
+    symlink privileges and so no per-version subprocess (nor its
+    ``__pycache__`` writes) ever touches the working tree. Running
+    subprocesses with ``cwd=import_dir`` and no ``PYTHONPATH`` makes
+    ``import benchmarks`` resolve to that copy while ``import linopy``
+    falls through to the venv's site-packages — i.e. the requested
+    version. The preflight below asserts that resolution actually held.
+
+    Each version's tempdir is cleaned up when the generator advances
+    (or exits). The caller can break the loop early — Python's
+    generator close protocol fires the ``with`` teardown.
+
+    **Cross-time reproducibility:** if ``as_of`` is a date string
+    (``YYYY-MM-DD`` or any ISO 8601 timestamp), it is passed to uv as
+    ``--exclude-newer`` so the entire transitive resolution is
+    frozen to releases on or before that date. Pinning direct deps
+    alone (current default) keeps results reproducible *within* one
+    sweep call, but unpinned transitives can drift between sweep calls
+    days apart; ``as_of`` closes that gap.
+    """
+    if shutil.which("uv") is None:
+        typer.secho(
+            "uv not found on PATH — install via https://docs.astral.sh/uv/",
+            fg=typer.colors.RED,
+            err=True,
+        )
+        raise typer.Exit(code=2)
+
+    repo_root = Path.cwd()
+    for version in versions:
+        typer.secho(f"\n=== linopy {version} ===", fg=typer.colors.CYAN, bold=True)
+        with tempfile.TemporaryDirectory(prefix=tmp_prefix) as tmp:
+            venv = Path(tmp) / "venv"
+
+            r = subprocess.run(
+                ["uv", "venv", "--python", sys.executable, str(venv)],
+                check=False,
+            )
+            if r.returncode != 0:
+                typer.secho(
+                    f"venv creation failed: {version}",
+                    fg=typer.colors.RED,
+                    err=True,
+                )
+                yield _ProvisionedVenv(version, None, None, None, "venv")
+                continue
+
+            vpy = _venv_python(venv)
+            spec = _linopy_install_spec(version)
+
+            # Single install pass: pinned infra from pyproject + linopy.
+            # Direct pins in [benchmarks] are sufficient for sweep
+            # reproducibility — uv resolves the same input deterministically
+            # into each per-version venv.
+            install_args = [
+                "uv",
+                "pip",
+                "install",
+                "--python",
+                str(vpy),
+                *(["--exclude-newer", as_of] if as_of else []),
+                *_benchmarks_extra_pins(),
+                spec,
+            ]
+            r = subprocess.run(install_args, check=False)
+            if r.returncode != 0:
+                typer.secho(f"install failed: {version}", fg=typer.colors.RED, err=True)
+                yield _ProvisionedVenv(version, None, None, None, "install")
+                continue
+
+            # Build the isolated import root described in the docstring:
+            # a filtered copy of ``benchmarks/`` and nothing else. The
+            # heavy, sweep-irrelevant artifacts (the executed notebook,
+            # bytecode caches, macOS cruft) are skipped to keep the
+            # per-version copy cheap.
+            import_dir = Path(tmp) / "iso"
+            import_dir.mkdir()
+            shutil.copytree(
+                repo_root / "benchmarks",
+                import_dir / "benchmarks",
+                ignore=shutil.ignore_patterns("__pycache__", "*.ipynb", ".DS_Store"),
+            )
+
+            # No PYTHONPATH manipulation: the copied ``benchmarks`` under
+            # cwd=import_dir carries the harness without pulling the
+            # repo's ``linopy/`` into the import path. Bytecode the
+            # subprocess writes lands in this throwaway copy, never the
+            # working tree, so no PYTHONDONTWRITEBYTECODE is needed.
+            env = os.environ.copy()
+            env.pop("PYTHONPATH", None)
+
+            # Preflight: confirm the venv's linopy is what gets imported
+            # under cwd=import_dir. If a future change reintroduces the
+            # dev-linopy shadow bug, this fails loudly here rather than
+            # silently corrupting every snapshot in the sweep.
+            preflight = subprocess.run(
+                [
+                    str(vpy),
+                    "-c",
+                    (
+                        "import linopy; "
+                        f"assert {str(venv)!r} in linopy.__file__, "
+                        "f'isolation leak: linopy resolved to "
+                        "{linopy.__file__}, not the venv'"
+                    ),
+                ],
+                cwd=str(import_dir),
+                env=env,
+                capture_output=True,
+                text=True,
+                check=False,
+            )
+            if preflight.returncode != 0:
+                typer.secho(
+                    f"isolation preflight failed: {version}",
+                    fg=typer.colors.RED,
+                    err=True,
+                )
+                typer.echo(preflight.stderr.strip(), err=True)
+                yield _ProvisionedVenv(version, None, None, None, "isolation")
+                continue
+
+            yield _ProvisionedVenv(version, vpy, env, import_dir, None)
+
+
+def run_sweep(
+    versions: list[str],
+    *,
+    output_dir: Path,
+    test_target: str,
+    smoke_args: list[str],
+    long: bool = False,
+    quick: bool = False,
+    rounds: int | None = None,
+    model: str | None = None,
+    filter_expr: str | None = None,
+    smoke: bool = False,
+    as_of: str | None = None,
+    extra_args: list[str] | None = None,
+) -> None:
+    """
+    Timing sweep: run the benchmark suite in each per-version venv.
+
+    ``test_target`` is the pytest target the caller resolved from
+    ``--phase`` (or ``benchmarks/``); ``smoke_args`` is the shared smoke
+    invocation; ``extra_args`` are trailing args forwarded to pytest. The
+    pytest-benchmark JSON snapshot lands in
+    ``<output_dir>/linopy-<version>.json``.
+    """
+    extra_args = extra_args or []
+
+    if quick and long:
+        typer.secho(
+            "--quick and --long are mutually exclusive",
+            fg=typer.colors.RED,
+            err=True,
+        )
+        raise typer.Exit(code=2)
+
+    if smoke and (long or rounds is not None):
+        typer.secho(
+            "--smoke can't be combined with --long or --rounds "
+            "(no timings are recorded in smoke mode).",
+            fg=typer.colors.RED,
+            err=True,
+        )
+        raise typer.Exit(code=2)
+
+    if not smoke:
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+    failed: list[str] = []
+    for prov in _provision_venvs(versions, "linopy-bench-", as_of=as_of):
+        if prov.failed_at:
+            failed.append(prov.version)
+            continue
+
+        if smoke:
+            # Smoke mode: reuse the same pytest args as the top-level
+            # ``smoke`` command. No JSON snapshot, return code is the
+            # signal.
+            pytest_cmd = [str(prov.python), "-m", "pytest", *smoke_args]
+            k_parts = [p for p in (model, filter_expr) if p]
+            if k_parts:
+                pytest_cmd.extend(["-k", " and ".join(k_parts)])
+            pytest_cmd.extend(extra_args)
+
+            typer.secho(f"$ {' '.join(pytest_cmd)}", fg=typer.colors.BRIGHT_BLACK)
+            r = subprocess.run(
+                pytest_cmd, env=prov.env, cwd=str(prov.import_dir), check=False
+            )
+            if r.returncode != 0:
+                typer.secho(
+                    f"smoke failed: {prov.version}", fg=typer.colors.RED, err=True
+                )
+                failed.append(prov.version)
+            else:
+                typer.secho(f"smoke ok: {prov.version}", fg=typer.colors.GREEN)
+            continue
+
+        snapshot = (output_dir / f"linopy-{prov.version}.json").resolve()
+        pytest_cmd = [
+            str(prov.python),
+            "-m",
+            "pytest",
+            test_target,
+            "--benchmark-only",
+            "--benchmark-json",
+            str(snapshot),
+        ]
+        if quick:
+            pytest_cmd.append("--quick")
+        elif long:
+            pytest_cmd.append("--long")
+        if rounds is not None:
+            pytest_cmd.extend(
+                [f"--benchmark-min-rounds={rounds}", "--benchmark-max-time=0"]
+            )
+
+        k_parts = [p for p in (model, filter_expr) if p]
+        if k_parts:
+            pytest_cmd.extend(["-k", " and ".join(k_parts)])
+
+        pytest_cmd.extend(extra_args)
+
+        typer.secho(f"$ {' '.join(pytest_cmd)}", fg=typer.colors.BRIGHT_BLACK)
+        subprocess.run(pytest_cmd, env=prov.env, cwd=str(prov.import_dir), check=False)
+
+        if snapshot.exists():
+            typer.secho(f"saved {snapshot}", fg=typer.colors.GREEN)
+        else:
+            typer.secho(
+                f"no snapshot produced for {prov.version}",
+                fg=typer.colors.RED,
+                err=True,
+            )
+            failed.append(prov.version)
+
+    if failed:
+        typer.secho(f"\nFailed versions: {failed}", fg=typer.colors.RED, err=True)
+        raise typer.Exit(code=1)
+
+
+def run_memory_sweep(
+    versions: list[str],
+    *,
+    output_dir: Path,
+    quick: bool = False,
+    phases: list[str] | None = None,
+    repeats: int = 1,
+    as_of: str | None = None,
+) -> None:
+    """
+    Memory sweep: invoke ``memory save`` in each per-version venv.
+
+    Mirrors :func:`run_sweep` but tracks peak RSS. Each version's
+    snapshot lands at ``<output_dir>/linopy-<version>.json``.
+    """
+    from benchmarks.memory import MEMORY_PHASES
+
+    if phases:
+        unknown = [p for p in phases if p not in MEMORY_PHASES]
+        if unknown:
+            typer.secho(
+                f"unknown phase(s): {unknown}; valid options: {list(MEMORY_PHASES)}",
+                fg=typer.colors.RED,
+                err=True,
+            )
+            raise typer.Exit(code=2)
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    failed: list[str] = []
+    for prov in _provision_venvs(versions, "linopy-mem-", as_of=as_of):
+        if prov.failed_at:
+            failed.append(prov.version)
+            continue
+        # ``failed_at is None`` guarantees these are populated (see
+        # ``_ProvisionedVenv``); narrow for the type checker.
+        assert prov.python is not None and prov.import_dir is not None
+
+        # ``memory save`` writes to ``.benchmarks/memory/<label>.json``
+        # under cwd; we run it with cwd pinned to repo root, then move
+        # the file if the user asked for a custom output dir.
+        label = f"linopy-{prov.version}"
+        mem_cmd = [
+            str(prov.python),
+            "-m",
+            "benchmarks",
+            "memory",
+            "save",
+            label,
+        ]
+        if quick:
+            mem_cmd.append("--quick")
+        for ph in phases or []:
+            mem_cmd.extend(["--phase", ph])
+        if repeats > 1:
+            mem_cmd.extend(["--repeats", str(repeats)])
+
+        typer.secho(f"$ {' '.join(mem_cmd)}", fg=typer.colors.BRIGHT_BLACK)
+        subprocess.run(mem_cmd, env=prov.env, cwd=str(prov.import_dir), check=False)
+
+        # ``memory save`` writes to ``.benchmarks/memory/<label>.json``
+        # relative to its cwd — here, the isolated import_dir. Move it
+        # under the user's chosen output_dir (resolves under repo_root
+        # by default).
+        default_path = prov.import_dir / ".benchmarks" / "memory" / f"{label}.json"
+        target = output_dir / f"{label}.json"
+        if default_path.exists() and default_path.resolve() != target.resolve():
+            target.parent.mkdir(parents=True, exist_ok=True)
+            default_path.replace(target)
+
+        if target.exists():
+            typer.secho(f"saved {target}", fg=typer.colors.GREEN)
+        else:
+            typer.secho(
+                f"no snapshot produced for {prov.version}",
+                fg=typer.colors.RED,
+                err=True,
+            )
+            failed.append(prov.version)
+
+    if failed:
+        typer.secho(f"\nFailed versions: {failed}", fg=typer.colors.RED, err=True)
+        raise typer.Exit(code=1)

From 4502fed475082cd3da02c9dd5ecc5a40f4d0c96e Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Fri, 29 May 2026 11:37:32 +0200
Subject: [PATCH 66/68] benchmarks: drop the "Other CLI surfaces" table from
 the walkthrough

It duplicated the intro's pointer to `--help` and was the only
hand-maintained, unverified block in an otherwise all-executed
walkthrough. Discovery already routes through `python -m benchmarks
--help` (and `--help` on any subcommand), per the intro.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 benchmarks/walkthrough.md | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/benchmarks/walkthrough.md b/benchmarks/walkthrough.md
index 8821a1b5..0129bdd0 100644
--- a/benchmarks/walkthrough.md
+++ b/benchmarks/walkthrough.md
@@ -270,20 +270,6 @@ snapshot:
         snap, model="basic", size=100, phase="build"
     )
 
-## Other CLI surfaces
-
-| Command                            | Purpose                                                              |
-| ---------------------------------- | -------------------------------------------------------------------- |
-| `smoke`                            | CI smoke run — every model/phase at quickest size, no timings (~20s) |
-| `run --long`                       | Full sweep including heaviest sizes (knapsack 1M, basic 1600); slow  |
-| `sweep <v1> <v2> ...`              | Build fresh venv per linopy version and run the suite in each        |
-| `memory sweep <v1> <v2> ...`       | Same shape as `sweep`, but tracks peak RSS per version               |
-| `plot --view sweep <s1> <s2> ...`  | Heatmap of ratios across 3+ snapshots                                |
-| `plot --view scaling <snap>`       | Log-log time vs `n` for size-parametrized tests, faceted by phase    |
-| `notebook`                         | Re-execute this walkthrough end-to-end (what CI runs)                |
-
-Each has its own `--help` with all flags.
-
 ## Extending the suite
 
 Add a new model:

From 99f4f5602607d9be91390e1c46770e25e92ea0cc Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Fri, 29 May 2026 11:44:47 +0200
Subject: [PATCH 67/68] benchmarks: show load_long_df from-file diff in the
 walkthrough
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add an "In Python — load straight from file" subsection to the diff
section: load the baseline/candidate snapshots with load_long_df, then
pivot to a one-column-per-snapshot DataFrame with a candidate/baseline
ratio. Demonstrates the programmatic path the CLI views sit on, for
custom analysis from file. Executes end-to-end under the CI notebook run.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 benchmarks/walkthrough.md | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/benchmarks/walkthrough.md b/benchmarks/walkthrough.md
index 0129bdd0..72d75267 100644
--- a/benchmarks/walkthrough.md
+++ b/benchmarks/walkthrough.md
@@ -171,6 +171,38 @@ to percent). Diverging colour around zero.
 HTML(compare_html.read_text())
 ```
 
+### In Python — load straight from file
+
+The CLI views above all sit on one function, `load_long_df`, which reads
+snapshot JSON files (timing *or* memory) into a tidy frame — `snapshot`,
+`test_id`, `phase`, `model`, `size`, `value` — plus the unit. Re-exported
+from the package so you can do your own analysis without pulling in
+plotly:
+
+```{code-cell} ipython3
+from benchmarks import load_long_df
+
+df, unit = load_long_df([baseline, candidate])
+print(f"unit: {unit}")
+df.head()
+```
+
+Pivot to one column per snapshot and the comparison is a couple of pandas
+lines — the same baseline-vs-candidate diff the `compare` view draws,
+here as a DataFrame you can sort, filter, or feed onward:
+
+```{code-cell} ipython3
+wide = df.pivot_table(
+    index=["phase", "model", "size"], columns="snapshot", values="value"
+)
+wide["ratio"] = wide["candidate"] / wide["baseline"]
+wide.sort_values("ratio", ascending=False)
+```
+
+(Two `--quick` runs of the same code, so the ratios are ~1 ± noise; on a
+real PR they'd move. The same frame feeds the plot views — pass the files
+to `python -m benchmarks plot` for the rendered version.)
+
 ## Memory snapshots
 
 `memory save <label>` runs benchmarks under `memray.Tracker` and

From 927750f297024d5f126bf04101c727d9137dadfa Mon Sep 17 00:00:00 2001
From: FBumann <117816358+FBumann@users.noreply.github.com>
Date: Fri, 29 May 2026 12:34:09 +0200
Subject: [PATCH 68/68] benchmarks: label sweep snapshots by ref/sha for
 git/file specs

sweep named snapshots linopy-<version>.json with the raw version arg
interpolated. Fine for plain releases, but a git/file spec
(git+...@<sha>, linopy @ file://...) put slashes in the filename and the
snapshot write failed. Add _snapshot_label: for a spec with an @-ref take
the part after the last @ (sha/tag/branch), then sanitise to a safe path
segment. So git+...@<sha> -> linopy-<sha>.json (clean and reproducible);
plain releases are unchanged. Applied to both sweep and memory sweep.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 benchmarks/sweep.py      | 22 ++++++++++++++++++++--
 benchmarks/test_sweep.py | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+), 2 deletions(-)
 create mode 100644 benchmarks/test_sweep.py

diff --git a/benchmarks/sweep.py b/benchmarks/sweep.py
index 821c7cbc..ba01191d 100644
--- a/benchmarks/sweep.py
+++ b/benchmarks/sweep.py
@@ -51,6 +51,22 @@ def _linopy_install_spec(version: str) -> str:
     return version
 
 
+def _snapshot_label(version: str) -> str:
+    """
+    Filesystem-safe label for a snapshot filename, derived from a spec.
+
+    Plain releases pass through (``0.6.1`` → ``0.6.1``). For a pip spec
+    with a ref — ``git+https://…/linopy.git@<sha>`` or ``linopy @ <url>``
+    — take the part after the last ``@`` (the sha / tag / branch) so a
+    pinned commit writes a clean ``linopy-<sha>.json`` instead of a
+    slash-laden, unwritable name. Whatever's chosen is then sanitised to
+    ``[0-9A-Za-z._-]``.
+    """
+    label = version.rsplit("@", 1)[-1] if "@" in version else version
+    label = re.sub(r"[^0-9A-Za-z._-]+", "-", label).strip("-._")
+    return label or "spec"
+
+
 def _venv_python(venv: Path) -> Path:
     return (
         venv / "Scripts" / "python.exe" if os.name == "nt" else venv / "bin" / "python"
@@ -297,7 +313,9 @@ def run_sweep(
                 typer.secho(f"smoke ok: {prov.version}", fg=typer.colors.GREEN)
             continue
 
-        snapshot = (output_dir / f"linopy-{prov.version}.json").resolve()
+        snapshot = (
+            output_dir / f"linopy-{_snapshot_label(prov.version)}.json"
+        ).resolve()
         pytest_cmd = [
             str(prov.python),
             "-m",
@@ -381,7 +399,7 @@ def run_memory_sweep(
         # ``memory save`` writes to ``.benchmarks/memory/<label>.json``
         # under cwd; we run it with cwd pinned to repo root, then move
         # the file if the user asked for a custom output dir.
-        label = f"linopy-{prov.version}"
+        label = f"linopy-{_snapshot_label(prov.version)}"
         mem_cmd = [
             str(prov.python),
             "-m",
diff --git a/benchmarks/test_sweep.py b/benchmarks/test_sweep.py
new file mode 100644
index 00000000..3531aebb
--- /dev/null
+++ b/benchmarks/test_sweep.py
@@ -0,0 +1,33 @@
+"""Unit tests for sweep helpers (no venvs spun up)."""
+
+from __future__ import annotations
+
+import pytest
+
+from benchmarks.sweep import _snapshot_label
+
+
+@pytest.mark.parametrize(
+    "spec,expected",
+    [
+        # plain releases pass through unchanged
+        ("0.6.1", "0.6.1"),
+        ("0.5.0a1", "0.5.0a1"),
+        # git spec pinned to a sha -> the sha (clean, reproducible filename)
+        ("git+https://github.com/PyPSA/linopy.git@2993b95", "2993b95"),
+        # git spec on a branch -> the branch name
+        ("git+https://github.com/PyPSA/linopy.git@main", "main"),
+        # PEP 508 local file url -> sanitised (no slashes survive)
+        ("linopy @ file:///home/me/linopy", "file-home-me-linopy"),
+    ],
+)
+def test_snapshot_label(spec: str, expected: str) -> None:
+    label = _snapshot_label(spec)
+    assert label == expected
+    # whatever the input, the label must be a safe single path segment.
+    assert "/" not in label and " " not in label and label
+
+
+def test_snapshot_label_never_empty() -> None:
+    # a spec that sanitises to nothing still yields a usable stub.
+    assert _snapshot_label("@@@") == "spec"