diff --git a/.github/dependabot.yml b/.github/dependabot.yml
index f8f779b5..e7750e98 100644
--- a/.github/dependabot.yml
+++ b/.github/dependabot.yml
@@ -12,3 +12,30 @@ updates:
     github-actions:
       patterns:
       - '*'
+
+# Pinned ``[benchmarks]`` extra in pyproject.toml. One PR per dep bump
+# → CodSpeed CI runs and attributes any perf delta to that specific
+# bump. Keeps the cross-version ``sweep`` baseline (lockfile-pinned)
+# stable while still surfacing upstream perf changes per-PR with
+# eyes-open review. Loose ``[project.dependencies]`` (numpy, scipy, ...)
+# have no version specifier so Dependabot leaves them alone — only the
+# ``==`` pins in ``[benchmarks]`` produce PRs.
+- package-ecosystem: pip
+  directory: /
+  schedule:
+    interval: monthly
+  open-pull-requests-limit: 5
+  groups:
+    # Measurement scaffolding + CLI/notebook tooling. Perf-irrelevant —
+    # they don't move CodSpeed signal, so batching into one PR cuts
+    # review noise. Perf-relevant deps (numpy, xarray, highspy, …) stay
+    # un-grouped so each gets its own attributed CodSpeed delta.
+    benchmark-tooling:
+      patterns:
+      - pytest
+      - pytest-benchmark
+      - pytest-memray
+      - pytest-codspeed
+      - nbconvert
+      - typer
+      - plotly
diff --git a/.github/workflows/benchmark-smoke.yml b/.github/workflows/benchmark-smoke.yml
new file mode 100644
index 00000000..59f6462f
--- /dev/null
+++ b/.github/workflows/benchmark-smoke.yml
@@ -0,0 +1,81 @@
+name: Benchmark smoke
+
+# Runs the internal benchmark suite under --quick --benchmark-disable so every
+# model spec is built and every phase fires at least once, but no timings are
+# recorded. The goal is "did refactor X break a model spec?" — not regression
+# tracking, which is done out-of-CI on dedicated hardware.
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ '*' ]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  smoke:
+    name: Benchmark smoke (quick)
+    runs-on: ubuntu-latest
+
+    steps:
+    - uses: actions/checkout@v6
+      with:
+        fetch-depth: 0  # setuptools_scm
+
+    - name: Set up Python 3.12
+      uses: actions/setup-python@v6
+      with:
+        python-version: "3.12"
+
+    - name: Install package and benchmark dependencies
+      run: |
+        python -m pip install uv
+        # [dev] for pytest + netcdf4; [benchmarks] for pytest-benchmark + pypsa.
+        uv pip install --system -e ".[dev,benchmarks]"
+
+    - name: Run benchmark smoke
+      run: |
+        python -m benchmarks smoke
+
+    - name: Execute walkthrough notebook
+      # Catches doc rot — walkthrough.md is the canonical CLI walkthrough
+      # for the suite and must stay runnable end-to-end.
+      run: |
+        python -m benchmarks notebook
+
+  codspeed:
+    name: CodSpeed (micro regression detection)
+    runs-on: ubuntu-latest
+    # CodSpeed's Valgrind-based instrumentation is ~10–20× slower than native,
+    # so we restrict to ``--quick`` (smallest size per spec) and skip PyPSA
+    # end-to-end. The signal we want here is "did this PR change the instruction
+    # count of the hot paths?"; cross-version wall-clock stays in ``sweep``.
+    steps:
+    - uses: actions/checkout@v6
+      with:
+        fetch-depth: 0  # setuptools_scm
+
+    - name: Set up Python 3.12
+      uses: actions/setup-python@v6
+      with:
+        python-version: "3.12"
+
+    - name: Install pinned benchmark environment
+      # Install from the pinned ``[benchmarks]`` extra (not the lockfile)
+      # so Dependabot can auto-detect pyproject.toml and propose bumps
+      # to top-level deps. Each bump → one attributed CodSpeed delta.
+      # The full transitive lockfile is reserved for cross-version
+      # ``sweep`` reproducibility, where machine variance matters more.
+      run: |
+        python -m pip install uv
+        uv pip install --system -e ".[dev,benchmarks]"
+
+    - name: Run benchmarks under CodSpeed
+      uses: CodSpeedHQ/action@v3
+      with:
+        token: ${{ secrets.CODSPEED_TOKEN }}
+        run: |
+          pytest benchmarks/ --quick --codspeed
diff --git a/.gitignore b/.gitignore
index 654b686d..45c1fb7b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -45,6 +45,12 @@ benchmark/scripts/__pycache__
 benchmark/scripts/benchmarks-pypsa-eur/__pycache__
 benchmark/scripts/leftovers/
 
+# Benchmarks (internal suite in benchmarks/) — the .md walkthrough is
+# canonical; ``python -m benchmarks notebook --build`` regenerates the
+# .ipynb sibling as a throwaway viewing/running artifact.
+benchmarks/walkthrough.ipynb
+benchmarks/.ipynb_checkpoints/
+
 # IDE
 .idea/
 
diff --git a/benchmarks/README.md b/benchmarks/README.md
index 22ac73ce..320bbcb3 100644
--- a/benchmarks/README.md
+++ b/benchmarks/README.md
@@ -1,94 +1,44 @@
 # Internal Performance Benchmarks
 
-Measures linopy's own performance (build time, LP write speed, memory usage) across problem sizes using [pytest-benchmark](https://pytest-benchmark.readthedocs.io/) and [pytest-memray](https://pytest-memray.readthedocs.io/). Use these to check whether a code change introduces a regression or improvement.
+End-to-end performance tracking for `linopy` — build → solver handoff
+→ netCDF (de)serialization → fixed PyPSA model. Solver algorithm
+runtime is out of scope.
 
-> **Note:** The `benchmark/` directory (singular) contains *external* benchmarks comparing linopy against other modeling frameworks. This directory (`benchmarks/`) is for *internal* performance tracking only.
+**The walkthrough is load-bearing.** Phase coverage, CLI introspection,
+the two-snapshot regression workflow with inline Plotly views, and
+how to extend the suite live in [`walkthrough.md`](walkthrough.md).
+This README only covers install and how to open the walkthrough.
 
-## Setup
+> `benchmark/` (singular) is the legacy external-framework suite.
+> `benchmarks/` (plural) is this internal suite.
 
-```bash
-pip install -e ".[benchmarks]"
-```
-
-## Running benchmarks
+## Install
 
 ```bash
-# Quick smoke test (small sizes only)
-pytest benchmarks/ --quick
-
-# Full timing benchmarks
-pytest benchmarks/test_build.py benchmarks/test_lp_write.py benchmarks/test_matrices.py
-
-# Run a specific model
-pytest benchmarks/test_build.py -k basic
-```
-
-## Comparing timing between branches
-
-```bash
-# Save baseline results on master
-git checkout master
-pytest benchmarks/test_build.py --benchmark-save=master
-
-# Switch to feature branch and compare
-git checkout my-feature
-pytest benchmarks/test_build.py --benchmark-save=my-feature --benchmark-compare=0001_master
-
-# Compare saved results without re-running
-pytest-benchmark compare 0001_master 0002_my-feature --columns=median,iqr
+uv sync --extra dev --extra benchmarks
+source .venv/bin/activate
 ```
 
-Results are stored in `.benchmarks/` (gitignored).
+`pypsa` is optional — `pypsa_scigrid` and
+`test_pypsa_carbon_management.py` skip gracefully without it. Install
+when you need them: `uv pip install pypsa`.
 
-## Memory benchmarks
+The `[benchmarks]` extra in `pyproject.toml` pins every direct dep that
+affects measurement (`numpy`, `scipy`, `xarray`, `pandas`, `polars`,
+`dask`, etc.). `sweep` installs these into each per-version venv, so
+"same deps, only linopy varies" comes for free without a separate
+lockfile — bump the pins in pyproject and the next sweep picks them up.
 
-`memory.py` runs each test in a separate process with pytest-memray to get accurate per-test peak memory (including C/numpy allocations). Results are saved as JSON and can be compared across branches.
-
-By default, only the build phase (`test_build.py`) is measured. Unlike timing benchmarks where `benchmark()` isolates the measured function, memray tracks all allocations within a test — including model construction in setup. This means LP write and matrix tests would report build + phase memory combined, making the phase-specific contribution impossible to isolate. Since model construction dominates memory usage, measuring build alone gives the most actionable numbers.
+## Open the walkthrough
 
 ```bash
-# Save baseline on master
-git checkout master
-python benchmarks/memory.py save master
-
-# Save feature branch
-git checkout my-feature
-python benchmarks/memory.py save my-feature
-
-# Compare
-python benchmarks/memory.py compare master my-feature
-
-# Quick mode (smaller sizes, faster)
-python benchmarks/memory.py save master --quick
-
-# Measure a specific phase (includes build overhead)
-python benchmarks/memory.py save master --test-path benchmarks/test_lp_write.py
+python -m benchmarks notebook --build       # (re)generate walkthrough.ipynb
+jupyter lab benchmarks/walkthrough.ipynb    # ...or PyCharm / VSCode
 ```
 
-Results are stored in `.benchmarks/memory/` (gitignored). Requires Linux or macOS (memray is not available on Windows).
-
-> **Note:** Small tests (~5 MiB) are near the import-overhead floor and may show noise of ~1 MiB between runs. Focus on larger tests for meaningful memory comparisons. Do not combine `--memray` with timing benchmarks — memray adds ~2x overhead that invalidates timing results.
-
-## Models
-
-| Model | Description | Sizes |
-|-------|-------------|-------|
-| `basic` | Dense N*N model, 2*N^2 vars/cons | 10 — 1600 |
-| `knapsack` | N binary variables, 1 constraint | 100 — 1M |
-| `expression_arithmetic` | Broadcasting, scaling, summation across dims | 10 — 1000 |
-| `sparse_network` | Ring network with mismatched bus/line coords | 10 — 1000 |
-| `pypsa_scigrid` | Real power system (requires `pypsa`) | 10 — 200 snapshots |
-
-## Phases
-
-| Phase | File | What it measures |
-|-------|------|------------------|
-| Build | `test_build.py` | Model construction (add_variables, add_constraints, add_objective) |
-| LP write | `test_lp_write.py` | Writing the model to an LP file |
-| Matrices | `test_matrices.py` | Generating sparse matrices (A, b, c, bounds) from the model |
-
-## Adding a new model
+The `.md` is the source of truth; the `.ipynb` is a disposable,
+gitignored build artifact. Edit the `.md`, re-run `--build`, re-open.
+Same workflow in any editor.
 
-1. Create `benchmarks/models/my_model.py` with a `build_my_model(n)` function and a `SIZES` list
-2. Add parametrized tests in the relevant `test_*.py` files
-3. Add a quick threshold in `conftest.py`
+CI executes the walkthrough end-to-end on every PR
+(`python -m benchmarks notebook`) so the examples can't silently rot.
diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
index 6bf202cc..2f476484 100644
--- a/benchmarks/__init__.py
+++ b/benchmarks/__init__.py
@@ -1 +1,114 @@
-"""Linopy benchmark suite — run with ``pytest benchmarks/`` (use ``--quick`` for smaller sizes)."""
+"""
+Linopy benchmark suite.
+
+Run with ``pytest benchmarks/`` (use ``--quick`` for smaller sizes).
+
+This package also exposes a **reusable model registry** for any test, profiling
+session, or example that wants ready-made linopy models of varying sizes and
+features. Each entry exposes a ``build(size) -> linopy.Model`` callable plus
+metadata::
+
+    from benchmarks import REGISTRY, QUADRATIC
+
+    # Look up by name
+    model = REGISTRY["basic"].build(100)
+
+    # Iterate / filter
+    for spec in REGISTRY.values():
+        m = spec.build(spec.sizes[0])
+        ...
+
+    from benchmarks import filter_by
+    qp_specs = filter_by(has_feature=QUADRATIC)
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    import pandas as pd
+
+    from benchmarks.snapshot import Metric
+
+# Importing the models package triggers each module's ``register(...)`` call.
+from benchmarks import bench, models  # noqa: F401, E402
+
+
+def load_long_df(
+    snapshots: list[Path], metric: Metric = "min"
+) -> tuple[pd.DataFrame, str]:
+    """
+    Read benchmark JSON snapshot files into one long-format DataFrame.
+
+    Delegates to :func:`benchmarks.snapshot.load_long_df`, imported at
+    call time, so analysis code can reach the loader from the package
+    root without going through the plotting module (which pulls in
+    plotly). The result is ``(df, unit)``: ``df`` holds one row per
+    ``(snapshot, test_id)`` with columns ``snapshot, test_id, phase,
+    model, size, value``; ``unit`` is ``"s"`` (timing) or ``"MiB"`` (memory).
+    """
+    import benchmarks.snapshot as _snapshot
+
+    return _snapshot.load_long_df(snapshots, metric)
+
+
+from benchmarks.registry import (  # noqa: F401, E402 — re-export
+    ALL_FEATURES,
+    ALL_PHASES,
+    BINARY,
+    BUILD,
+    CONTINUOUS,
+    DEFAULT_PHASES,
+    INTEGER,
+    LP_WRITE,
+    MASKED,
+    MATRICES,
+    NETCDF,
+    PIECEWISE,
+    QUADRATIC,
+    REGISTRY,
+    SOS,
+    TO_GUROBIPY,
+    TO_HIGHSPY,
+    TO_MOSEK,
+    TO_XPRESS,
+    ModelSpec,
+    filter_by,
+    get,
+    iter_params,
+    param_ids,
+    register,
+)
+
+__all__ = [
+    "ALL_FEATURES",
+    "ALL_PHASES",
+    "BINARY",
+    "BUILD",
+    "CONTINUOUS",
+    "DEFAULT_PHASES",
+    "INTEGER",
+    "LP_WRITE",
+    "MASKED",
+    "MATRICES",
+    "ModelSpec",
+    "NETCDF",
+    "PIECEWISE",
+    "QUADRATIC",
+    "REGISTRY",
+    "SOS",
+    "TO_GUROBIPY",
+    "TO_HIGHSPY",
+    "TO_MOSEK",
+    "TO_XPRESS",
+    "bench",
+    "filter_by",
+    "get",
+    "iter_params",
+    "load_long_df",
+    "param_ids",
+    "register",
+]
diff --git a/benchmarks/__main__.py b/benchmarks/__main__.py
new file mode 100644
index 00000000..34a28439
--- /dev/null
+++ b/benchmarks/__main__.py
@@ -0,0 +1,5 @@
+"""Allow ``python -m benchmarks <command>``."""
+
+from benchmarks.cli import app
+
+app()
diff --git a/benchmarks/bench.py b/benchmarks/bench.py
new file mode 100644
index 00000000..682523f2
--- /dev/null
+++ b/benchmarks/bench.py
@@ -0,0 +1,356 @@
+"""
+Ad-hoc benchmarking of arbitrary callables on the *current* linopy tree.
+
+Where the pytest suite measures the fixed registry grid and ``sweep``
+measures across installed linopy versions, ``bench`` is for the
+interactive middle: time or memory-profile any callable — a registry
+builder, a phase verb applied to a model you built by hand, or a one-off
+lambda — get a result object back, and either inspect it as a DataFrame
+or drop it into a snapshot the existing ``plot`` / ``compare`` machinery
+already understands::
+
+    from benchmarks import bench, REGISTRY
+
+    r = bench.time(REGISTRY["basic"].build, 100)
+    r                                  # rich repr in a notebook
+    r.to_snapshot("a.json", model="basic", size=100, phase="build")
+
+    bench.compare({"v1": f1, "v2": f2}).to_snapshot("cmp.json")
+
+This plugs into the *output* side of the pipeline (snapshot JSON read by
+``snapshot.load_long_df``), not into ``sweep``: a sweep runs pytest inside
+per-version venvs as subprocesses, so it can only measure importable
+registry models — an in-process callable can't cross that boundary. To
+sweep a custom model across versions, promote it to ``benchmarks/models/``.
+
+**Methodology.** Timing is built on :class:`timeit.Timer`: an
+``autorange`` calibration picks the inner iteration count (so timer
+resolution doesn't dominate fast callables), then the per-iteration time
+is sampled across rounds with the suite's min-of-N convention (the
+fastest sample approximates the no-noise floor). It is *not*
+pytest-benchmark's calibrated timer, so absolute numbers are not
+interchangeable with suite snapshots — compare ``bench`` to ``bench`` and
+suite to suite.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable, Sequence
+from dataclasses import dataclass, field
+from pathlib import Path
+from statistics import mean, median, stdev
+from timeit import Timer
+from typing import TYPE_CHECKING, Any, Literal
+
+from benchmarks.snapshot import (
+    parse_test_id,
+    synth_test_id,
+    write_memory_snapshot,
+    write_timing_snapshot,
+)
+
+if TYPE_CHECKING:
+    import pandas as pd
+
+__all__ = [
+    "MemoryResult",
+    "ResultSet",
+    "TimingResult",
+    "compare",
+    "memory",
+    "time",
+]
+
+# Floor / cap on the auto-tuned round count when ``rounds`` is unset.
+# The floor guarantees a meaningful min-of-N even for slow callables that
+# blow past ``min_time`` in one shot; the cap stops a microsecond callable
+# from spinning forever.
+_ROUND_FLOOR = 5
+_ROUND_CAP = 10_000
+
+
+def _fn_name(fn: Callable[..., object]) -> str:
+    """Use ``fn.__name__`` when set and non-empty, else ``repr(fn)`` (e.g. partials)."""
+    return getattr(fn, "__name__", "") or repr(fn)
+
+
+def _row(test_id: str, value: float) -> dict[str, object]:
+    """Build the ``load_long_df``-style row dict for one in-process result."""
+    phase, model, size = parse_test_id(test_id)
+    return dict(
+        snapshot=test_id,
+        test_id=test_id,
+        phase=phase,
+        model=model,
+        size=size,
+        value=value,
+    )
+
+
+def _frame(rows: list[dict[str, object]]) -> pd.DataFrame:
+    """Assemble ``rows`` into a frame with the column order of ``load_long_df``."""
+    import pandas as pd
+
+    columns = ["snapshot", "test_id", "phase", "model", "size", "value"]
+    table = pd.DataFrame(rows, columns=columns)
+    # ``Int64`` is pandas' nullable integer dtype, so ``size`` may hold <NA>.
+    table["size"] = table["size"].astype("Int64")
+    return table
+
+
+# --- Result types ----------------------------------------------------------
+
+
+@dataclass(frozen=True)
+class TimingResult:
+    """Timing of a single callable; ``stats["min"]`` is the headline figure."""
+
+    label: str
+    stats: dict[str, float]
+    unit: Literal["s"] = "s"
+
+    def to_snapshot(
+        self,
+        path: str | Path,
+        *,
+        model: str | None = None,
+        size: int | None = None,
+        phase: str | None = None,
+    ) -> Path:
+        """Persist this result to ``path`` as a pytest-benchmark-shaped snapshot."""
+        entry_id = synth_test_id(self.label, model=model, size=size, phase=phase)
+        return write_timing_snapshot(path, [(entry_id, {**self.stats})])
+
+    def to_df(self) -> pd.DataFrame:
+        """One-row ``load_long_df``-shaped frame whose ``value`` is min seconds."""
+        return _frame([_row(self.label, self.stats["min"])])
+
+    def __repr__(self) -> str:
+        head = f"TimingResult({self.label!r}, min={self.stats['min']:.4g}s, "
+        n_rounds = int(self.stats["rounds"])
+        n_inner = int(self.stats.get("iterations", 1))
+        return f"{head}rounds={n_rounds}x{n_inner})"
+
+    def _repr_html_(self) -> str:
+        # Timing figures first (seconds, 4 significant digits), then the counts.
+        timings = [
+            (key, f"{self.stats[key]:.4g} s")
+            for key in ("min", "median", "mean", "max", "stddev")
+        ]
+        counts = [
+            ("rounds", int(self.stats["rounds"])),
+            ("iterations", int(self.stats.get("iterations", 1))),
+        ]
+        return _html_table("TimingResult", self.label, [*timings, *counts])
+
+
+@dataclass(frozen=True)
+class MemoryResult:
+    """Peak memory of one profiled callable, stored in MiB as ``peak_mib``."""
+
+    label: str
+    peak_mib: float
+    unit: Literal["MiB"] = "MiB"
+
+    def to_snapshot(
+        self,
+        path: str | Path,
+        *,
+        model: str | None = None,
+        size: int | None = None,
+        phase: str | None = None,
+    ) -> Path:
+        """Persist this peak to ``path`` as a memory.py-shaped snapshot (MiB)."""
+        entry_id = synth_test_id(self.label, model=model, size=size, phase=phase)
+        return write_memory_snapshot(path, self.label, {entry_id: self.peak_mib})
+
+    def to_df(self) -> pd.DataFrame:
+        """One-row ``load_long_df``-shaped frame whose ``value`` is the peak MiB."""
+        return _frame([_row(self.label, self.peak_mib)])
+
+    def __repr__(self) -> str:
+        return f"MemoryResult({self.label!r}, peak={self.peak_mib:.1f} MiB)"
+
+    def _repr_html_(self) -> str:
+        peak_text = f"{self.peak_mib:.1f} MiB"
+        # Single-row table: the peak is the only figure this result carries.
+        return _html_table("MemoryResult", self.label, [("peak", peak_text)])
+
+
+@dataclass(frozen=True)
+class ResultSet:
+    """
+    Several results of one kind (all timing, or all memory).
+
+    ``to_snapshot`` writes every result into a single file keyed by its
+    label — the natural "compare these N variants" case. For size-parametrized
+    ``scaling`` plots, write each result individually with ``model``/``size``/``phase``.
+    """
+
+    results: list[TimingResult | MemoryResult] = field(default_factory=list)
+    unit: Literal["s", "MiB"] = "s"
+
+    def to_snapshot(self, path: str | Path) -> Path:
+        """Write the results matching ``unit`` into one snapshot, keyed by label."""
+        if self.unit == "s":
+            return write_timing_snapshot(
+                path,
+                [
+                    (r.label, dict(r.stats))
+                    for r in self.results
+                    if isinstance(r, TimingResult)
+                ],
+            )
+        peaks = {
+            r.label: r.peak_mib for r in self.results if isinstance(r, MemoryResult)
+        }
+        return write_memory_snapshot(path, "compare", peaks)
+
+    def to_df(self) -> pd.DataFrame:
+        """Concatenate the per-result frames; an empty set yields an empty frame."""
+        import pandas as pd
+
+        frames = [r.to_df() for r in self.results] or [_frame([])]  # concat([]) raises
+        return pd.concat(frames, ignore_index=True)
+
+    def __repr__(self) -> str:
+        labels = ", ".join(r.label for r in self.results)
+        return f"ResultSet(unit={self.unit!r}, [{labels}])"
+
+    def _repr_html_(self) -> str:
+        rows = [
+            (
+                r.label,
+                f"{r.stats['min']:.4g} s"
+                if isinstance(r, TimingResult)
+                else f"{r.peak_mib:.1f} MiB",
+            )
+            for r in self.results
+        ]
+        return _html_table("ResultSet", self.unit, rows)
+
+
+def _html_table(kind: str, header: str, rows: Sequence[tuple[str, object]]) -> str:
+    """Compact two-column Jupyter table, mirroring ``ModelSpec._repr_html_``."""
+    from html import escape  # labels like "<lambda>" must not be parsed as tags
+
+    th = "<th style='text-align:left;padding-right:1em'>"
+    body = "".join(
+        f"<tr>{th}{escape(str(k))}</th><td>{escape(str(v))}</td></tr>" for k, v in rows
+    )
+    head = f"<b>{escape(kind)}</b> <code>{escape(header)}</code>"
+    return f"{head}<table style='font-size:90%'>{body}</table>"
+
+
+# --- Entry points ----------------------------------------------------------
+
+
+def time(
+    fn: Callable[..., object],
+    /,
+    *args: object,
+    rounds: int | None = None,
+    warmup: int = 1,
+    min_time: float = 0.5,
+    label: str | None = None,
+    **kwargs: object,
+) -> TimingResult:
+    """
+    Measure the wall-clock cost of ``fn(*args, **kwargs)``.
+
+    The call is wrapped in a :class:`timeit.Timer`. ``autorange`` first
+    calibrates how many back-to-back calls form one round, so fast
+    callables are not dominated by timer resolution. Every round runs
+    that calibrated number of calls and contributes one sample: the
+    round's time divided by the call count. ``warmup`` extra rounds run
+    before sampling and are thrown away.
+
+    Passing ``rounds`` fixes the number of sampled rounds (at least
+    one). Leaving it ``None`` auto-tunes: sampling continues until at
+    least 5 rounds exist and their summed time reaches ``min_time``,
+    with a hard cap on the round count. Read ``stats["min"]`` as the
+    headline and ``stats["iterations"]`` for the calibrated call count.
+
+    Not pytest-benchmark's calibrated timer: compare ``bench`` numbers
+    with other ``bench`` numbers only, never with suite snapshots.
+    Returns a :class:`TimingResult` labelled ``label`` or the callable's name.
+    """
+    timer = Timer(lambda: fn(*args, **kwargs))
+
+    # Let timeit choose how many calls make up one round, so each round is
+    # long enough that ``perf_counter`` granularity is negligible (~0.2 s).
+    inner, _ = timer.autorange()
+
+    # Discarded warm-up rounds; a negative ``warmup`` simply means none.
+    for _ in range(max(0, warmup)):
+        timer.timeit(inner)
+
+    per_call: list[float] = []  # seconds per single call, one entry per round
+    if rounds is None:
+        # Auto-tune: stop once the floor and the time budget are both met,
+        # or unconditionally at the cap.
+        elapsed = 0.0
+        while len(per_call) < _ROUND_CAP:
+            batch = timer.timeit(inner)
+            per_call.append(batch / inner)
+            elapsed += batch
+            if len(per_call) >= _ROUND_FLOOR and elapsed >= min_time:
+                break
+    else:
+        batches = timer.repeat(repeat=max(1, rounds), number=inner)
+        per_call = [batch / inner for batch in batches]
+
+    summary = {
+        "min": min(per_call),
+        "max": max(per_call),
+        "mean": mean(per_call),
+        "median": median(per_call),
+        "stddev": stdev(per_call) if len(per_call) > 1 else 0.0,
+        "rounds": float(len(per_call)),
+        "iterations": float(inner),
+    }
+    return TimingResult(label=label or _fn_name(fn), stats=summary)
+
+
+def memory(
+    fn: Callable[..., object],
+    /,
+    *args: object,
+    repeats: int = 1,
+    label: str | None = None,
+    **kwargs: object,
+) -> MemoryResult:
+    """
+    Profile the peak memory of ``fn(*args, **kwargs)`` as a :class:`MemoryResult`.
+
+    Delegates to :func:`benchmarks.memory.measure_peak`, forwarding ``repeats``;
+    ``repeats > 1`` keeps the minimum peak. Raises on Windows (no ``memray``).
+    """
+    from benchmarks.memory import measure_peak
+
+    peak_mib = measure_peak(lambda: fn(*args, **kwargs), repeats=repeats)
+    return MemoryResult(label or _fn_name(fn), peak_mib)
+
+
+def compare(
+    cases: dict[str, Callable[[], object]],
+    *,
+    kind: Literal["time", "memory"] = "time",
+    **opts: Any,
+) -> ResultSet:
+    """
+    Measure every zero-argument callable in ``cases`` into one :class:`ResultSet`.
+
+    ``kind`` picks :func:`time` (the default) or :func:`memory`; extra ``opts``
+    (e.g. ``rounds=``, ``repeats=``) are passed straight through to it. Each
+    dict key is used as that case's label. Any other ``kind`` raises ``ValueError``.
+    """
+    # Resolve the measuring function up front; reject unknown kinds before any run.
+    if kind == "time":
+        measure: Callable[..., TimingResult | MemoryResult] = time
+    elif kind == "memory":
+        measure = memory
+    else:
+        raise ValueError(f"kind must be 'time' or 'memory', got {kind!r}")
+    collected = [measure(fn, label=name, **opts) for name, fn in cases.items()]
+    return ResultSet(results=collected, unit="s" if kind == "time" else "MiB")
diff --git a/benchmarks/cli.py b/benchmarks/cli.py
new file mode 100644
index 00000000..9fde533f
--- /dev/null
+++ b/benchmarks/cli.py
@@ -0,0 +1,882 @@
+"""
+linopy benchmark CLI — one entry point for the suite.
+
+Run with::
+
+    python -m benchmarks <command> [options]
+
+The CLI is a thin layer over pytest for the timing / smoke commands, plus
+direct dispatch for registry introspection and memory snapshots.
+"""
+
+from __future__ import annotations
+
+import os
+import subprocess
+import sys
+import tempfile
+from pathlib import Path
+from typing import Annotated, Literal
+
+import typer
+
+from benchmarks import (
+    REGISTRY,
+    filter_by,
+    get,
+)
+from benchmarks.memory import compare as memory_compare
+from benchmarks.memory import save as memory_save
+from benchmarks.plotting import FacetBy, Metric, PlotView, SortMode
+from benchmarks.snapshot import discover_snapshots
+from benchmarks.sweep import run_memory_sweep, run_sweep
+
+app = typer.Typer(
+    help=(
+        "Linopy internal benchmark suite — a thin layer over pytest plus "
+        "registry introspection and memory snapshots."
+    ),
+    no_args_is_help=True,
+    rich_markup_mode="rich",
+)
+
+memory_app = typer.Typer(
+    help="Peak-RSS memory snapshots (pytest-memray under the hood).",
+    no_args_is_help=True,
+)
+app.add_typer(memory_app, name="memory")
+
+
+PhaseName = Literal["build", "matrices", "lp_write", "netcdf", "solver_handoff"]
+
+
+_PHASE_TEST_FILE: dict[PhaseName, str] = {
+    "build": "benchmarks/test_build.py",
+    "matrices": "benchmarks/test_matrices.py",
+    "lp_write": "benchmarks/test_lp_write.py",
+    "netcdf": "benchmarks/test_netcdf.py",
+    "solver_handoff": "benchmarks/test_solver_handoff.py",
+}
+
+# pytest args that constitute a "smoke" run — quick sizes, no timings.
+# Shared between the top-level ``smoke`` command and ``sweep --smoke`` so
+# bumping the definition stays single-source.
+_SMOKE_PYTEST_ARGS = ["benchmarks/", "--quick", "--benchmark-disable", "-q"]
+
+
+# --- Introspection commands ------------------------------------------------
+
+
+@app.command("list")
+def list_(
+    details: Annotated[
+        bool,
+        typer.Option("--details", "-d", help="Show features and size range."),
+    ] = False,
+) -> None:
+    """
+    List the registered model specs.
+
+    By default emits one name per line — suitable for piping into other
+    tools. ``--details`` prints a table with feature tags and size range;
+    columns are at least header-wide, so an empty registry still renders.
+    """
+    if not details:
+        for name in sorted(REGISTRY):
+            typer.echo(name)
+        return
+
+    rows = [
+        (
+            spec.name,
+            ",".join(sorted(spec.features)),
+            f"{spec.sizes[0]}..{spec.sizes[-1]}",
+        )
+        for spec in REGISTRY.values()
+    ]
+    name_w = max([len("name"), *(len(r[0]) for r in rows)])
+    feat_w = max([len("features"), *(len(r[1]) for r in rows)])
+    # ``secho`` strips colour automatically when stdout isn't a TTY, so
+    # piping ``list --details | grep`` still gets plain text.
+    typer.secho(
+        f"{'name':<{name_w}}  {'features':<{feat_w}}  sizes",
+        dim=True,
+    )
+    typer.secho("-" * (name_w + feat_w + 20), dim=True)
+    for name, feats, sizes in rows:
+        typer.secho(f"{name:<{name_w}}", fg=typer.colors.CYAN, nl=False)
+        typer.echo(f"  {feats:<{feat_w}}  {sizes}")
+
+
+@app.command()
+def show(
+    name: Annotated[str, typer.Argument(help="Spec name (see ``list``).")],
+) -> None:
+    """
+    Print full attributes of one model spec.
+
+    Output includes sizes, feature tags, applicable phases, the quick /
+    long size thresholds, and any optional ``requires=`` dependencies the
+    spec advertises.
+    """
+    try:
+        spec = get(name)
+    except KeyError as exc:
+        typer.secho(f"unknown model: {name!r}", fg=typer.colors.RED, err=True)
+        typer.echo(f"available: {', '.join(sorted(REGISTRY))}", err=True)
+        raise typer.Exit(code=2) from exc
+    typer.echo(repr(spec))
+
+    def _row(label: str, value: object) -> None:
+        # Dim the label so the eye lands on the value first; ``secho``
+        # auto-strips colour when stdout isn't a TTY.
+        typer.secho(f"  {label:<17}", dim=True, nl=False)
+        typer.echo(value)
+
+    _row("sizes:", spec.sizes)
+    _row("features:", sorted(spec.features))
+    _row("phases:", sorted(spec.phases))
+    _row("quick_threshold:", spec.quick_threshold)
+    _row("long_threshold:", spec.long_threshold)
+    if spec.requires:
+        _row("requires:", list(spec.requires))
+
+
+@app.command("filter")
+def filter_(
+    feature: Annotated[
+        str | None,
+        typer.Option(help="Feature tag, e.g. 'quadratic', 'integer', 'sos'."),
+    ] = None,
+    phase: Annotated[
+        str | None,
+        typer.Option(help="Phase tag, e.g. 'to_gurobipy', 'lp_write'."),
+    ] = None,
+) -> None:
+    """
+    Filter specs by feature or phase tag.
+
+    Both filters can be combined; the result is the intersection.
+    At least one of ``--feature`` / ``--phase`` must be supplied.
+    """
+    if feature is None and phase is None:
+        typer.secho("pass --feature and/or --phase", fg=typer.colors.RED, err=True)
+        raise typer.Exit(code=2)
+    matches = filter_by(has_feature=feature, has_phase=phase)
+    for spec in matches:
+        typer.echo(repr(spec))
+
+
+# --- Execution commands ----------------------------------------------------
+
+
+def _run_pytest(args: list[str]) -> None:
+    """Run ``python -m pytest`` with ``args``; exit with its status on failure."""
+    command = [sys.executable, "-m", "pytest", *args]
+    typer.secho(f"$ {' '.join(command)}", fg=typer.colors.BRIGHT_BLACK)
+    status = subprocess.run(command, check=False).returncode
+    if status != 0:
+        raise typer.Exit(code=status)
+
+
+@app.command(
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def smoke(ctx: typer.Context) -> None:
+    """
+    Quick smoke run — what CI uses on every PR.
+
+    Equivalent to ``pytest benchmarks/ --quick --benchmark-disable -q``.
+    Every model builds at one size and every phase fires once, no timings
+    recorded. Typical wall-clock: ~20s.
+
+    Any trailing arguments are forwarded to pytest verbatim, e.g.::
+
+        python -m benchmarks smoke -k basic --tb=short
+    """
+    _run_pytest([*_SMOKE_PYTEST_ARGS, *ctx.args])
+
+
+@app.command(
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def run(
+    ctx: typer.Context,
+    long: Annotated[
+        bool,
+        typer.Option(
+            "--long",
+            help="Include the slowest sizes (above each spec's long_threshold).",
+        ),
+    ] = False,
+    phase: Annotated[
+        PhaseName | None,
+        typer.Option(help="Restrict to one phase's test file."),
+    ] = None,
+    model: Annotated[
+        str | None,
+        typer.Option(help="Restrict to one model (passed as pytest ``-k``)."),
+    ] = None,
+    filter_expr: Annotated[
+        str | None,
+        typer.Option(
+            "--filter",
+            "-k",
+            help="Arbitrary pytest ``-k`` expression (AND-ed with ``--model``).",
+        ),
+    ] = None,
+    json_out: Annotated[
+        Path | None,
+        typer.Option("--json", help="Save pytest-benchmark JSON to this path."),
+    ] = None,
+    rounds: Annotated[
+        int | None,
+        typer.Option(
+            "--rounds",
+            help=(
+                "Force pytest-benchmark to run exactly N rounds per test "
+                "(passes ``--benchmark-min-rounds=N --benchmark-max-time=0``). "
+                "Default: pytest-benchmark auto-tunes per test (5–40+ rounds "
+                "depending on cost). Use a fixed N for uniform measurement "
+                "across versions in a sweep."
+            ),
+        ),
+    ] = None,
+) -> None:
+    """
+    Default timing run. Records timings with pytest-benchmark.
+
+    Without ``--long``, sizes above each spec's ``long_threshold`` are
+    skipped — keeps the wall-clock around 45s instead of several minutes.
+    Add ``--long`` for the full sweep including the heaviest sizes
+    (knapsack at 1M, basic at 1600, pypsa_scigrid at >50).
+
+    Any trailing arguments are forwarded to pytest verbatim, e.g.::
+
+        python -m benchmarks run --long -- --tb=short -x
+
+    To skip timing entirely (e.g. just verifying everything runs at a
+    bigger size), use ``smoke`` instead, or pass ``--benchmark-disable``
+    as a trailing arg.
+    """
+    args: list[str] = []
+    args.append(_PHASE_TEST_FILE[phase] if phase is not None else "benchmarks/")
+    if long:
+        args.append("--long")
+    args.append("--benchmark-only")
+    if json_out is not None:
+        args.extend(["--benchmark-json", str(json_out)])
+    if rounds is not None:
+        args.extend([f"--benchmark-min-rounds={rounds}", "--benchmark-max-time=0"])
+    # Parenthesise each part so an ``or`` inside one cannot out-bind the ``and``.
+    k_parts = [f"({p})" for p in (model, filter_expr) if p]
+    if k_parts:
+        args.extend(["-k", " and ".join(k_parts)])
+
+    args.extend(ctx.args)
+    _run_pytest(args)
+
+
+@app.command()
+def notebook(
+    build: Annotated[
+        bool,
+        typer.Option(
+            "--build",
+            help=(
+                "Regenerate ``walkthrough.ipynb`` from the ``.md`` source. "
+                "One-way build — the ``.ipynb`` is a throwaway artifact for "
+                "opening in any editor (JupyterLab, PyCharm, VSCode), the "
+                "``.md`` stays canonical. Re-run after editing the ``.md``. "
+                "The ``.ipynb`` is gitignored."
+            ),
+        ),
+    ] = False,
+) -> None:
+    """
+    Execute the walkthrough notebook end-to-end (default) or rebuild the
+    ``.ipynb`` artifact for interactive viewing (``--build``).
+
+    The walkthrough is a Jupytext MyST markdown file
+    (``benchmarks/walkthrough.md``) — diffs cleanly in git, runs as a
+    notebook in Jupyter. The ``.md`` is the source of truth; the paired
+    ``.ipynb`` is generated output. Edit the ``.md``, re-run ``--build``,
+    open the ``.ipynb`` in your editor of choice.
+
+    CI calls this with no flags to catch doc rot; the executed copy goes
+    to a tempdir and is discarded so the source file stays output-free.
+    """
+    nb = Path("benchmarks/walkthrough.md")
+    if not nb.exists():
+        typer.secho(f"walkthrough not found: {nb}", fg=typer.colors.RED, err=True)
+        raise typer.Exit(code=1)
+
+    if build:
+        # ``--to ipynb`` is a one-way conversion (no ``formats`` metadata
+        # written into the .md). The generated .ipynb is editor-agnostic;
+        # contributors regenerate it after editing the .md.
+        cmd = [
+            sys.executable,
+            "-m",
+            "jupytext",
+            "--to",
+            "ipynb",
+            str(nb),
+        ]
+        typer.secho(f"$ {' '.join(cmd)}", fg=typer.colors.BRIGHT_BLACK)
+        result = subprocess.run(cmd, check=False)
+        if result.returncode != 0:
+            raise typer.Exit(code=result.returncode)
+        ipynb = nb.with_suffix(".ipynb")
+        typer.secho(f"built: {ipynb}  (regenerable from {nb})", fg=typer.colors.GREEN)
+        typer.echo(f"Open it:  jupyter lab {ipynb}    # or PyCharm / VSCode / …")
+        return
+
+    with tempfile.TemporaryDirectory() as tmp:
+        # Jupytext sets the kernel cwd to the output directory (the
+        # tempdir here), so forward the repo root via
+        # ``LINOPY_REPO_ROOT`` for the walkthrough's first cell to find
+        # ``benchmarks/``.
+        env = {**os.environ, "LINOPY_REPO_ROOT": str(Path.cwd().resolve())}
+        cmd = [
+            sys.executable,
+            "-m",
+            "jupytext",
+            "--to",
+            "notebook",
+            "--execute",
+            "--output",
+            str(Path(tmp) / "executed.ipynb"),
+            str(nb),
+        ]
+        typer.secho(f"$ {' '.join(cmd)}", fg=typer.colors.BRIGHT_BLACK)
+        result = subprocess.run(cmd, env=env, check=False)
+    if result.returncode != 0:
+        raise typer.Exit(code=result.returncode)
+
+
+# --- Sweep across linopy versions ------------------------------------------
+
+
+@app.command(
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def sweep(
+    ctx: typer.Context,
+    versions: Annotated[
+        list[str],
+        typer.Argument(help="linopy versions, e.g. 0.4.0 0.5.0 (or any pip spec)."),
+    ],
+    output_dir: Annotated[
+        Path,
+        typer.Option("--output-dir", "-o", help="Where to save snapshot JSONs."),
+    ] = Path(".benchmarks/sweep"),
+    long: Annotated[
+        bool, typer.Option("--long", help="Include the slowest sizes.")
+    ] = False,
+    quick: Annotated[
+        bool,
+        typer.Option("--quick", help="Use only the smallest sizes (faster sweep)."),
+    ] = False,
+    phase: Annotated[
+        PhaseName | None,
+        typer.Option(help="Restrict each version's run to one phase's test file."),
+    ] = None,
+    model: Annotated[
+        str | None,
+        typer.Option(help="Restrict to one model (passed as pytest ``-k``)."),
+    ] = None,
+    filter_expr: Annotated[
+        str | None,
+        typer.Option(
+            "--filter",
+            "-k",
+            help="Arbitrary pytest ``-k`` expression (AND-ed with ``--model``).",
+        ),
+    ] = None,
+    rounds: Annotated[
+        int | None,
+        typer.Option(
+            "--rounds",
+            help=(
+                "Force pytest-benchmark to run exactly N rounds per test in "
+                "every version (uniform measurement across the sweep). "
+                "Default: pytest-benchmark auto-tunes per test."
+            ),
+        ),
+    ] = None,
+    smoke: Annotated[
+        bool,
+        typer.Option(
+            "--smoke",
+            help=(
+                "Run the smoke suite in each version's venv instead of the "
+                "full timing run. Same pytest invocation as the top-level "
+                "``smoke`` command — every model/phase fires once at the "
+                "quickest size, no timings, ~20 s per version. Useful before "
+                "bumping a perf-sensitive pin to check the combination is "
+                "viable across every linopy version you'd sweep against."
+            ),
+        ),
+    ] = False,
+    as_of: Annotated[
+        str | None,
+        typer.Option(
+            "--as-of",
+            help=(
+                "Freeze every dep's resolution to releases on or before this "
+                "date (``YYYY-MM-DD`` or ISO 8601). Passes ``--exclude-newer`` "
+                "to uv. Use a consistent value across invocations for "
+                "cross-time-reproducible sweeps — direct pins alone keep "
+                "results stable within one call but transitives can drift "
+                "between calls."
+            ),
+        ),
+    ] = None,
+) -> None:
+    """
+    Run the benchmark suite against several linopy versions.
+
+    Uses ``uv`` to build a fresh venv per version (near-instant) and to
+    install the benchmark infra + target linopy in a single resolution
+    pass. The pytest-benchmark JSON snapshot lands in
+    ``<output-dir>/linopy-<version>.json``.
+
+    Versions are accepted in two forms:
+
+    - Plain releases: ``0.4.0``, ``0.5.0a1`` — expanded to ``linopy==X``.
+    - Pip specs verbatim: ``git+https://github.com/PyPSA/linopy.git@<sha>``
+      or ``linopy @ file:///path/to/checkout``.
+
+    The current (repo-tip) benchmark code runs against each linopy
+    version, so the measurement layer is constant. ``_API_AVAILABLE``
+    gates in the ``sos`` / ``piecewise`` specs let older linopy versions
+    skip those phases gracefully.
+
+    Filter knobs (``--phase``, ``--model``, ``--filter``) mirror ``run``
+    and apply to every version's pytest invocation. Trailing arguments
+    after ``--`` are forwarded to pytest verbatim:
+
+        python -m benchmarks sweep 0.6.7 --phase build --model basic
+        python -m benchmarks sweep 0.6.7 -- --tb=short -x
+
+    Wall-clock: roughly 1-2 minutes per version (venv + install +
+    benchmarks). uv's wheel cache makes repeated runs much faster.
+    """
+    test_target = _PHASE_TEST_FILE[phase] if phase is not None else "benchmarks/"
+    run_sweep(
+        versions,
+        output_dir=output_dir,
+        test_target=test_target,
+        smoke_args=_SMOKE_PYTEST_ARGS,
+        long=long,
+        quick=quick,
+        rounds=rounds,
+        model=model,
+        filter_expr=filter_expr,
+        smoke=smoke,
+        as_of=as_of,
+        extra_args=ctx.args,
+    )
+
+
+# --- Compare timing snapshots ---------------------------------------------
+
+
+def _suggest_snapshots(reason: str) -> None:
+    """Report ``reason`` on stderr, then list discoverable snapshots as a hint."""
+    typer.secho(reason, fg=typer.colors.RED, err=True)
+    snapshot_paths = discover_snapshots()
+    if not snapshot_paths:
+        typer.echo(
+            "\nNo snapshots found under .benchmarks/. Generate one with:\n"
+            "  python -m benchmarks run --json .benchmarks/<label>.json",
+            err=True,
+        )
+        return
+    typer.echo("\nAvailable snapshots under .benchmarks/:", err=True)
+    for path in snapshot_paths:
+        typer.echo(f"  {path}", err=True)
+
+
+@app.command(
+    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
+)
+def compare(ctx: typer.Context) -> None:
+    """
+    Compare timing snapshots side-by-side via ``pytest-benchmark compare``.
+
+    Thin wrapper around the upstream tool so the whole suite stays under
+    one entry point. Pass the snapshot paths first, then any pytest-benchmark
+    flags::
+
+        python -m benchmarks compare a.json b.json
+        python -m benchmarks compare a.json b.json --group-by=name
+        python -m benchmarks compare a.json b.json --histogram=plots/cmp
+
+    With no arguments (or missing paths), prints what snapshots exist
+    under ``.benchmarks/`` so you can copy-paste the path you want.
+
+    For memory snapshots use ``memory compare`` instead — different format,
+    different tool.
+
+    Implementation note: typer/click don't have a clean idiom for "list-typed
+    positional + pass-through", so this command parses ``ctx.args`` by hand
+    — anything before the first flag is a snapshot path, everything after
+    is forwarded.
+    """
+    # Snapshots come first; once we see a flag (``-x`` / ``--foo``) every
+    # subsequent token is forwarded to pytest-benchmark. That way the value
+    # of a flag like ``-k "build and basic"`` doesn't get mistaken for a path.
+    snapshots: list[Path] = []
+    extra: list[str] = []
+    seen_flag = False
+    for arg in ctx.args:
+        if arg.startswith("-"):
+            seen_flag = True
+        if seen_flag:
+            extra.append(arg)
+        else:
+            snapshots.append(Path(arg))
+
+    if len(snapshots) < 2:
+        _suggest_snapshots(
+            f"compare needs at least two snapshot paths (got {len(snapshots)})."
+        )
+        raise typer.Exit(code=2)
+
+    missing = [p for p in snapshots if not p.exists()]
+    if missing:
+        _suggest_snapshots(f"missing snapshots: {[str(p) for p in missing]}")
+        raise typer.Exit(code=2)
+
+    # Sensible defaults — pytest-benchmark's defaults emit 10 columns wide,
+    # grouped by parametrize group, which is unreadable for two-snapshot diffs.
+    # ``--group-by=fullname`` puts each test's (baseline, candidate) rows in
+    # their own mini-table; ``--columns=min,iqr`` shows the lowest observed
+    # time (approximates the no-noise floor) plus the spread.
+    # Each default is only applied if the user didn't override it.
+    if not any(a.startswith("--columns") for a in extra):
+        extra.insert(0, "--columns=min,iqr")
+    if not any(a.startswith("--sort") for a in extra):
+        extra.insert(0, "--sort=min")
+    if not any(a.startswith("--group-by") for a in extra):
+        extra.insert(0, "--group-by=fullname")
+
+    cmd = [
+        sys.executable,
+        "-m",
+        "pytest_benchmark",
+        "compare",
+        *[str(p) for p in snapshots],
+        *extra,
+    ]
+    typer.secho(f"$ {' '.join(cmd)}", fg=typer.colors.BRIGHT_BLACK)
+    result = subprocess.run(cmd, check=False)
+    if result.returncode != 0:
+        raise typer.Exit(code=result.returncode)
+
+
+# --- Plotting --------------------------------------------------------------
+
+
+@app.command()
+def plot(
+    snapshots: Annotated[
+        list[Path],
+        typer.Argument(help="pytest-benchmark JSON snapshot(s)."),
+    ],
+    view: Annotated[
+        PlotView | None,
+        typer.Option(
+            help=(
+                "Which plot to produce. Default: ``scaling`` for 1 input, "
+                "``scatter`` for 2, ``sweep`` for 3+. ``compare`` (delta "
+                "bar chart) is still available via ``--view compare``."
+            )
+        ),
+    ] = None,
+    metric: Annotated[
+        Metric,
+        typer.Option(
+            help=(
+                "Stat to drive the plot. ``min`` (default) is closest to "
+                "the 'true' cost — noise can only slow things down. ``median``"
+                " is more robust to a single fast warmup round."
+            )
+        ),
+    ] = "min",
+    sort: Annotated[
+        SortMode,
+        typer.Option(
+            help=(
+                "Compare-view sort and bar dimension. ``absolute`` (default) "
+                "uses ``b - a`` in seconds so the biggest actual-time impacts "
+                "float to the bottom — avoids over-weighting cheap "
+                "microsecond tests. ``relative`` uses percent change."
+            )
+        ),
+    ] = "absolute",
+    facets: Annotated[
+        FacetBy | None,
+        typer.Option(
+            "--facets",
+            help=(
+                "Split compare / scatter into subplots by ``phase`` (test "
+                "file) or ``model`` (parametrize id). Default: no faceting. "
+                "Tests whose ids don't match ``[<model>-n=<size>]`` (e.g. "
+                "PyPSA carbon-management) land in an ``other`` facet."
+            ),
+        ),
+    ] = None,
+    output: Annotated[
+        Path | None,
+        typer.Option(
+            "--output",
+            "-o",
+            help=(
+                "Where to write the HTML. Defaults to "
+                "``.benchmarks/plots/<view>.html`` (gitignored) so "
+                "different views don't clobber each other."
+            ),
+        ),
+    ] = None,
+    open_browser: Annotated[
+        bool,
+        typer.Option("--open/--no-open", help="Open the result in a browser."),
+    ] = False,
+) -> None:
+    """
+    Render an interactive HTML plot from one or more snapshots.
+
+    Four views, picked automatically from the snapshot count (scaling
+    for 1, scatter for 2, sweep for 3+) or set explicitly via ``--view``:
+
+    - **compare** (2 snapshots) — horizontal bar chart of per-test delta,
+      sorted by magnitude. The "did this PR regress anything?" picture.
+    - **scatter** (2 snapshots) — exploratory two-axis plot: baseline
+      cost on log-x, ratio on y, absolute Δ encoded in colour. Tests
+      in the top-right are the real regressions (slow tests that got
+      slower); top-left = cheap tests with big ratio swings (noise,
+      not real change); bottom-right = already-slow-but-unchanged.
+      Resolves the absolute-vs-relative tension visually.
+    - **sweep** (3+ snapshots) — heatmap of ratio relative to the first
+      snapshot, rows = tests, columns = snapshot labels.
+    - **scaling** (1 snapshot) — log-log time vs ``n`` for
+      size-parametrized tests, faceted by phase.
+
+    Output is an interactive Plotly HTML file. Open it in any browser
+    (or pass ``--open``).
+    """
+    missing = [p for p in snapshots if not p.exists()]
+    if missing:
+        _suggest_snapshots(f"missing snapshots: {[str(p) for p in missing]}")
+        raise typer.Exit(code=2)
+
+    chosen = view or (
+        "scaling"
+        if len(snapshots) == 1
+        else "scatter"
+        if len(snapshots) == 2
+        else "sweep"
+    )
+    if chosen == "compare" and len(snapshots) != 2:
+        typer.secho(
+            "compare view needs exactly 2 snapshots", fg=typer.colors.RED, err=True
+        )
+        raise typer.Exit(code=2)
+    if chosen == "scatter" and len(snapshots) < 2:
+        typer.secho(
+            "scatter view needs at least 2 snapshots (baseline + 1)",
+            fg=typer.colors.RED,
+            err=True,
+        )
+        raise typer.Exit(code=2)
+    if chosen == "scaling" and len(snapshots) != 1:
+        typer.secho(
+            "scaling view takes exactly 1 snapshot", fg=typer.colors.RED, err=True
+        )
+        raise typer.Exit(code=2)
+
+    try:
+        from benchmarks.plotting import RENDERERS
+    except ImportError as exc:
+        typer.secho(
+            "plotly is required for ``plot`` — ``pip install plotly``",
+            fg=typer.colors.RED,
+            err=True,
+        )
+        raise typer.Exit(code=2) from exc
+
+    # Default filename: ``.benchmarks/plots/<view>.html``. Matches where
+    # snapshots already live (and is gitignored), and the per-view name
+    # means consecutive ``plot`` calls don't clobber each other.
+    if output is None:
+        output = Path(".benchmarks") / "plots" / f"{chosen}.html"
+
+    try:
+        fig, n_tests = RENDERERS[chosen](snapshots, metric, sort, facets)
+    except ValueError as exc:
+        typer.secho(str(exc), fg=typer.colors.RED, err=True)
+        raise typer.Exit(code=1) from exc
+
+    output.parent.mkdir(parents=True, exist_ok=True)
+    fig.write_html(output)
+
+    typer.secho(
+        f"{chosen} view ({metric}): {n_tests} tests → {output}",
+        fg=typer.colors.GREEN,
+    )
+    if open_browser:
+        import webbrowser
+
+        webbrowser.open(output.resolve().as_uri())
+
+
+# --- Memory subcommands ----------------------------------------------------
+
+
+@memory_app.command("save")
+def memory_save_cmd(
+    label: Annotated[
+        str, typer.Argument(help="Label to attach to this snapshot, e.g. a git sha.")
+    ],
+    quick: Annotated[
+        bool, typer.Option("--quick", help="Use smaller problem sizes.")
+    ] = False,
+    phase: Annotated[
+        list[str] | None,
+        typer.Option(
+            "--phase",
+            help=(
+                "Restrict measurement to these phases. Pass multiple ``--phase`` "
+                "to select more than one. Default: all (build, matrices, lp_write,"
+                " netcdf, solver_handoff)."
+            ),
+        ),
+    ] = None,
+    repeats: Annotated[
+        int,
+        typer.Option(
+            "--repeats",
+            help=(
+                "Re-run each measurement N times and keep the min peak. Default "
+                "1 (single shot). Memory peaks have ~1–3 % wobble from GC "
+                "timing, lazy-import priming, and netcdf page-cache effects — "
+                "min-of-3 tightens that signal."
+            ),
+        ),
+    ] = 1,
+) -> None:
+    """
+    Measure peak memory across the registry × phase grid via ``memray.Tracker``.
+
+    Each ``(phase, spec, size)`` runs under its own tracker so setup
+    allocations (model construction) are excluded from the peak — only the
+    phase work itself is counted. Phases run in separate subprocesses for
+    isolation.
+
+    Results land in ``.benchmarks/memory/<label>.json``, keyed by full
+    pytest-style test IDs so ``compare`` diffs cleanly across runs that
+    selected different subsets.
+    """
+    from benchmarks.memory import MEMORY_PHASES
+
+    if phase:
+        unknown = [p for p in phase if p not in MEMORY_PHASES]
+        if unknown:
+            typer.secho(
+                f"unknown phase(s): {unknown}; valid options: {list(MEMORY_PHASES)}",
+                fg=typer.colors.RED,
+                err=True,
+            )
+            raise typer.Exit(code=2)
+    memory_save(label, quick=quick, phases=phase, repeats=repeats)
+
+
+@memory_app.command("sweep")
+def memory_sweep_cmd(
+    versions: Annotated[
+        list[str],
+        typer.Argument(help="linopy versions, e.g. 0.4.0 0.5.0 (or any pip spec)."),
+    ],
+    output_dir: Annotated[
+        Path,
+        typer.Option(
+            "--output-dir",
+            "-o",
+            help="Where to save snapshot JSONs.",
+        ),
+    ] = Path(".benchmarks/memory"),
+    quick: Annotated[
+        bool,
+        typer.Option("--quick", help="Use only the smallest sizes (faster sweep)."),
+    ] = False,
+    phase: Annotated[
+        list[str] | None,
+        typer.Option(
+            "--phase",
+            help=(
+                "Restrict each version's run to these phases. Pass multiple "
+                "``--phase`` to select more than one."
+            ),
+        ),
+    ] = None,
+    repeats: Annotated[
+        int,
+        typer.Option(
+            "--repeats",
+            help="min-of-N peak per measurement (default 1).",
+        ),
+    ] = 1,
+    as_of: Annotated[
+        str | None,
+        typer.Option(
+            "--as-of",
+            help=(
+                "Freeze every dep's resolution to releases on or before this "
+                "date (``YYYY-MM-DD`` or ISO 8601). Same semantics as "
+                "``sweep --as-of`` — see that command's help."
+            ),
+        ),
+    ] = None,
+) -> None:
+    """
+    Sweep peak-memory measurements across several linopy versions.
+
+    Mirrors the timing :func:`sweep` but invokes ``memory save`` inside
+    each per-version uv venv. Each version's snapshot lands at
+    ``<output-dir>/linopy-<version>.json`` and is auto-detected by
+    ``plot`` (the ``peak_mib`` key distinguishes memory from timing).
+
+    Memory peaks are much more deterministic than wall time, so
+    ``--repeats 1`` (default) is usually plenty. Use ``--repeats 3``
+    if you need <5% regression detection.
+    """
+    run_memory_sweep(
+        versions,
+        output_dir=output_dir,
+        quick=quick,
+        phases=phase,
+        repeats=repeats,
+        as_of=as_of,
+    )
+
+
+@memory_app.command("compare")
+def memory_compare_cmd(
+    label_a: Annotated[str, typer.Argument(help="Baseline label (typically master).")],
+    label_b: Annotated[str, typer.Argument(help="Candidate label (your branch).")],
+) -> None:
+    """
+    Compare two saved memory snapshots side-by-side.
+
+    Prints a per-test table of label_a vs label_b peak RSS and a percent
+    change. Tests present in only one snapshot are shown with ``—`` for
+    the missing column.
+    """
+    memory_compare(label_a, label_b)
+
+
+if __name__ == "__main__":  # pragma: no cover
+    app()
diff --git a/benchmarks/conftest.py b/benchmarks/conftest.py
index 6f9a9467..abe56ac7 100644
--- a/benchmarks/conftest.py
+++ b/benchmarks/conftest.py
@@ -4,27 +4,67 @@
 
 import pytest
 
-QUICK_THRESHOLD = {
-    "basic": 100,
-    "knapsack": 10_000,
-    "pypsa_scigrid": 50,
-    "expression_arithmetic": 100,
-    "sparse_network": 100,
-}
+from benchmarks.registry import ModelSpec
 
 
-def pytest_addoption(parser):
+def pytest_addoption(parser: pytest.Parser) -> None:
     parser.addoption(
         "--quick",
         action="store_true",
         default=False,
-        help="Use smaller problem sizes for quick benchmarking",
+        help="Use smaller problem sizes for quick benchmarking (CI smoke).",
     )
+    parser.addoption(
+        "--long",
+        action="store_true",
+        default=False,
+        help=(
+            "Include the slowest sizes (above each spec's long_threshold). "
+            "Default runs skip them."
+        ),
+    )
+
+
+def pytest_collection_modifyitems(
+    config: pytest.Config, items: list[pytest.Item]
+) -> None:
+    """
+    Drop PyPSA end-to-end tests under ``--quick``.
+
+    The PyPSA carbon-management network is ~30s by itself; CodSpeed under
+    cachegrind would make it minutes. ``--quick`` is for sub-30s sweeps,
+    so the end-to-end module doesn't belong there.
+    """
+    if not config.getoption("--quick"):
+        return
+    skip = pytest.mark.skip(reason="--quick: pypsa end-to-end skipped")
+    for item in items:
+        if "test_pypsa_carbon_management" in item.nodeid:
+            item.add_marker(skip)
+
+
+def maybe_skip(request: pytest.FixtureRequest, spec: ModelSpec, size: int) -> None:
+    """
+    Apply size-tier skips and ``spec.requires`` importorskips.
+
+    Tiers (most restrictive first):
+
+    - ``--quick``                 → skip ``size > quick_threshold``
+    - default (no flag)           → skip ``size > long_threshold``
+    - ``--long``                  → no size cap
+
+    If both ``--quick`` and ``--long`` are passed, ``--quick`` wins (the more
+    restrictive mode is honoured).
+    """
+    for mod in spec.requires:
+        pytest.importorskip(mod)
 
+    quick = request.config.getoption("--quick")
+    long_ = request.config.getoption("--long")
 
-def skip_if_quick(request, model: str, size: int):
-    """Skip large sizes when --quick is passed."""
-    if request.config.getoption("--quick"):
-        threshold = QUICK_THRESHOLD.get(model, float("inf"))
-        if size > threshold:
-            pytest.skip(f"--quick: skipping {model} size {size}")
+    if quick:
+        if size > spec.quick_threshold:
+            pytest.skip(f"--quick: skipping {spec.name} size {size}")
+    elif not long_:
+        if size > spec.long_threshold:
+            pytest.skip(f"long size needs --long: skipping {spec.name} size {size}")
diff --git a/benchmarks/memory.py b/benchmarks/memory.py
index 20af4b8a..b3bf4cc6 100644
--- a/benchmarks/memory.py
+++ b/benchmarks/memory.py
@@ -1,137 +1,343 @@
-#!/usr/bin/env python
 """
-Measure and compare peak memory using pytest-memray.
+Measure and compare peak memory across the registry × phase grid.
 
-Usage:
-    # Save a baseline (on master)
-    python benchmarks/memory.py save master
+Each measurement uses ``memray.Tracker`` directly so the model construction
+(setup) lives *outside* the tracked region and the peak reflects only the
+phase work itself::
 
-    # Save current branch
-    python benchmarks/memory.py save my-feature
+    m = spec.build(size)            # setup, not tracked
+    with memray.Tracker(bin_path):
+        wrapper(m)                  # tracked
+    peak = FileReader(bin_path).metadata.peak_memory
 
-    # Compare two saved runs
-    python benchmarks/memory.py compare master my-feature
+This module exposes ``save(label, ...)`` and ``compare(label_a, label_b)`` as
+plain functions; user-facing invocation goes through the typer CLI::
 
-    # Quick mode (smaller sizes)
-    python benchmarks/memory.py save master --quick
+    python -m benchmarks memory save <label> [--quick] [--phase build] ...
+    python -m benchmarks memory compare <a> <b>
 
-Results are stored in .benchmarks/memory/.
+Results land in ``.benchmarks/memory/`` as JSON keyed by full pytest-style
+test IDs (``benchmarks/test_<phase>.py::test_<phase>[<spec>-n=<size>]``)
+so cross-snapshot diffs work uniformly regardless of which phases were run.
 """
 
 from __future__ import annotations
 
 import argparse
+import gc
 import json
 import platform
-import re
 import subprocess
 import sys
+import tempfile
+from collections.abc import Callable, Iterator
 from pathlib import Path
+from typing import TYPE_CHECKING
+
+from benchmarks.snapshot import write_memory_snapshot
+
+if TYPE_CHECKING:
+    from benchmarks.registry import ModelSpec
+
+
+def _require_memray() -> None:
+    """
+    Raise if memory measurement isn't supported on this platform.
+
+    Called at the top of every entry point that actually measures
+    (:func:`measure_peak`, :func:`run_phase`, :func:`save`) rather than
+    at import time, so the module imports cleanly everywhere — notably
+    ``benchmarks.bench`` reuses :func:`measure_peak` and must import on
+    Windows. Only *measuring* fails there, with the original message.
+    """
+    if platform.system() == "Windows":
+        raise RuntimeError(
+            "memory measurement requires ``memray`` which is not available on "
+            "Windows. Run memory benchmarks on Linux or macOS."
+        )
 
-if platform.system() == "Windows":
-    raise RuntimeError(
-        "memory.py requires pytest-memray which is not available on Windows. "
-        "Run memory benchmarks on Linux or macOS."
-    )
 
 RESULTS_DIR = Path(".benchmarks/memory")
-MEMORY_RE = re.compile(
-    r"Allocation results for (.+?) at the high watermark\s+"
-    r"📦 Total memory allocated: ([\d.]+)(MiB|KiB|GiB|B)",
+MEMORY_PHASES: tuple[str, ...] = (
+    "build",
+    "matrices",
+    "lp_write",
+    "netcdf",
+    "solver_handoff",
 )
-# Only the build phase is measured by default. Unlike timing benchmarks (where
-# pytest-benchmark isolates the measured function), memray tracks all allocations
-# within a test — including model construction in setup. This means LP write and
-# matrix tests would report build + phase memory combined, making the phase-specific
-# contribution hard to isolate. Since model construction dominates memory usage,
-# measuring build alone gives the most accurate and actionable numbers.
-DEFAULT_TEST_PATHS = [
-    "benchmarks/test_build.py",
-]
-
-
-def _to_mib(value: float, unit: str) -> float:
-    factors = {"B": 1 / 1048576, "KiB": 1 / 1024, "MiB": 1, "GiB": 1024}
-    return value * factors[unit]
-
-
-def _collect_test_ids(test_paths: list[str], quick: bool) -> list[str]:
-    """Collect test IDs without running them."""
-    cmd = [
-        sys.executable,
-        "-m",
-        "pytest",
-        *test_paths,
-        "--collect-only",
-        "-q",
-    ]
-    if quick:
-        cmd.append("--quick")
-    result = subprocess.run(cmd, capture_output=True, text=True)
-    return [
-        line.strip()
-        for line in result.stdout.splitlines()
-        if "::" in line and not line.startswith(("=", "-", " "))
-    ]
-
-
-def save(label: str, quick: bool = False, test_paths: list[str] | None = None) -> Path:
-    """Run each benchmark in a separate process for accurate memory measurement."""
-    if test_paths is None:
-        test_paths = DEFAULT_TEST_PATHS
-    test_ids = _collect_test_ids(test_paths, quick)
-    if not test_ids:
-        print("No tests collected.", file=sys.stderr)
-        sys.exit(1)
 
-    print(f"Running {len(test_ids)} tests (each in a separate process)...")
-    entries = {}
-    for i, test_id in enumerate(test_ids, 1):
-        short = test_id.split("::")[-1]
-        print(f"  [{i}/{len(test_ids)}] {short}...", end=" ", flush=True)
 
+def _phase_tag(phase: str) -> str:
+    """Map a phase name to the registry phase tag used by ``spec.applies_to``."""
+    from benchmarks.registry import (
+        BUILD,
+        LP_WRITE,
+        MATRICES,
+        NETCDF,
+        TO_HIGHSPY,
+    )
+
+    return {
+        "build": BUILD,
+        "matrices": MATRICES,
+        "lp_write": LP_WRITE,
+        "netcdf": NETCDF,
+        "solver_handoff": TO_HIGHSPY,  # we always measure the highs handoff
+    }[phase]
+
+
+def measure_peak(action: Callable[[], object], repeats: int = 1) -> float:
+    """
+    Run ``action()`` under ``memray.Tracker`` and return peak MiB.
+
+    ``action`` is a zero-argument callable; its return value is ignored.
+    With ``repeats > 1`` the action runs that many times in fresh
+    trackers and the *minimum* peak is returned — peak memory is
+    noisier than naive expectations (GC timing, lazy-import priming,
+    file-system page cache for netcdf) so the min-of-N is the cleanest
+    estimate of "the floor this code can hit". ``repeats`` below 1 is
+    treated as 1.
+
+    Raises ``RuntimeError`` on Windows (see :func:`_require_memray`);
+    exceptions raised by ``action`` propagate to the caller unchanged.
+    """
+    _require_memray()
+
+    import memray
+
+    peaks: list[float] = []
+    for _ in range(max(1, repeats)):
+        # memray needs to create the capture file itself, so hand it a
+        # not-yet-existing path inside a private temp directory. This
+        # replaces the mkstemp/unlink/close sequence (and the window in
+        # which another process could claim the freed path); the directory
+        # and capture are removed even when ``action`` raises.
+        with tempfile.TemporaryDirectory() as tmpdir:
+            capture = str(Path(tmpdir) / "capture.bin")
+            with memray.Tracker(capture):
+                action()
+            peak_bytes = memray.FileReader(capture).metadata.peak_memory
+        peaks.append(round(peak_bytes / (1024**2), 3))
+        # Free this run's garbage before the next measurement starts.
+        gc.collect()
+
+    return min(peaks)
+
+
+# Back-compat alias: ``_measure_peak`` was the private name before
+# ``benchmarks.bench`` needed to reuse it.
+_measure_peak = measure_peak
+
+
+def _measurements(
+    phase: str, spec: ModelSpec, size: int
+) -> Iterator[tuple[str, Callable[[], object]]]:
+    """
+    Yield ``(test_id, action)`` pairs for one ``(phase, spec, size)``.
+
+    ``action`` is a zero-arg callable; the caller runs it inside a tracker.
+    For non-build phases, the model is built once up front (outside the
+    tracker) and the action closes over it so only the phase work is
+    counted.
+    """
+    name = spec.name
+
+    if phase == "build":
+        yield (
+            f"benchmarks/test_build.py::test_build[{name}-n={size}]",
+            lambda: spec.build(size),
+        )
+        return
+
+    m = spec.build(size)
+
+    if phase == "matrices":
+        from benchmarks.phases import touch_matrices
+
+        yield (
+            f"benchmarks/test_matrices.py::test_matrices[{name}-n={size}]",
+            lambda: touch_matrices(m),
+        )
+
+    elif phase == "lp_write":
+        from benchmarks.phases import write_lp
+
+        tmpdir = tempfile.TemporaryDirectory()
+        lp_path = Path(tmpdir.name) / "m.lp"
+        try:
+            yield (
+                f"benchmarks/test_lp_write.py::test_lp_write[{name}-n={size}]",
+                lambda: write_lp(m, lp_path),
+            )
+        finally:
+            tmpdir.cleanup()
+
+    elif phase == "netcdf":
+        from benchmarks.phases import read_netcdf, write_netcdf
+
+        tmpdir = tempfile.TemporaryDirectory()
+        nc_path = Path(tmpdir.name) / "m.nc"
+        try:
+            yield (
+                f"benchmarks/test_netcdf.py::test_netcdf_write[{name}-n={size}]",
+                lambda: write_netcdf(m, nc_path),
+            )
+            # ``write_netcdf`` was called by the caller as part of the
+            # measurement, so ``nc_path`` now exists for the read.
+            yield (
+                f"benchmarks/test_netcdf.py::test_netcdf_read[{name}-n={size}]",
+                lambda: read_netcdf(nc_path),
+            )
+        finally:
+            tmpdir.cleanup()
+
+    elif phase == "solver_handoff":
+        from benchmarks.phases import SOLVER_HANDOFFS
+
+        # Memory currently tracks only HiGHS — look it up by name so a
+        # reordering of SOLVER_HANDOFFS doesn't silently swap solvers.
+        # Older linopy releases without ``to_highspy`` skip the phase
+        # silently rather than emitting an id with no possible match.
+        highs = next((w for n, _, w in SOLVER_HANDOFFS if n == "highs"), None)
+        if highs is None:
+            return
+
+        yield (
+            (
+                f"benchmarks/test_solver_handoff.py::test_solver_handoff"
+                f"[highs-{name}-n={size}]"
+            ),
+            lambda: highs(m),
+        )
+
+    else:
+        raise ValueError(f"unknown phase: {phase!r}")
+
+
+def run_phase(phase: str, quick: bool = False, repeats: int = 1) -> dict[str, float]:
+    """
+    Measure peak memory for every applicable ``(spec, size)`` under one phase.
+
+    Returns a ``{test_id: peak_mib}`` mapping. Invoked once per phase as a
+    subprocess by :func:`save` for isolation. ``repeats`` is forwarded to
+    :func:`measure_peak` so callers can dial up signal-to-noise.
+    """
+    _require_memray()
+
+    from benchmarks import REGISTRY
+
+    tag = _phase_tag(phase)
+    results: dict[str, float] = {}
+
+    for spec in REGISTRY.values():
+        if not spec.applies_to(tag):
+            continue
+
+        # Optional-dep gate (e.g. pypsa_scigrid needs pypsa).
+        for mod in spec.requires:
+            try:
+                __import__(mod)
+            except ImportError:
+                break
+        else:
+            for size in spec.sizes:
+                if quick and size > spec.quick_threshold:
+                    continue
+                try:
+                    for test_id, action in _measurements(phase, spec, size):
+                        try:
+                            results[test_id] = _measure_peak(action, repeats=repeats)
+                            print(
+                                f"  {test_id} → {results[test_id]:.1f} MiB",
+                                file=sys.stderr,
+                            )
+                        except Exception as exc:  # noqa: BLE001
+                            print(
+                                f"  skip {test_id}: {type(exc).__name__}: {exc}",
+                                file=sys.stderr,
+                            )
+                except Exception as exc:  # noqa: BLE001
+                    print(
+                        f"  setup failed {spec.name}/{size}: "
+                        f"{type(exc).__name__}: {exc}",
+                        file=sys.stderr,
+                    )
+                gc.collect()
+
+    return results
+
+
+def save(
+    label: str,
+    quick: bool = False,
+    phases: list[str] | None = None,
+    repeats: int = 1,
+) -> Path:
+    """
+    Run one subprocess per phase and merge the results into ``<label>.json``.
+
+    Per-phase subprocesses keep allocations from one phase out of another's
+    measurement; ``memray.Tracker`` only counts what's allocated inside its
+    ``with`` block, but the subprocess boundary makes the isolation total.
+    """
+    _require_memray()
+
+    phases = list(phases) if phases else list(MEMORY_PHASES)
+
+    all_results: dict[str, float] = {}
+    for phase in phases:
+        print(f"\n=== {phase} ===", file=sys.stderr)
+        # Worker writes JSON to a sidecar file rather than stdout — HiGHS
+        # (and other solvers) print to stdout from C code inside the tracked
+        # region, which would pollute the data channel.
+        fd, out_tmp = tempfile.mkstemp(suffix=".json", prefix=f"mem-{phase}-")
+        from os import close as _close
+
+        _close(fd)
         cmd = [
             sys.executable,
             "-m",
-            "pytest",
-            test_id,
-            "--memray",
-            "--benchmark-disable",
-            "-v",
-            "--tb=short",
-            "-q",
+            "benchmarks.memory",
+            "_worker",
+            phase,
+            "--out",
+            out_tmp,
         ]
-        result = subprocess.run(cmd, capture_output=True, text=True)
-        output = result.stdout + result.stderr
-
-        match = MEMORY_RE.search(output)
-        if match:
-            value = float(match.group(2))
-            unit = match.group(3)
-            mib = round(_to_mib(value, unit), 3)
-            entries[test_id] = mib
-            print(f"{mib:.1f} MiB")
-        elif "SKIPPED" in output or "skipped" in output:
-            print("skipped")
-        else:
-            print(
-                "WARNING: no memray data (pytest-memray output format may have changed)",
-                file=sys.stderr,
-            )
-
-    if not entries:
-        print("No memray results found. Is pytest-memray installed?", file=sys.stderr)
+        if quick:
+            cmd.append("--quick")
+        if repeats > 1:
+            cmd.extend(["--repeats", str(repeats)])
+        try:
+            result = subprocess.run(cmd, check=False, capture_output=True, text=True)
+            if result.stderr:
+                sys.stderr.write(result.stderr)
+            if result.returncode != 0:
+                print(
+                    f"phase {phase} subprocess failed (exit {result.returncode})",
+                    file=sys.stderr,
+                )
+                continue
+            try:
+                phase_results = json.loads(Path(out_tmp).read_text())
+            except (json.JSONDecodeError, FileNotFoundError) as exc:
+                print(f"phase {phase} JSON parse error: {exc}", file=sys.stderr)
+                continue
+            all_results.update(phase_results)
+        finally:
+            Path(out_tmp).unlink(missing_ok=True)
+
+    if not all_results:
+        print("No measurements produced.", file=sys.stderr)
         sys.exit(1)
 
     RESULTS_DIR.mkdir(parents=True, exist_ok=True)
-    out_path = RESULTS_DIR / f"{label}.json"
-    out_path.write_text(json.dumps({"label": label, "peak_mib": entries}, indent=2))
-    print(f"\nSaved {len(entries)} results to {out_path}")
+    out_path = write_memory_snapshot(RESULTS_DIR / f"{label}.json", label, all_results)
+    print(f"\nSaved {len(all_results)} measurements to {out_path}", file=sys.stderr)
     return out_path
 
 
 def compare(label_a: str, label_b: str) -> None:
-    """Compare two saved memory results."""
+    """Diff two saved memory snapshots side-by-side."""
     path_a = RESULTS_DIR / f"{label_a}.json"
     path_b = RESULTS_DIR / f"{label_b}.json"
     for p in (path_a, path_b):
@@ -144,8 +350,8 @@ def compare(label_a: str, label_b: str) -> None:
 
     all_tests = sorted(set(data_a) | set(data_b))
 
-    print(f"\n{'Test':<60} {label_a:>10} {label_b:>10} {'Change':>10}")
-    print("-" * 94)
+    print(f"\n{'Test':<70} {label_a:>10} {label_b:>10} {'Change':>10}")
+    print("-" * 104)
 
     for test in all_tests:
         a = data_a.get(test)
@@ -157,43 +363,31 @@ def compare(label_a: str, label_b: str) -> None:
             change = f"{pct:+.1f}%"
         else:
             change = "—"
-        # Shorten test name for readability
         short = test.split("::")[-1] if "::" in test else test
-        print(f"{short:<60} {a_str:>10} {b_str:>10} {change:>10}")
+        print(f"{short:<70} {a_str:>10} {b_str:>10} {change:>10}")
 
     print()
 
 
-def main():
-    parser = argparse.ArgumentParser(
-        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
-    )
-    sub = parser.add_subparsers(dest="cmd", required=True)
+# ---- subprocess worker ---------------------------------------------------
 
-    p_save = sub.add_parser("save", help="Run benchmarks and save memory results")
-    p_save.add_argument(
-        "label", help="Label for this run (e.g. 'master', 'my-feature')"
+if __name__ == "__main__":  # pragma: no cover
+    parser = argparse.ArgumentParser(description="memory.py worker")
+    parser.add_argument("cmd", choices=["_worker"])
+    parser.add_argument("phase")
+    parser.add_argument("--quick", action="store_true")
+    parser.add_argument(
+        "--repeats",
+        type=int,
+        default=1,
+        help="Run each measurement N times and keep the min peak (default 1).",
     )
-    p_save.add_argument(
-        "--quick", action="store_true", help="Use smaller problem sizes"
+    parser.add_argument(
+        "--out",
+        required=True,
+        help="Path to write the JSON result to (stdout is reserved for solver chatter).",
     )
-    p_save.add_argument(
-        "--test-path",
-        nargs="+",
-        default=None,
-        help="Test file(s) to run (default: all phases)",
-    )
-
-    p_cmp = sub.add_parser("compare", help="Compare two saved runs")
-    p_cmp.add_argument("label_a", help="First run label (baseline)")
-    p_cmp.add_argument("label_b", help="Second run label")
-
     args = parser.parse_args()
-    if args.cmd == "save":
-        save(args.label, quick=args.quick, test_paths=args.test_path)
-    elif args.cmd == "compare":
-        compare(args.label_a, args.label_b)
-
-
-if __name__ == "__main__":
-    main()
+    if args.cmd == "_worker":
+        out = run_phase(args.phase, quick=args.quick, repeats=args.repeats)
+        Path(args.out).write_text(json.dumps(out))
diff --git a/benchmarks/models/__init__.py b/benchmarks/models/__init__.py
index fcff9caf..a471b216 100644
--- a/benchmarks/models/__init__.py
+++ b/benchmarks/models/__init__.py
@@ -1,21 +1,24 @@
-"""Model builders for benchmarks."""
+"""
+Model builders for benchmarks.
 
-from benchmarks.models.basic import SIZES as BASIC_SIZES
-from benchmarks.models.basic import build_basic
-from benchmarks.models.expression_arithmetic import SIZES as EXPR_SIZES
-from benchmarks.models.expression_arithmetic import build_expression_arithmetic
-from benchmarks.models.knapsack import SIZES as KNAPSACK_SIZES
-from benchmarks.models.knapsack import build_knapsack
-from benchmarks.models.sparse_network import SIZES as SPARSE_SIZES
-from benchmarks.models.sparse_network import build_sparse_network
+Importing this package triggers every submodule's ``register(...)`` call,
+populating :data:`benchmarks.registry.REGISTRY`. Each submodule exposes a
+``build_<name>(size) -> linopy.Model`` callable and a module-level ``SPEC``
+:class:`~benchmarks.registry.ModelSpec`. The documented access path is
+``REGISTRY["<name>"]``; submodule re-exports are intentionally not exposed
+here so that adding a new model is one new file plus one import below.
+"""
 
-__all__ = [
-    "BASIC_SIZES",
-    "EXPR_SIZES",
-    "KNAPSACK_SIZES",
-    "SPARSE_SIZES",
-    "build_basic",
-    "build_expression_arithmetic",
-    "build_knapsack",
-    "build_sparse_network",
-]
+# Side-effect imports — each module calls ``register(...)`` at import time.
+from benchmarks.models import (  # noqa: F401
+    basic,
+    expression_arithmetic,
+    knapsack,
+    masked,
+    milp,
+    piecewise,
+    pypsa_scigrid,
+    qp,
+    sos,
+    sparse_network,
+)
diff --git a/benchmarks/models/basic.py b/benchmarks/models/basic.py
index 2aea49d9..6959e188 100644
--- a/benchmarks/models/basic.py
+++ b/benchmarks/models/basic.py
@@ -1,10 +1,11 @@
-"""Basic benchmark model: 2*N^2 variables and constraints."""
+"""Basic benchmark model: 2*N^2 variables and constraints (continuous LP)."""
 
 from __future__ import annotations
 
 import linopy
+from benchmarks.registry import CONTINUOUS, ModelSpec, register
 
-SIZES = [10, 50, 100, 250, 500, 1000, 1600]
+SIZES = (10, 50, 100, 250, 500, 1000, 1600)
 
 
 def build_basic(n: int) -> linopy.Model:
@@ -16,3 +17,15 @@ def build_basic(n: int) -> linopy.Model:
     m.add_constraints(x - y >= -5, name="lower")
     m.add_objective(x.sum() + 2 * y.sum())
     return m
+
+
+SPEC = register(
+    ModelSpec(
+        name="basic",
+        build=build_basic,
+        sizes=SIZES,
+        features=frozenset({CONTINUOUS}),
+        quick_threshold=10,
+        long_threshold=500,
+    )
+)
diff --git a/benchmarks/models/expression_arithmetic.py b/benchmarks/models/expression_arithmetic.py
index 339c651d..80590951 100644
--- a/benchmarks/models/expression_arithmetic.py
+++ b/benchmarks/models/expression_arithmetic.py
@@ -5,8 +5,9 @@
 import numpy as np
 
 import linopy
+from benchmarks.registry import CONTINUOUS, ModelSpec, register
 
-SIZES = [10, 50, 100, 250, 500, 1000]
+SIZES = (10, 50, 100, 250, 500, 1000)
 
 
 def build_expression_arithmetic(n: int) -> linopy.Model:
@@ -28,3 +29,15 @@ def build_expression_arithmetic(n: int) -> linopy.Model:
     m.add_constraints(expr1.sum("j") >= -10, name="row_sum")
     m.add_objective(combined.sum())
     return m
+
+
+SPEC = register(
+    ModelSpec(
+        name="expression_arithmetic",
+        build=build_expression_arithmetic,
+        sizes=SIZES,
+        features=frozenset({CONTINUOUS}),
+        quick_threshold=10,
+        long_threshold=500,
+    )
+)
diff --git a/benchmarks/models/knapsack.py b/benchmarks/models/knapsack.py
index 83ce7394..7860f285 100644
--- a/benchmarks/models/knapsack.py
+++ b/benchmarks/models/knapsack.py
@@ -1,12 +1,13 @@
-"""Knapsack benchmark model: N binary variables, 1 constraint."""
+"""Knapsack benchmark model: N binary variables, 1 constraint (MILP, binary)."""
 
 from __future__ import annotations
 
 import numpy as np
 
 import linopy
+from benchmarks.registry import BINARY, DEFAULT_PHASES, ModelSpec, register
 
-SIZES = [100, 1_000, 10_000, 100_000, 1_000_000]
+SIZES = (100, 1_000, 10_000, 100_000, 1_000_000)
 
 
 def build_knapsack(n: int) -> linopy.Model:
@@ -21,3 +22,16 @@ def build_knapsack(n: int) -> linopy.Model:
     m.add_constraints((x * weights).sum() <= capacity, name="capacity")
     m.add_objective(-(x * values).sum())
     return m
+
+
+SPEC = register(
+    ModelSpec(
+        name="knapsack",
+        build=build_knapsack,
+        sizes=SIZES,
+        features=frozenset({BINARY}),
+        phases=DEFAULT_PHASES,  # HiGHS handles binary; matrices handles MILP
+        quick_threshold=100,
+        long_threshold=10_000,
+    )
+)
diff --git a/benchmarks/models/masked.py b/benchmarks/models/masked.py
new file mode 100644
index 00000000..fccac137
--- /dev/null
+++ b/benchmarks/models/masked.py
@@ -0,0 +1,91 @@
+"""
+Masked-variables benchmark: transportation with sparse allowed routes.
+
+A standard transportation LP, but only a sparse subset of (origin, dest) pairs
+are valid routes. The ``mask=`` keyword on ``add_variables`` skips the rest,
+keeping the variable count sub-quadratic.
+
+Decision variables:
+    x[origin, dest] >= 0   continuous, only created for allowed routes
+
+Constraints:
+    sum_dest x[o, .]   <= supply[o]
+    sum_orig x[., d]   == demand[d]
+
+Objective:
+    minimize  sum cost[o, d] * x[o, d]
+
+The mask is dense at small sizes and sparser at large sizes, mimicking
+real-world transport networks where each origin only serves a fixed
+fan-out regardless of total node count.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import xarray as xr
+
+import linopy
+from benchmarks.registry import (
+    CONTINUOUS,
+    DEFAULT_PHASES,
+    MASKED,
+    ModelSpec,
+    register,
+)
+
+SIZES = (10, 50, 100, 500, 1000)
+
+
+def build_masked(n: int) -> linopy.Model:
+    rng = np.random.default_rng(42)
+    origins = np.arange(n)
+    dests = np.arange(n)
+
+    # Each origin serves exactly ``min(20, n)`` distinct destinations.
+    fan_out = min(20, n)
+    mask_np = np.zeros((n, n), dtype=bool)
+    for o in range(n):
+        # Fixed-size draw from the seeded RNG: the same ``n`` gives the same routes.
+        targets = rng.choice(n, size=fan_out, replace=False)
+        mask_np[o, targets] = True
+
+    mask = xr.DataArray(mask_np, coords=[("origin", origins), ("dest", dests)])
+    cost = xr.DataArray(
+        rng.uniform(1, 10, size=(n, n)),
+        coords=[("origin", origins), ("dest", dests)],
+    )
+
+    # Generous supply: each origin may ship up to ``demand_per_dest * n``
+    # units, which equals the total demand summed over all destinations.
+    demand_per_dest = 5.0
+    supply_per_origin = demand_per_dest * n  # plenty of slack
+    supply = xr.DataArray(np.full(n, supply_per_origin), coords=[("origin", origins)])
+    demand = xr.DataArray(np.full(n, demand_per_dest), coords=[("dest", dests)])
+
+    m = linopy.Model()
+    x = m.add_variables(
+        lower=0,
+        coords=[("origin", origins), ("dest", dests)],
+        mask=mask,
+        name="x",
+    )
+
+    m.add_constraints(x.sum("dest") <= supply, name="supply", mask=mask.any("dest"))
+    m.add_constraints(x.sum("origin") == demand, name="demand", mask=mask.any("origin"))
+
+    m.add_objective((cost * x).sum())
+    return m
+
+
+SPEC = register(
+    ModelSpec(
+        name="masked",
+        build=build_masked,
+        sizes=SIZES,
+        features=frozenset({CONTINUOUS, MASKED}),
+        phases=DEFAULT_PHASES,
+        quick_threshold=10,
+        long_threshold=500,
+    )
+)
diff --git a/benchmarks/models/milp.py b/benchmarks/models/milp.py
new file mode 100644
index 00000000..e762f207
--- /dev/null
+++ b/benchmarks/models/milp.py
@@ -0,0 +1,80 @@
+"""
+MILP benchmark: capacitated facility location with general integers.
+
+Decision variables:
+    y_f  in {0,1,...,K}      integer "modules" to open at facility f
+    x_{f,c} >= 0             continuous flow from facility f to customer c
+
+Constraints:
+    sum_c x_{f,c}  <=  cap * y_f       (capacity per facility)
+    sum_f x_{f,c}  ==  d_c             (demand at each customer)
+
+Objective:
+    minimize  sum_{f,c} t_{f,c} * x_{f,c}  +  sum_f f_f * y_f
+
+The general-integer ``y`` exercises the matrix accessor's MIP integer-section
+path and the LP-writer's general-integer block — neither the binary knapsack
+nor the continuous LPs hit those paths.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+
+import linopy
+from benchmarks.registry import (
+    CONTINUOUS,
+    DEFAULT_PHASES,
+    INTEGER,
+    ModelSpec,
+    register,
+)
+
+SIZES = (10, 25, 50, 100, 200)
+
+
+def build_milp(n: int) -> linopy.Model:
+    rng = np.random.default_rng(42)
+    facilities = np.arange(n)
+    customers = np.arange(n)
+
+    cap = 100.0  # capacity per module
+    Y_MAX = 5  # max modules per facility
+    transport = rng.uniform(1, 20, size=(n, n))  # per-unit shipping cost
+    fixed = rng.uniform(50, 200, size=n)  # cost per facility module
+    demand = rng.uniform(20, 80, size=n)  # demand at each customer
+
+    m = linopy.Model()
+    y = m.add_variables(
+        lower=0,
+        upper=Y_MAX,
+        coords=[facilities],
+        dims=["facility"],
+        integer=True,
+        name="y",
+    )
+    x = m.add_variables(
+        lower=0,
+        coords=[facilities, customers],
+        dims=["facility", "customer"],
+        name="x",
+    )
+
+    m.add_constraints(x.sum("customer") - cap * y <= 0, name="capacity")
+    m.add_constraints(x.sum("facility") == demand, name="demand")
+
+    m.add_objective((transport * x).sum() + (fixed * y).sum())
+    return m
+
+
+SPEC = register(
+    ModelSpec(
+        name="milp",
+        build=build_milp,
+        sizes=SIZES,
+        features=frozenset({INTEGER, CONTINUOUS}),
+        phases=DEFAULT_PHASES,
+        quick_threshold=10,
+        long_threshold=100,
+    )
+)
diff --git a/benchmarks/models/piecewise.py b/benchmarks/models/piecewise.py
new file mode 100644
index 00000000..77157ba1
--- /dev/null
+++ b/benchmarks/models/piecewise.py
@@ -0,0 +1,94 @@
+"""
+Piecewise-linear benchmark: generation with piecewise fuel-cost curves.
+
+Each generator has a piecewise fuel cost curve pinned via
+``add_piecewise_formulation``. The default ``method="auto"`` picks an
+SOS2 or incremental expansion, generating auxiliary variables and
+constraints — that overhead is what we want to measure.
+
+Decision variables:
+    power[gen]  in [0, 100]      (continuous)
+    fuel[gen]   in [0, inf)      (continuous, pinned to piecewise curve)
+
+Constraints:
+    sum_gen power[gen]  >=  demand
+    piecewise:  fuel[gen] = f(power[gen])    for each gen
+
+Objective:
+    minimize  sum_gen fuel[gen]
+"""
+
+from __future__ import annotations
+
+import warnings
+
+import linopy
+from benchmarks.registry import (
+    CONTINUOUS,
+    DEFAULT_PHASES,
+    PIECEWISE,
+    ModelSpec,
+    register,
+)
+
+SIZES = (10, 100, 1_000, 5_000)  # number of generators (``n_gens``)
+
+_API_AVAILABLE = hasattr(linopy.Model, "add_piecewise_formulation") and hasattr(
+    linopy, "EvolvingAPIWarning"
+)  # both names are used by build_piecewise below
+
+
+def build_piecewise(n_gens: int) -> linopy.Model:
+    """
+    Build the piecewise fuel-cost model for ``n_gens`` generators.
+
+    Each generator gets a ``power`` and a ``fuel`` variable, linked via
+    ``add_piecewise_formulation`` over one shared set of breakpoints.
+    """
+    # One breakpoint set, broadcast across all generators.
+    power_breaks = [0.0, 30.0, 60.0, 100.0]
+    fuel_breaks = [0.0, 36.0, 84.0, 170.0]  # convex-ish fuel curve
+
+    model = linopy.Model()
+    gens = range(n_gens)
+    power = model.add_variables(
+        lower=0, upper=100, coords=[gens], dims=["gen"], name="power"
+    )
+    fuel = model.add_variables(
+        lower=0, coords=[gens], dims=["gen"], name="fuel"
+    )
+
+    # Total demand is half of ``n_gens`` times the last power breakpoint,
+    # i.e. half of what the fleet could produce at its upper bound.
+    required = 0.5 * n_gens * power_breaks[-1]
+    model.add_constraints(power.sum() >= required, name="demand")
+
+    # Suppress the evolving-API warning for this one call only.
+    with warnings.catch_warnings():
+        warnings.simplefilter("ignore", category=linopy.EvolvingAPIWarning)
+        model.add_piecewise_formulation((power, power_breaks), (fuel, fuel_breaks))
+
+    model.add_objective(fuel.sum())
+    return model
+
+
+# ``add_piecewise_formulation`` is a recent (still-evolving) API. Skip
+# registration silently on older linopy so the rest of the suite stays usable.
+SPEC: ModelSpec | None  # None when the installed linopy lacks the piecewise API
+if _API_AVAILABLE:
+    SPEC = register(
+        ModelSpec(
+            name="piecewise",
+            build=build_piecewise,
+            sizes=SIZES,
+            features=frozenset({CONTINUOUS, PIECEWISE}),
+            # Monotonic breakpoints + ``method="auto"`` → incremental
+            # reformulation (pure MILP with binaries), which every supported
+            # solver handles.
+            phases=DEFAULT_PHASES,
+            quick_threshold=10,  # largest size kept under ``pytest --quick``
+            long_threshold=1_000,  # presumably the --long cutoff — confirm in registry
+        )
+    )
+else:
+    SPEC = None
diff --git a/benchmarks/models/pypsa_scigrid.py b/benchmarks/models/pypsa_scigrid.py
index 2fcce217..41d8836b 100644
--- a/benchmarks/models/pypsa_scigrid.py
+++ b/benchmarks/models/pypsa_scigrid.py
@@ -1,13 +1,15 @@
-"""PyPSA SciGrid-DE benchmark model."""
+"""PyPSA SciGrid-DE benchmark model (requires pypsa)."""
 
 from __future__ import annotations
 
 from typing import TYPE_CHECKING
 
+from benchmarks.registry import CONTINUOUS, ModelSpec, register
+
 if TYPE_CHECKING:
     import linopy
 
-SIZES = [10, 50, 100, 200]
+SIZES = (10, 50, 100, 200)  # snapshot counts passed to build_pypsa_scigrid
 
 
 def build_pypsa_scigrid(snapshots: int = 100) -> linopy.Model:
@@ -18,3 +20,19 @@ def build_pypsa_scigrid(snapshots: int = 100) -> linopy.Model:
     n.set_snapshots(n.snapshots[:snapshots])
     n.optimize.create_model()
     return n.model
+
+
+SPEC = register(
+    ModelSpec(
+        name="pypsa_scigrid",
+        build=build_pypsa_scigrid,
+        sizes=SIZES,
+        features=frozenset({CONTINUOUS}),
+        # quick_threshold=0 keeps pypsa_scigrid out of --quick entirely —
+        # PyPSA import + example loading dominates the smoke wall-clock
+        # otherwise. It still runs in default and --long modes.
+        quick_threshold=0,
+        long_threshold=50,  # presumably the --long cutoff — confirm in registry
+        requires=("pypsa",),  # extra module handed to ``pytest.importorskip``
+    )
+)
diff --git a/benchmarks/models/qp.py b/benchmarks/models/qp.py
new file mode 100644
index 00000000..a040df45
--- /dev/null
+++ b/benchmarks/models/qp.py
@@ -0,0 +1,66 @@
+"""
+QP benchmark: continuous quadratic objective on a portfolio-style model.
+
+Decision variables:
+    x_i  >= 0   (weight on asset i, continuous)
+
+Constraints:
+    sum_i x_i  == 1
+    x_i        <= 0.3        (no asset > 30% of portfolio)
+
+Objective:
+    minimize  sum_i q_i * x_i^2  -  sum_i r_i * x_i
+
+A pure diagonal quadratic — enough to exercise the QP build / write / matrix
+paths without paying for cross-terms. Cross-term coupling needs single-term
+factors on both sides (see ``LinearExpression._multiply_by_linear_expression``),
+which is awkward to set up cleanly via the public API.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+
+import linopy
+from benchmarks.registry import (
+    CONTINUOUS,
+    DEFAULT_PHASES,
+    QUADRATIC,
+    ModelSpec,
+    register,
+)
+
+SIZES = (10, 100, 1_000, 5_000, 20_000)  # number of assets (``n_assets``)
+
+
+def build_qp(n_assets: int) -> linopy.Model:
+    """Build the diagonal-QP portfolio model over ``n_assets`` assets."""
+    rng = np.random.default_rng(42)
+    # Seeded draws: keep this order (quadratic coefficients, then linear).
+    quad_coef = rng.uniform(0.5, 2.0, size=n_assets)
+    lin_coef = rng.uniform(0.05, 0.15, size=n_assets)
+
+    model = linopy.Model()
+    # The per-asset 0.3 cap is expressed as the variable's upper bound.
+    weights = model.add_variables(
+        lower=0, upper=0.3, coords=[range(n_assets)], dims=["asset"], name="x"
+    )
+    model.add_constraints(weights.sum() == 1, name="budget")
+
+    quadratic_part = (quad_coef * weights**2).sum()
+    linear_part = (lin_coef * weights).sum()
+    model.add_objective(quadratic_part - linear_part)
+    return model
+
+
+SPEC = register(
+    ModelSpec(
+        name="qp",
+        build=build_qp,
+        sizes=SIZES,
+        features=frozenset({CONTINUOUS, QUADRATIC}),
+        phases=DEFAULT_PHASES,
+        quick_threshold=10,  # largest size kept under ``pytest --quick``
+        long_threshold=1_000,  # presumably the --long cutoff — confirm in registry
+    )
+)
diff --git a/benchmarks/models/sos.py b/benchmarks/models/sos.py
new file mode 100644
index 00000000..26ff2fb7
--- /dev/null
+++ b/benchmarks/models/sos.py
@@ -0,0 +1,99 @@
+"""
+SOS1 benchmark: multi-mode generation with at-most-one-mode-per-generator.
+
+Each generator has ``n_modes`` operating modes (different cap/cost tradeoff).
+SOS1 over the ``mode`` dimension enforces that each generator picks at most
+one mode.
+
+Decision variables:
+    y[gen, mode]  >= 0     continuous output per (generator, mode)
+
+Constraints:
+    y[gen, mode]  <= cap[mode]
+    sum_{gen,mode} y  >= demand_total
+    SOS1 over "mode" for each gen
+
+This benchmark exercises ``Model.add_sos_constraints`` (commits be6d3a3 /
+8aa8d0c) and the LP-writer's SOS section. In linopy, native SOS support is
+declared by Gurobi / Cplex / Xpress only (see ``SolverFeature.SOS_CONSTRAINTS``).
+HiGHS and Mosek would need ``apply_sos_reformulation()`` first.
+"""
+
+from __future__ import annotations
+
+import numpy as np
+import xarray as xr
+
+import linopy
+from benchmarks.registry import (
+    BUILD,
+    CONTINUOUS,
+    LP_WRITE,
+    MATRICES,
+    NETCDF,
+    SOS,
+    TO_GUROBIPY,
+    TO_XPRESS,
+    ModelSpec,
+    register,
+)
+
+SIZES = (10, 100, 1_000, 10_000)  # number of generators (``n_gens``)
+
+_N_MODES = 5  # operating modes per generator (length of the ``mode`` dim)
+_API_AVAILABLE = hasattr(linopy.Model, "add_sos_constraints")  # absent in older linopy
+
+
+def build_sos(n_gens: int) -> linopy.Model:
+    """Build the SOS1 multi-mode model for ``n_gens`` generators."""
+    mode_ids = np.arange(_N_MODES)
+    mode_axis = [("mode", mode_ids)]
+    mode_cap = xr.DataArray(np.linspace(20.0, 100.0, _N_MODES), coords=mode_axis)
+    mode_cost = xr.DataArray(np.linspace(1.0, 8.0, _N_MODES), coords=mode_axis)
+    largest_cap = float(mode_cap.max())
+
+    model = linopy.Model()
+    output = model.add_variables(
+        lower=0,
+        upper=largest_cap,
+        coords=[range(n_gens), mode_ids],
+        dims=["gen", "mode"],
+        name="y",
+    )
+    model.add_constraints(output <= mode_cap, name="mode_cap")
+    # Total demand: 40% of ``n_gens`` times the largest per-mode capacity.
+    model.add_constraints(output.sum() >= 0.4 * n_gens * largest_cap, name="demand")
+    model.add_sos_constraints(output, sos_type=1, sos_dim="mode")
+    model.add_objective((mode_cost * output).sum())
+    return model
+
+
+# ``add_sos_constraints`` is a recent API. On older linopy we silently skip
+# registering this model — the rest of the suite stays usable.
+SPEC: ModelSpec | None  # None when the installed linopy lacks add_sos_constraints
+if _API_AVAILABLE:
+    SPEC = register(
+        ModelSpec(
+            name="sos",
+            build=build_sos,
+            sizes=SIZES,
+            features=frozenset({CONTINUOUS, SOS}),
+            # HiGHS / Mosek lack native SOS in linopy — would need
+            # ``reformulate_sos=True``, which mutates the model and defeats
+            # the benchmark. Only solvers with native SOS appear here.
+            phases=frozenset(  # TO_HIGHSPY / TO_MOSEK deliberately left out
+                {
+                    BUILD,
+                    MATRICES,
+                    LP_WRITE,
+                    NETCDF,
+                    TO_GUROBIPY,
+                    TO_XPRESS,
+                }
+            ),
+            quick_threshold=10,  # largest size kept under ``pytest --quick``
+            long_threshold=1_000,  # presumably the --long cutoff — confirm in registry
+        )
+    )
+else:
+    SPEC = None
diff --git a/benchmarks/models/sparse_network.py b/benchmarks/models/sparse_network.py
index afc6be06..7ac71db1 100644
--- a/benchmarks/models/sparse_network.py
+++ b/benchmarks/models/sparse_network.py
@@ -7,8 +7,9 @@
 import xarray as xr
 
 import linopy
+from benchmarks.registry import CONTINUOUS, ModelSpec, register
 
-SIZES = [10, 50, 100, 250, 500, 1000]
+SIZES = (10, 50, 100, 250, 500, 1000)  # bus counts passed as ``n_buses``
 
 
 def build_sparse_network(n_buses: int) -> linopy.Model:
@@ -48,3 +49,15 @@ def build_sparse_network(n_buses: int) -> linopy.Model:
 
     m.add_objective(gen.sum())
     return m
+
+
+SPEC = register(
+    ModelSpec(
+        name="sparse_network",
+        build=build_sparse_network,
+        sizes=SIZES,
+        features=frozenset({CONTINUOUS}),
+        quick_threshold=10,  # largest size kept under ``pytest --quick``
+        long_threshold=500,  # presumably the --long cutoff — confirm in registry
+    )
+)
diff --git a/benchmarks/phases.py b/benchmarks/phases.py
new file mode 100644
index 00000000..58e7e67f
--- /dev/null
+++ b/benchmarks/phases.py
@@ -0,0 +1,86 @@
+"""
+Single source of truth for *what each benchmark phase does to a model*.
+
+Both drivers import these verbs:
+
+- the pytest ``test_<phase>.py`` files wrap them in ``benchmark(...)``;
+- ``memory.py`` wraps them in ``memray.Tracker(...)``.
+
+So the measured operation is defined once. Setup — building the model,
+creating scratch files — stays in the caller; only the verb itself
+lives here.
+"""
+
+from __future__ import annotations
+
+import inspect
+from collections.abc import Callable
+from pathlib import Path
+
+import linopy
+import linopy.io as lio
+from benchmarks.registry import TO_GUROBIPY, TO_HIGHSPY, TO_MOSEK, TO_XPRESS
+from linopy import read_netcdf
+
+# linopy <0.4.1's ``to_file`` doesn't accept ``progress``. Check once
+# at import so the benchmark loop stays branchless on the hot path.
+_TO_FILE_HAS_PROGRESS = "progress" in inspect.signature(linopy.Model.to_file).parameters
+
+# Re-export so callers can ``from benchmarks.phases import read_netcdf``
+# alongside the wrappers.
+__all__ = [
+    "SOLVER_HANDOFFS",
+    "read_netcdf",
+    "touch_matrices",
+    "write_lp",
+    "write_netcdf",
+]
+
+
+def touch_matrices(m: linopy.Model) -> None:
+    """Materialise every matrix block by accessing it — that is what we time."""
+    matrices = m.matrices
+    for block in ("A", "b", "c", "lb", "ub", "sense", "vlabels", "clabels"):
+        getattr(matrices, block)
+    if m.is_quadratic:
+        getattr(matrices, "Q")  # only touched when the model is quadratic
+
+
+def write_lp(m: linopy.Model, path: Path) -> None:
+    """
+    Write the model to ``path`` as an LP file via ``Model.to_file``.
+
+    ``progress=False`` is passed whenever the installed linopy accepts
+    it, so progress-bar overhead cannot leak into the measurement and
+    every driver times the same call. linopy <0.4.1 does not accept
+    the kwarg and gets the plain call instead.
+    """
+    # Capability was probed once at import time (``_TO_FILE_HAS_PROGRESS``),
+    # so no signature inspection happens inside the measured call.
+    extra = {"progress": False} if _TO_FILE_HAS_PROGRESS else {}
+    m.to_file(path, **extra)
+
+
+def write_netcdf(m: linopy.Model, path: Path) -> None:
+    m.to_netcdf(path)  # thin wrapper so every driver measures the same call
+
+
+# (solver_name, registry phase tag, wrapper) — consumed by the pytest
+# parametrization in ``test_solver_handoff.py`` and by ``memory.py``,
+# which looks up the "highs" entry. Adding a solver here automatically
+# extends both drivers.
+#
+# Each wrapper is fetched via ``getattr`` so the tuple silently drops
+# any solver wrapper missing from the installed ``linopy`` — necessary
+# for cross-version ``sweep`` runs against older releases (e.g.
+# ``to_xpress`` doesn't exist before linopy 0.7.1).
+SOLVER_HANDOFFS: tuple[tuple[str, str, Callable[[linopy.Model], object]], ...] = tuple(
+    (name, tag, wrapper)
+    for name, tag, wrapper in (
+        ("highs", TO_HIGHSPY, getattr(lio, "to_highspy", None)),
+        ("gurobi", TO_GUROBIPY, getattr(lio, "to_gurobipy", None)),
+        ("mosek", TO_MOSEK, getattr(lio, "to_mosek", None)),
+        ("xpress", TO_XPRESS, getattr(lio, "to_xpress", None)),
+    )
+    if wrapper is not None  # skip wrappers the installed linopy lacks
+)
diff --git a/benchmarks/plotting.py b/benchmarks/plotting.py
new file mode 100644
index 00000000..9a107de2
--- /dev/null
+++ b/benchmarks/plotting.py
@@ -0,0 +1,481 @@
+"""
+Interactive plotly views over pytest-benchmark JSON snapshots.
+
+Four opinionated views, each returning ``(figure, number of tests)``:
+
+- :func:`plot_compare` (2 snapshots) — per-test delta bar chart.
+- :func:`plot_scatter` (2+ snapshots) — baseline cost vs ratio scatter.
+- :func:`plot_sweep` (3+ snapshots) — heatmap of per-test ratio relative
+  to the first snapshot. Useful for cross-version sweeps.
+- :func:`plot_scaling` (1 snapshot) — log-log time vs ``n``, by phase.
+
+All four accept a ``metric`` argument selecting which pytest-benchmark
+stat drives the plot. Default is ``min`` — the fastest observed sample
+approximates the no-noise floor (GC, scheduling, cache thrash can only
+add time). ``median`` is more robust to a single weirdly-fast warmup
+round; ``mean`` and ``max`` are also accepted.
+
+plotly is imported lazily by the dispatcher so the rest of the benchmark
+suite still works without it.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+from pathlib import Path
+from typing import TYPE_CHECKING, Literal
+
+from benchmarks.snapshot import Metric, load_long_df
+
+if TYPE_CHECKING:
+    from plotly.graph_objects import Figure
+
+PlotView = Literal["compare", "scatter", "sweep", "scaling"]  # RENDERERS keys
+SortMode = Literal["absolute", "relative"]  # delta shown as raw unit or percent
+FacetBy = Literal["phase", "model"]  # subplot split used by compare / scatter
+
+
+def _axis_kwargs(unit: str) -> dict:
+    """Build ``update_xaxes`` kwargs: SI-prefixed ticks for seconds, else plain."""
+    si_seconds = {"tickformat": ".2s", "ticksuffix": "s"}
+    plain_suffix = {"ticksuffix": f" {unit}"}
+    return si_seconds if unit == "s" else plain_suffix
+
+
+def _hide_non_leftmost_yticks(fig: Figure, wrap: int) -> None:
+    """
+    Show y tick labels only on the first facet of each wrapped row.
+
+    Axes are visited in ``for_each_yaxis`` order; with
+    ``facet_col_wrap=N`` every axis whose position is not a multiple
+    of ``N`` (i.e. not 0, N, 2N…) has its tick labels switched off, so
+    category labels are not repeated at every subplot's left edge.
+    """
+    collected = []
+    fig.for_each_yaxis(collected.append)
+    for position, axis in enumerate(collected):
+        if position % wrap:
+            axis.update(showticklabels=False)
+
+
+def _share_axis_labels(fig: Figure, y_label: str, x_label: str) -> None:
+    """
+    Replace per-facet axis titles with one shared label per axis.
+
+    Plotly express titles every facet by default, which is noisy on a
+    wrapped grid of many subplots. Here every x/y axis title is
+    blanked, then up to two annotations in ``paper`` coordinates are
+    added: ``y_label`` rotated on the left and ``x_label`` along the
+    bottom. An empty string skips that side. The left and bottom
+    margins are widened either way.
+    """
+
+    def _clear_title(axis):
+        axis.update(title_text="")
+
+    fig.for_each_yaxis(_clear_title)
+    fig.for_each_xaxis(_clear_title)
+
+    # Settings common to both annotations: paper coordinates, no arrow.
+    shared = {
+        "xref": "paper",
+        "yref": "paper",
+        "showarrow": False,
+        "font": {"size": 13},
+    }
+
+    if y_label:
+        # Rotated label just left of the plotting area, vertically centred.
+        fig.add_annotation(text=y_label, x=-0.05, y=0.5, textangle=-90, **shared)
+    if x_label:
+        # Horizontal label just below the plotting area, centred.
+        fig.add_annotation(text=x_label, x=0.5, y=-0.08, **shared)
+
+    # Give the annotations room.
+    fig.update_layout(margin={"l": 90, "b": 70})
+
+
+def plot_compare(
+    snapshots: list[Path],
+    metric: Metric = "min",
+    sort: SortMode = "absolute",
+    facets: FacetBy | None = None,
+) -> tuple[Figure, int]:
+    """
+    Bar chart of delta per test, in alphabetical test-id order.
+
+    ``sort`` chooses the bar *dimension*: ``absolute`` (default) plots
+    ``b - a`` in the data's native unit; ``relative`` plots the percent
+    change. Bars are not reordered by magnitude — alphabetical ids keep
+    related tests visually grouped. Use the scatter view for hunting
+    outliers.
+
+    ``facets`` splits the chart into subplots:
+
+    - ``None`` (default): one flat bar chart.
+    - ``"phase"``: facet by the test file (``test_build``,
+      ``test_lp_write``, ...). Best for "everything in this phase moved
+      together?".
+    - ``"model"``: facet by the model name (``basic``, ``knapsack``, ...).
+      Best for "what happened across all the basic-sized variants?".
+
+    Tests whose IDs don't match the standard ``[<model>-n=<size>]``
+    parametrize shape (e.g. PyPSA carbon-management) land in an
+    ``other`` facet.
+
+    Raises ``ValueError`` when fewer than two snapshots are passed or
+    when the two snapshots have no test in common.
+    """
+    import sys
+
+    import plotly.express as px
+
+    if len(snapshots) < 2:
+        raise ValueError("compare needs 2 snapshots (baseline + candidate)")
+    df_long, unit = load_long_df(snapshots[:2], metric)
+    metric_label = metric if unit == "s" else "peak"
+
+    labels = df_long["snapshot"].drop_duplicates().tolist()
+    a_label, b_label = labels[0], labels[1]
+
+    # Pivot to wide: one row per test, baseline + candidate as columns,
+    # phase / model / size carried through; deltas are then vectorised.
+    wide = (
+        df_long.pivot(
+            index=["test_id", "phase", "model", "size"],
+            columns="snapshot",
+            values="value",
+        )
+        .reset_index()
+        .rename_axis(columns=None)
+    )
+    only_a = wide[wide[a_label].notna() & wide[b_label].isna()]
+    only_b = wide[wide[a_label].isna() & wide[b_label].notna()]
+    df = wide.dropna(subset=[a_label, b_label]).copy()
+    if df.empty:
+        raise ValueError("no tests in common between the two snapshots")
+    if len(only_a) or len(only_b):
+        print(
+            f"compare: {len(only_a)} test(s) only in {a_label}, "
+            f"{len(only_b)} only in {b_label} (intersection: {len(df)}).",
+            file=sys.stderr,
+        )
+
+    df["delta_abs"] = df[b_label] - df[a_label]
+    df["delta_pct"] = (df["delta_abs"] / df[a_label]) * 100.0
+    df = df.sort_values("test_id").reset_index(drop=True)
+    x_col = "delta_abs" if sort == "absolute" else "delta_pct"
+
+    if sort == "absolute":
+        x_label = f"{metric_label} delta ({unit})"
+        text_fmt = ".2s" if unit == "s" else ".2f"
+    else:
+        x_label = f"{metric_label} delta %"
+        text_fmt = ".1f"
+
+    direction = "slower" if unit == "s" else "more memory"
+    title = (
+        f"{metric_label} delta ({sort}): {a_label} → {b_label} (positive = {direction})"
+    )
+    if len(only_a) or len(only_b):
+        title += (
+            f"<br><sub>{len(only_a)} only in {a_label}, "
+            f"{len(only_b)} only in {b_label}</sub>"
+        )
+
+    # Inside a facet the y-axis labels whatever *varies* — drop the
+    # facetted dimension from the label, keep the rest. Flat ⇒ the full
+    # test_id so each bar is self-identifying.
+    facet_kwargs: dict = {}
+    if facets is None:
+        y_col = "test_id"
+    else:
+        varying = "model" if facets == "phase" else "phase"
+        size_str = df["size"].astype("Int64").astype(str)
+        df["_short"] = df[varying] + "-n=" + size_str
+        other_mask = df["phase"] == "other"
+        other_ids = df.loc[other_mask, "test_id"]
+        df.loc[other_mask, "_short"] = other_ids.str.split("::").str[-1]
+        y_col = "_short"
+        facet_kwargs = {"facet_col": facets}
+        facet_kwargs["facet_col_wrap"] = 2 if facets == "phase" else 3
+
+    fig = px.bar(
+        df,
+        x=x_col,
+        y=y_col,
+        orientation="h",
+        color=x_col,
+        color_continuous_scale=["green", "white", "red"],
+        color_continuous_midpoint=0,
+        title=title,
+        labels={x_col: x_label, y_col: ""},
+        text_auto=text_fmt,
+        hover_data={
+            "test_id": True,
+            a_label: ":.4g",
+            b_label: ":.4g",
+            "delta_abs": ":.4g",
+            "delta_pct": ":.2f",
+        },
+        **facet_kwargs,
+    )
+    if sort == "absolute":
+        # SI-prefixed time on the x-axis (e.g. 24 ms, 2.4 ms, 240 µs) for
+        # timing snapshots; plain MiB for memory.
+        fig.update_xaxes(**_axis_kwargs(unit))
+    # Put value text outside the bar so it stays readable on short bars.
+    fig.update_traces(textposition="outside", cliponaxis=False)
+    if facets is not None:
+        # Each facet keeps its own y category list (no shared rows full
+        # of empty bars), but we hide tick labels on non-leftmost facets
+        # within each row so labels appear once per row.
+        fig.update_yaxes(matches=None)
+        wrap = facet_kwargs["facet_col_wrap"]
+        _hide_non_leftmost_yticks(fig, wrap=wrap)
+        _share_axis_labels(fig, y_label="test", x_label=x_label)
+        # Plotly's default gives each wrap row an equal share: facets with
+        # fewer categories than the row max show empty space, but a manual
+        # ``domain`` override would scramble the facet header annotations.
+        rows_per_facet = df.groupby(facets)[y_col].nunique().max()
+        n_wrap_rows = (df[facets].nunique() + wrap - 1) // wrap
+        height = max(500, int(n_wrap_rows * rows_per_facet * 24) + 100)
+    else:
+        height = max(500, len(df) * 22)
+    fig.update_layout(height=height, showlegend=False)
+    return fig, len(df)
+
+
+def plot_scatter(
+    snapshots: list[Path],
+    metric: Metric = "min",
+    sort: SortMode = "absolute",  # noqa: ARG001  (uniform signature, unused here)
+    facets: FacetBy | None = None,
+) -> tuple[Figure, int]:
+    """
+    Two-axis scatter — baseline cost on log-x, ratio on y.
+
+    Designed as the single best exploratory plot for regression hunting
+    across tests of wildly different magnitudes: a point lights up as
+    "fix this" only if it sits in the top-right corner — slow tests
+    that got slower. Top-left (big ratio, tiny absolute) is a cheap
+    test with noisy ratio swings — not a real change. Bottom-right (big
+    absolute, tiny ratio) is already-slow-but-unchanged. The combined
+    position resolves the tension that pure relative or pure absolute
+    sort each blind-spot.
+
+    The first snapshot is the baseline. With 2 snapshots, a static
+    scatter is drawn; with 3+, every subsequent snapshot becomes an
+    ``animation_frame`` — use the slider / play button to step through
+    versions and watch points drift across releases.
+
+    A horizontal reference at ``ratio = 1`` makes "no change" trivial
+    to see; the colour encodes absolute Δ as a third channel.
+
+    Raises ``ValueError`` with fewer than 2 snapshots, or when no other
+    snapshot shares a (positive-baseline) test with the baseline.
+    """
+    import numpy as np
+    import plotly.express as px
+
+    if len(snapshots) < 2:
+        raise ValueError("scatter needs at least 2 snapshots (baseline + 1)")
+
+    df_long, unit = load_long_df(snapshots, metric)
+    metric_label = metric if unit == "s" else "peak"
+
+    labels = df_long["snapshot"].drop_duplicates().tolist()
+    baseline_label = labels[0]
+
+    # Pair every non-baseline row with its test's baseline value (its
+    # value on the first snapshot) via an inner merge on ``test_id``;
+    # tests without a baseline row drop out. Non-positive baselines are
+    # dropped too — the ratio is undefined and the x-axis is logarithmic.
+    is_baseline = df_long["snapshot"] == baseline_label
+    baseline_vals = df_long.loc[is_baseline, ["test_id", "value"]].rename(
+        columns={"value": "baseline_time"}
+    )
+    df = df_long[~is_baseline].merge(baseline_vals, on="test_id", how="inner")
+    df = df[df["baseline_time"] > 0].copy()
+    if df.empty:
+        raise ValueError(
+            f"no tests in common between baseline ({baseline_label}) "
+            "and any of the other snapshots"
+        )
+
+    df = df.rename(columns={"snapshot": "version", "value": "candidate_time"})
+    df["ratio"] = df["candidate_time"] / df["baseline_time"]
+    df["delta_abs"] = df["candidate_time"] - df["baseline_time"]
+    df["delta_pct"] = df["delta_abs"] / df["baseline_time"] * 100.0
+    df = df.rename(columns={"test_id": "test"})
+    # Fixed, padded axis ranges: no animation jitter, no clipped edge points.
+    x_lo, x_hi = df["baseline_time"].min(), df["baseline_time"].max()
+    # y-range is centred symmetrically on 1.0 (the "no change" line) so
+    # regressions above and improvements below read equally well; the
+    # larger side dictates how wide the symmetric window is.
+    y_lo, y_hi = df["ratio"].min(), df["ratio"].max()
+    max_dist = max(abs(1.0 - y_lo), abs(y_hi - 1.0), 0.05)
+    pad_y = max(0.05, max_dist * 0.05)
+    y_range = [1.0 - max_dist - pad_y, 1.0 + max_dist + pad_y]
+
+    # Clip the colour scale at the 95th-percentile |Δ| so one huge
+    # regression doesn't wash the rest to white; outliers saturate.
+    clip = float(np.percentile(df["delta_abs"].abs(), 95)) if len(df) > 0 else 0.0
+    if clip == 0.0:
+        max_abs = float(df["delta_abs"].abs().max())
+        clip = max_abs if max_abs > 0 else 1e-9
+
+    animate = len(snapshots) >= 3
+    extra: dict = {}
+    if animate:
+        extra["animation_frame"] = "version"
+        extra["category_orders"] = {"version": labels}
+    if facets is not None:
+        extra["facet_col"] = facets
+        extra["facet_col_wrap"] = 2 if facets == "phase" else 3
+
+    fig = px.scatter(
+        df,
+        x="baseline_time",
+        y="ratio",
+        color="delta_abs",
+        color_continuous_scale=["green", "white", "red"],
+        color_continuous_midpoint=0,
+        range_color=[-clip, clip],
+        log_x=True,
+        range_x=[x_lo * 0.5, x_hi * 2],
+        range_y=y_range,
+        hover_name="test",
+        hover_data={
+            "baseline_time": ":.4g",
+            "candidate_time": ":.4g",
+            "delta_abs": ":.4g",
+            "delta_pct": ":.2f",
+            "ratio": ":.3f",
+            "version": True,
+        },
+        title=(
+            f"{metric_label} scatter vs baseline ({baseline_label}) — "
+            "top-right = the regressed corner"
+        ),
+        labels={
+            "baseline_time": f"baseline {metric_label} ({unit}, log scale)",
+            "ratio": f"{metric_label} ratio  (candidate / baseline)",
+            "candidate_time": "candidate",
+            "delta_abs": f"Δ ({unit}, p95-clipped)",
+        },
+        **extra,
+    )
+    fig.add_hline(
+        y=1.0, line_dash="dash", line_color="grey", annotation_text="no change"
+    )
+    fig.update_traces(marker=dict(size=8, line=dict(width=0.5, color="DarkSlateGrey")))
+    if facets is not None:
+        _share_axis_labels(
+            fig,
+            y_label=f"{metric_label} ratio (candidate / baseline)",
+            x_label=f"baseline {metric_label} ({unit}, log scale)",
+        )
+    fig.update_layout(height=600)
+    return fig, int(df["test"].nunique())
+
+
+def plot_sweep(
+    snapshots: list[Path],
+    metric: Metric = "min",
+    sort: SortMode = "absolute",  # noqa: ARG001  (uniform signature, unused here)
+    facets: FacetBy | None = None,  # noqa: ARG001  (uniform signature, unused here)
+) -> tuple[Figure, int]:
+    """Heatmap of every test's ratio to the first (baseline) snapshot."""
+    import plotly.express as px
+
+    df_long, unit = load_long_df(snapshots, metric)
+    metric_label = metric if unit == "s" else "peak"
+    versions = df_long["snapshot"].drop_duplicates().tolist()
+    baseline_label = versions[0]
+
+    # Absolute readings: rows are tests, columns are versions in first-seen
+    # order. Rows without a baseline reading cannot be normalised — drop them.
+    absolute = (
+        df_long.pivot(index="test_id", columns="snapshot", values="value")
+        .reindex(columns=versions)
+        .dropna(subset=[baseline_label])
+    )
+    if absolute.empty:
+        raise ValueError(f"no overlap with baseline snapshot {baseline_label}")
+    absolute.index.name = "test"
+    # Row-wise division by the baseline column yields every ratio in one shot.
+    ratios = absolute.div(absolute[baseline_label], axis=0)
+
+    fig = px.imshow(
+        ratios,
+        color_continuous_scale=["green", "white", "red"],
+        color_continuous_midpoint=1.0,
+        aspect="auto",
+        title=f"{metric_label} ratio relative to baseline ({baseline_label})",
+        labels={"x": "version", "y": "test", "color": "ratio"},
+        text_auto=".2f",
+    )
+    # Carry the absolute readings as customdata so hover shows both the
+    # ratio and the underlying value.
+    hover_rows = [
+        "test: %{y}",
+        "version: %{x}",
+        "ratio: %{z:.3f}",
+        f"{metric_label}: %{{customdata:.4g}}{unit}<extra></extra>",
+    ]
+    fig.update_traces(
+        customdata=absolute.values, hovertemplate="<br>".join(hover_rows)
+    )
+    fig.update_layout(height=max(500, len(ratios) * 22))
+    return fig, len(ratios)
+
+
+def plot_scaling(
+    snapshots: list[Path],
+    metric: Metric = "min",
+    sort: SortMode = "absolute",  # noqa: ARG001  (uniform signature, unused here)
+    facets: FacetBy | None = None,  # noqa: ARG001  (uniform signature, unused here)
+) -> tuple[Figure, int]:
+    """Log-log ``metric`` vs size ``n`` for size-parametrized tests, by phase."""
+    import plotly.express as px
+
+    df_long, unit = load_long_df(snapshots[:1], metric)
+    metric_label = metric if unit == "s" else "peak"
+    sized = df_long.dropna(subset=["size"])
+    if sized.empty:
+        raise ValueError(
+            "no size-parametrized tests found (expected ``...[<model>-n=<N>]`` ids)"
+        )
+    # Sort so each model's line is drawn in increasing-size order per phase.
+    points = sized.rename(columns={"size": "n", "value": metric})
+    points = points.sort_values(["phase", "model", "n"])
+
+    heading = f"Scaling: {metric_label} ({unit}) vs problem size ({snapshots[0].stem})"
+    fig = px.line(
+        points,
+        x="n",
+        y=metric,
+        color="model",
+        facet_col="phase",
+        facet_col_wrap=3,
+        log_x=True,
+        log_y=True,
+        markers=True,
+        title=heading,
+    )
+    # Three facets per row; allow 350 px per row with a 400 px floor.
+    facet_rows = (points["phase"].nunique() + 2) // 3
+    fig.update_layout(height=max(400, facet_rows * 350))
+    return fig, len(points)
+
+
+RENDERERS: dict[  # view name → renderer; every renderer shares one signature
+    PlotView,
+    Callable[[list[Path], Metric, SortMode, FacetBy | None], tuple[Figure, int]],
+] = {
+    "compare": plot_compare,
+    "scatter": plot_scatter,
+    "sweep": plot_sweep,
+    "scaling": plot_scaling,
+}
diff --git a/benchmarks/registry.py b/benchmarks/registry.py
new file mode 100644
index 00000000..b7735670
--- /dev/null
+++ b/benchmarks/registry.py
@@ -0,0 +1,197 @@
+"""
+Reusable registry of benchmark models.
+
+A :class:`ModelSpec` captures everything needed to drive a model through the
+benchmark suite *and* to use it from any other test or script:
+
+- ``build(size) -> linopy.Model``  the actual builder
+- ``sizes``                        canonical sizes the model has been tuned for
+- ``features``                     what kinds of variables / constraints it uses
+- ``phases``                       which benchmark phases apply (lp_write, to_highspy, ...)
+- ``quick_threshold``              max size to keep under ``pytest --quick``
+- ``long_threshold``               max size to keep without ``pytest --long``
+- ``requires``                     extra modules to ``pytest.importorskip``
+
+Pattern for downstream use::
+
+    from benchmarks import REGISTRY
+    model = REGISTRY["basic"].build(100)
+
+    # Or pick a subset by feature/phase:
+    from benchmarks import filter_by, QUADRATIC
+    qp_specs = filter_by(has_feature=QUADRATIC)
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+from dataclasses import dataclass
+
+import linopy
+
+# --- Feature tags -----------------------------------------------------------
+# String tags describing what a model uses; ``ModelSpec.features`` holds a
+# subset of these.
+
+CONTINUOUS = "continuous"
+BINARY = "binary"
+INTEGER = "integer"
+QUADRATIC = "quadratic"
+SOS = "sos"
+PIECEWISE = "piecewise"
+MASKED = "masked"
+
+# Closed set of known feature tags; ``register`` rejects anything outside it.
+ALL_FEATURES = frozenset(
+    {CONTINUOUS, BINARY, INTEGER, QUADRATIC, SOS, PIECEWISE, MASKED}
+)
+
+# --- Phase tags -------------------------------------------------------------
+# String tags naming the benchmark phases a model can take part in;
+# ``ModelSpec.phases`` holds a subset of these.
+
+BUILD = "build"
+MATRICES = "matrices"
+LP_WRITE = "lp_write"
+NETCDF = "netcdf"
+TO_HIGHSPY = "to_highspy"
+TO_GUROBIPY = "to_gurobipy"
+TO_MOSEK = "to_mosek"
+TO_XPRESS = "to_xpress"
+
+# Closed set of known phase tags; ``register`` rejects anything outside it.
+ALL_PHASES = frozenset(
+    {
+        BUILD,
+        MATRICES,
+        LP_WRITE,
+        NETCDF,
+        TO_HIGHSPY,
+        TO_GUROBIPY,
+        TO_MOSEK,
+        TO_XPRESS,
+    }
+)
+
+# Phases every "well-behaved LP / MILP" can do. Models with features the
+# default solvers can't ingest natively (e.g. native SOS for HiGHS) override
+# this with a narrower set.
+# Currently lists the same members as ``ALL_PHASES``; kept as a separate
+# literal so the two sets can diverge independently.
+DEFAULT_PHASES = frozenset(
+    {
+        BUILD,
+        MATRICES,
+        LP_WRITE,
+        NETCDF,
+        TO_HIGHSPY,
+        TO_GUROBIPY,
+        TO_MOSEK,
+        TO_XPRESS,
+    }
+)
+
+
+@dataclass(frozen=True, repr=False)
+class ModelSpec:
+    """
+    Declarative description of one benchmark model.
+
+    Three size tiers gate the cost of a default ``pytest benchmarks/`` run:
+
+    - ``size <= quick_threshold``: included under ``--quick`` (smoke / CI).
+    - ``size <= long_threshold``: included by default (medium-cost regression).
+    - ``size >  long_threshold``: only included under ``--long`` (full sweep).
+
+    Without explicit values, both thresholds default to "no cap".
+    """
+
+    # Unique registry key; also the ``<model>`` part of generated test ids.
+    name: str
+    # Builder called with one size, returning the model to benchmark.
+    build: Callable[[int], linopy.Model]
+    # Sizes that ``iter_params`` expands into one test case each.
+    sizes: tuple[int, ...]
+    # Feature tags; ``register`` checks them against ``ALL_FEATURES``.
+    features: frozenset[str] = frozenset({CONTINUOUS})
+    # Phase tags; ``register`` checks them against ``ALL_PHASES``.
+    phases: frozenset[str] = DEFAULT_PHASES
+    # ``10**9`` is the "no cap" default for both thresholds.
+    quick_threshold: int = 10**9
+    long_threshold: int = 10**9
+    # Extra module names the model needs (see the module docstring).
+    requires: tuple[str, ...] = ()
+
+    def applies_to(self, phase: str) -> bool:
+        """Return whether this model takes part in ``phase``."""
+        return phase in self.phases
+
+    def has_feature(self, feature: str) -> bool:
+        """Return whether this model is tagged with ``feature``."""
+        return feature in self.features
+
+    def __repr__(self) -> str:
+        """Compact one-line summary: name, sorted features, and size range."""
+        feats = ",".join(sorted(self.features))
+        if not self.sizes:
+            # An empty ``sizes`` tuple must not make ``repr`` raise.
+            size_range = "()"
+        elif len(self.sizes) == 1:
+            size_range = str(self.sizes[0])
+        else:
+            size_range = f"{self.sizes[0]}..{self.sizes[-1]}"
+        return f"ModelSpec({self.name!r}, features={{{feats}}}, sizes={size_range})"
+
+    def _repr_html_(self) -> str:
+        """Rich rendering for Jupyter — a compact two-column table."""
+        from html import escape
+
+        rows = [
+            ("name", self.name),
+            ("features", ", ".join(sorted(self.features))),
+            ("sizes", ", ".join(str(s) for s in self.sizes)),
+            ("phases", ", ".join(sorted(self.phases))),
+            ("quick_threshold", self.quick_threshold),
+            ("long_threshold", self.long_threshold),
+            ("requires", ", ".join(self.requires) or "—"),
+        ]
+        # Values are escaped so a name containing ``<`` or ``&`` renders as
+        # text instead of being parsed as markup. Keys are fixed literals.
+        body = "".join(
+            f"<tr><th style='text-align:left;padding-right:1em'>{k}</th>"
+            f"<td>{escape(str(v))}</td></tr>"
+            for k, v in rows
+        )
+        return (
+            f"<b>ModelSpec</b> <code>{escape(self.name)}</code>"
+            f"<table style='font-size:90%'>{body}</table>"
+        )
+
+
+# Global name -> spec table, filled by ``register``.
+REGISTRY: dict[str, ModelSpec] = {}
+
+
+def register(spec: ModelSpec) -> ModelSpec:
+    """
+    Insert ``spec`` into ``REGISTRY`` under its name and hand it back.
+
+    Raises ``ValueError`` when the name is already taken, or when the spec
+    carries a feature or phase tag outside ``ALL_FEATURES`` / ``ALL_PHASES``.
+    """
+    name = spec.name
+    if name in REGISTRY:
+        raise ValueError(f"model {name!r} already registered")
+
+    # Features are validated before phases, each against its closed tag set.
+    checks = (
+        ("features", spec.features, ALL_FEATURES),
+        ("phases", spec.phases, ALL_PHASES),
+    )
+    for kind, given, known in checks:
+        stray = given - known
+        if stray:
+            raise ValueError(f"model {name!r}: unknown {kind} {sorted(stray)}")
+
+    REGISTRY[name] = spec
+    return spec
+
+
+def get(name: str) -> ModelSpec:
+    """Return the registered spec called ``name``; ``KeyError`` if absent."""
+    return REGISTRY[name]
+
+
+def filter_by(
+    *,
+    has_feature: str | None = None,
+    has_phase: str | None = None,
+) -> list[ModelSpec]:
+    """
+    Return registered specs matching every filter that was supplied.
+
+    A filter left as ``None`` is not applied; with both omitted, every
+    registered spec is returned in registration order.
+    """
+    return [
+        candidate
+        for candidate in REGISTRY.values()
+        if (has_feature is None or candidate.has_feature(has_feature))
+        and (has_phase is None or candidate.applies_to(has_phase))
+    ]
+
+
+def iter_params(phase: str) -> list[tuple[ModelSpec, int]]:
+    """Pytest parametrize helper: every ``(spec, size)`` pair for ``phase``."""
+    pairs: list[tuple[ModelSpec, int]] = []
+    for candidate in REGISTRY.values():
+        if not candidate.applies_to(phase):
+            continue
+        pairs.extend((candidate, n) for n in candidate.sizes)
+    return pairs
+
+
+def param_ids(params: list[tuple[ModelSpec, int]]) -> list[str]:
+    """Build the ``<model>-n=<size>`` pytest id for each ``(spec, size)`` pair."""
+    ids: list[str] = []
+    for spec, size in params:
+        ids.append(f"{spec.name}-n={size}")
+    return ids
diff --git a/benchmarks/snapshot.py b/benchmarks/snapshot.py
new file mode 100644
index 00000000..ab4ff95a
--- /dev/null
+++ b/benchmarks/snapshot.py
@@ -0,0 +1,186 @@
+"""
+The benchmark snapshot contract — one owner for the on-disk JSON shapes,
+the test-id grammar, and the long-DataFrame loader.
+
+Dependency-free within the package (stdlib plus a lazily-imported
+pandas), so every writer (pytest-benchmark via file, :func:`memory.save`,
+:mod:`benchmarks.bench`) and every reader (:mod:`benchmarks.plotting`,
+:func:`memory.compare`) can sit on it without import cycles.
+
+Two snapshot shapes, auto-detected on load:
+
+- **timing** — ``{"benchmarks": [{"fullname": <id>, "stats": {"min":…,
+  "median":…, "mean":…, "max":…}}]}`` → value in **seconds** (the shape
+  pytest-benchmark writes).
+- **memory** — ``{"label": <str>, "peak_mib": {<id>: <float>}}`` → value
+  in **MiB**.
+
+Test ids follow ``…[<model>-n=<size>]``; :func:`parse_test_id` splits one
+into ``(phase, model, size)`` and :func:`synth_test_id` builds one.
+"""
+
+from __future__ import annotations
+
+import json
+import re
+from pathlib import Path
+from typing import TYPE_CHECKING, Literal
+
+if TYPE_CHECKING:
+    import pandas as pd
+
+Metric = Literal["min", "median", "mean", "max"]
+
+_SIZE_RE = re.compile(r"(.*)\[([^\[\]]+?)-n=(\d+)\]")
+
+
+# --- test-id grammar -------------------------------------------------------
+
+
+def parse_test_id(test_id: str) -> tuple[str, str, int | None]:
+    """
+    Split a pytest test id into ``(phase, model, size)``.
+
+    Ids shaped like ``<prefix>[<model>-n=<size>]`` yield the last
+    ``::``-separated piece of the prefix as the phase. Anything else
+    (e.g. ``test_pypsa_carbon_management``) lands in the
+    ``("other", "other", None)`` bucket.
+    """
+    matched = _SIZE_RE.match(test_id)
+    if matched is None:
+        return "other", "other", None
+    prefix, model, digits = matched.groups()
+    return prefix.split("::")[-1], model, int(digits)
+
+
+def synth_test_id(
+    label: str, *, model: str | None, size: int | None, phase: str | None
+) -> str:
+    """
+    Derive a snapshot test id from optional metadata.
+
+    - All of ``model``/``size``/``phase`` given: return
+      ``bench::{phase}[{model}-n={size}]``, which :func:`parse_test_id`
+      splits back into the three columns (so ``plot --view scaling``
+      works across several sizes).
+    - None given: return ``label`` unchanged (it lands in the ``"other"``
+      bucket, which is still fine for ``compare``).
+    - Only some given: ambiguous, so ``ValueError`` is raised.
+    """
+    supplied = sum(part is not None for part in (model, size, phase))
+    if supplied == 3:
+        return f"bench::{phase}[{model}-n={size}]"
+    if supplied == 0:
+        return label
+    raise ValueError(
+        "model, size, and phase must be given together (or all omitted)"
+    )
+
+
+# --- writers ---------------------------------------------------------------
+
+
+def write_timing_snapshot(
+    path: str | Path, entries: list[tuple[str, dict[str, float]]]
+) -> Path:
+    """
+    Dump ``(id, stats)`` pairs in the pytest-benchmark timing shape (seconds).
+
+    Returns the written path as a :class:`~pathlib.Path`.
+    """
+    records = []
+    for fullname, stats in entries:
+        # Copy the stats so the caller's mapping is never aliased.
+        records.append({"fullname": fullname, "stats": dict(stats)})
+    destination = Path(path)
+    destination.write_text(json.dumps({"benchmarks": records}, indent=2))
+    return destination
+
+
+def write_memory_snapshot(
+    path: str | Path, label: str, peaks: dict[str, float]
+) -> Path:
+    """
+    Dump the memory shape: ``{"label": ..., "peak_mib": {id: peak}}``.
+
+    Returns the written path as a :class:`~pathlib.Path`.
+    """
+    destination = Path(path)
+    payload = {"label": label, "peak_mib": dict(peaks)}
+    destination.write_text(json.dumps(payload, indent=2))
+    return destination
+
+
+# --- readers ---------------------------------------------------------------
+
+
+def load_snapshot(
+    path: Path, metric: Metric = "min"
+) -> tuple[str, dict[str, float], str]:
+    """
+    Load one snapshot as ``(label, {fullname: value}, unit)``.
+
+    The label is the file stem. The JSON shape decides the rest:
+
+    - a top-level ``"peak_mib"`` key marks a memory snapshot: values are
+      the stored peaks in **MiB** and ``metric`` is ignored;
+    - otherwise the file is read as a timing snapshot: each entry of
+      ``"benchmarks"`` contributes ``stats[metric]`` in **seconds**.
+    """
+    payload = json.loads(path.read_text())
+    label = path.stem
+    if "peak_mib" in payload:
+        return label, dict(payload["peak_mib"]), "MiB"
+
+    per_test = {}
+    for entry in payload["benchmarks"]:
+        per_test[entry["fullname"]] = entry["stats"][metric]
+    return label, per_test, "s"
+
+
+def discover_snapshots() -> list[Path]:
+    """
+    List every ``*.json`` file below ``.benchmarks/`` in the current directory.
+
+    The paths stay relative to cwd, which makes them easy to paste back
+    into the CLI. ``compare`` / ``plot`` use this to suggest snapshots when
+    the user passes none. A missing ``.benchmarks`` yields an empty list.
+    """
+    root = Path(".benchmarks")
+    return sorted(root.rglob("*.json")) if root.exists() else []
+
+
+def _check_same_unit(snapshots: list[tuple[str, dict[str, float], str]]) -> str:
+    """
+    Validate that every snapshot has the same unit and return it.
+
+    ``snapshots`` holds ``(label, values, unit)`` tuples as produced by
+    :func:`load_snapshot`.
+
+    Raises ``ValueError`` if the list is empty (there is no unit to
+    report) or if the snapshots mix units.
+    """
+    units = {u for _, _, u in snapshots}
+    if not units:
+        # ``next(iter(set()))`` would otherwise escape as a bare
+        # ``StopIteration``, which is easy to mistake for loop exhaustion.
+        raise ValueError("no snapshots given; can't determine a unit")
+    if len(units) > 1:
+        raise ValueError(
+            f"snapshots mix units {units}; can't compare timing and memory"
+        )
+    return next(iter(units))
+
+
+def load_long_df(
+    snapshots: list[Path], metric: Metric = "min"
+) -> tuple[pd.DataFrame, str]:
+    """
+    Return ``(df, unit)`` — one row per ``(snapshot, test_id)`` pair.
+
+    Columns: ``snapshot``, ``test_id``, ``phase``, ``model``, ``size``
+    (``Int64``-nullable for the "other" bucket), ``value``. ``unit`` is
+    the shared unit string (``"s"`` for timing, ``"MiB"`` for memory)
+    — every loaded snapshot must agree.
+
+    Every plot view downstream pivots or filters this single frame so
+    test-id parsing, unit checking, and the "x snapshots, y tests"
+    matrix logic all live in one place.
+
+    Snapshots that contain no entries produce an empty frame that still
+    carries all six columns.
+
+    Raises ``ValueError`` if ``snapshots`` is empty or the snapshots mix
+    units.
+    """
+    import pandas as pd
+
+    if not snapshots:
+        raise ValueError("no snapshots given")
+
+    columns = ["snapshot", "test_id", "phase", "model", "size", "value"]
+    raw = [load_snapshot(p, metric) for p in snapshots]
+    unit = _check_same_unit(raw)
+    rows = []
+    for label, vals, _ in raw:
+        for test_id, value in vals.items():
+            phase, model, size = parse_test_id(test_id)
+            rows.append(
+                {
+                    "snapshot": label,
+                    "test_id": test_id,
+                    "phase": phase,
+                    "model": model,
+                    "size": size,
+                    "value": value,
+                }
+            )
+    # Pass the columns explicitly: with zero rows pandas would otherwise
+    # build a column-less frame and the ``size`` cast below would raise
+    # ``KeyError``.
+    df = pd.DataFrame(rows, columns=columns)
+    df["size"] = df["size"].astype("Int64")
+    return df, unit
diff --git a/benchmarks/sweep.py b/benchmarks/sweep.py
new file mode 100644
index 00000000..ba01191d
--- /dev/null
+++ b/benchmarks/sweep.py
@@ -0,0 +1,443 @@
+"""
+Cross-version sweep orchestration — build a fresh per-version uv venv,
+install the pinned benchmark infra plus a target ``linopy``, and run the
+suite (timing) or ``memory save`` (peak RSS) inside it.
+
+The heavy provisioning loop and the two sweep bodies live here so
+``cli.py`` stays a thin layer of typer command shims. The CLI resolves
+its options (phase → test file, smoke args) and calls :func:`run_sweep`
+/ :func:`run_memory_sweep`; everything else — venv creation, isolation,
+the per-version subprocess — is internal to this module.
+"""
+
+from __future__ import annotations
+
+import os
+import re
+import shutil
+import subprocess
+import sys
+import tempfile
+from collections.abc import Iterator
+from dataclasses import dataclass
+from pathlib import Path
+
+import typer
+
+_PLAIN_VERSION_RE = re.compile(r"^\d+(\.\d+)*([a-z]+\d*)?$")
+
+
+def _benchmarks_extra_pins() -> list[str]:
+    """
+    Return the pins from ``pyproject.toml``'s ``[benchmarks]`` extra.
+
+    Both ``sweep`` and ``memory sweep`` install these into each
+    per-version venv. Direct pins are kept in pyproject as the single
+    source of truth — bump them there and both sweeps pick up the
+    change. Transitive deps resolve fresh per venv; uv's deterministic
+    resolution gives identical results across versions within one sweep.
+
+    ``pyproject.toml`` is looked up one directory above this module.
+    Raises ``KeyError`` if the ``benchmarks`` extra is not declared.
+    """
+    import tomllib
+
+    pyproject = Path(__file__).resolve().parents[1] / "pyproject.toml"
+    # TOML is UTF-8 by spec; load from a binary handle so decoding never
+    # depends on the platform's locale encoding.
+    with pyproject.open("rb") as fh:
+        data = tomllib.load(fh)
+    return list(data["project"]["optional-dependencies"]["benchmarks"])
+
+
+def _linopy_install_spec(version: str) -> str:
+    """
+    Map a sweep "version" onto something pip can install.
+
+    A plain version number (``0.4.0``) becomes ``linopy==0.4.0``; anything
+    else (a URL, a ``git+…`` spec, …) is returned untouched.
+    """
+    is_plain_release = _PLAIN_VERSION_RE.match(version) is not None
+    return f"linopy=={version}" if is_plain_release else version
+
+
+def _snapshot_label(version: str) -> str:
+    """
+    Turn an install spec into a filesystem-safe snapshot label.
+
+    A plain release is returned as-is (``0.6.1`` → ``0.6.1``). When the
+    spec contains ``@`` — ``git+https://…/linopy.git@<sha>`` or
+    ``linopy @ <url>`` — only the text after the last ``@`` (the sha /
+    tag / branch) is kept, so a pinned commit gives a clean
+    ``linopy-<sha>.json`` rather than a slash-laden, unwritable name.
+    Every run of characters outside ``[0-9A-Za-z._-]`` then collapses to
+    ``-``, leading/trailing ``-._`` are trimmed, and an empty result
+    falls back to ``"spec"``.
+    """
+    # ``rpartition`` hands back the whole string as the tail when there is
+    # no ``@``, so no separate membership test is needed.
+    _, _, tail = version.rpartition("@")
+    cleaned = re.sub(r"[^0-9A-Za-z._-]+", "-", tail).strip("-._")
+    if cleaned:
+        return cleaned
+    return "spec"
+
+
+def _venv_python(venv: Path) -> Path:
+    """Return the interpreter path inside ``venv`` for the current OS."""
+    if os.name == "nt":
+        return venv / "Scripts" / "python.exe"
+    return venv / "bin" / "python"
+
+
+@dataclass(frozen=True)
+class _ProvisionedVenv:
+    """
+    One fresh per-version venv from :func:`_provision_venvs`.
+
+    On success, ``python``, ``env``, and ``import_dir`` are populated
+    and ``failed_at`` is ``None``. The caller MUST use ``import_dir``
+    as cwd for per-version subprocesses — see :func:`_provision_venvs`
+    for why. On failure, ``failed_at`` names the step that failed
+    (``"venv"``, ``"install"``, or ``"isolation"``); the caller skips
+    its per-version action and records the failure.
+    """
+
+    # The version / install spec string exactly as the caller passed it.
+    version: str
+    # Interpreter inside the per-version venv (``None`` on failure).
+    python: Path | None
+    # Copy of ``os.environ`` with ``PYTHONPATH`` removed (``None`` on failure).
+    env: dict[str, str] | None
+    # Tempdir holding the filtered ``benchmarks/`` copy (``None`` on failure).
+    import_dir: Path | None
+    # ``"venv"``, ``"install"``, or ``"isolation"``; ``None`` on success.
+    failed_at: str | None
+
+
+def _provision_venvs(
+    versions: list[str], tmp_prefix: str, as_of: str | None = None
+) -> Iterator[_ProvisionedVenv]:
+    """
+    Yield one fresh per-version uv venv for each linopy version.
+
+    Used by both ``sweep`` and ``memory sweep`` so the venv plumbing
+    (uv venv → install ``[benchmarks]`` pins + the target linopy →
+    set up an isolated import root) lives in one place. The caller
+    supplies the tempdir prefix (so ``ps``/``lsof`` can distinguish
+    concurrent runs) and does whatever per-version action it needs.
+
+    **Isolation:** the repo root contains a ``linopy/`` package (the
+    one we're developing). Running the per-version pytest with the
+    repo root on ``sys.path`` — either via ``PYTHONPATH=repo`` or via
+    ``cwd=repo`` (Python prepends cwd as ``''``) — shadows the venv's
+    installed linopy with the dev tree. The whole sweep then measures
+    the dev linopy against itself instead of the requested version.
+    To avoid this, ``import_dir`` is a fresh tempdir per version that
+    holds a filtered *copy* of ``benchmarks/`` and nothing else — a
+    copy rather than a symlink so the sweep runs on Windows without
+    symlink privileges and so no per-version subprocess (nor its
+    ``__pycache__`` writes) ever touches the working tree. Running
+    subprocesses with ``cwd=import_dir`` and no ``PYTHONPATH`` makes
+    ``import benchmarks`` resolve to that copy while ``import linopy``
+    falls through to the venv's site-packages — i.e. the requested
+    version. The preflight below asserts that resolution actually held.
+
+    The ``benchmarks/`` directory that gets copied is the one this
+    module lives in, matching how :func:`_benchmarks_extra_pins` finds
+    ``pyproject.toml``, so provisioning does not depend on the cwd.
+
+    Each version's tempdir is cleaned up when the generator advances
+    (or exits). The caller can break the loop early — Python's
+    generator close protocol fires the ``with`` teardown.
+
+    **Cross-time reproducibility:** if ``as_of`` is a date string
+    (``YYYY-MM-DD`` or any ISO 8601 timestamp), passes
+    ``--exclude-newer`` to uv so the entire transitive resolution is
+    frozen to releases on or before that date. Pinning direct deps
+    alone (current default) keeps results reproducible *within* one
+    sweep call, but unpinned transitives can drift between sweep calls
+    days apart; ``as_of`` closes that gap.
+
+    Raises ``typer.Exit(code=2)`` if ``uv`` is not on ``PATH``.
+    """
+    if shutil.which("uv") is None:
+        typer.secho(
+            "uv not found on PATH — install via https://docs.astral.sh/uv/",
+            fg=typer.colors.RED,
+            err=True,
+        )
+        raise typer.Exit(code=2)
+
+    # Resolve the harness relative to this file rather than the cwd, so
+    # the pins and the copied harness always come from the same checkout.
+    benchmarks_src = Path(__file__).resolve().parent
+    # The pins are identical for every version: read pyproject once.
+    pins = _benchmarks_extra_pins()
+    exclude_newer = ["--exclude-newer", as_of] if as_of else []
+
+    for version in versions:
+        typer.secho(f"\n=== linopy {version} ===", fg=typer.colors.CYAN, bold=True)
+        with tempfile.TemporaryDirectory(prefix=tmp_prefix) as tmp:
+            venv = Path(tmp) / "venv"
+
+            r = subprocess.run(
+                ["uv", "venv", "--python", sys.executable, str(venv)],
+                check=False,
+            )
+            if r.returncode != 0:
+                typer.secho(
+                    f"venv creation failed: {version}",
+                    fg=typer.colors.RED,
+                    err=True,
+                )
+                yield _ProvisionedVenv(version, None, None, None, "venv")
+                continue
+
+            vpy = _venv_python(venv)
+            spec = _linopy_install_spec(version)
+
+            # Single install pass: pinned infra from pyproject + linopy.
+            # Direct pins in [benchmarks] are sufficient for sweep
+            # reproducibility — uv resolves the same input deterministically
+            # into each per-version venv.
+            install_args = [
+                "uv",
+                "pip",
+                "install",
+                "--python",
+                str(vpy),
+                *exclude_newer,
+                *pins,
+                spec,
+            ]
+            r = subprocess.run(install_args, check=False)
+            if r.returncode != 0:
+                typer.secho(f"install failed: {version}", fg=typer.colors.RED, err=True)
+                yield _ProvisionedVenv(version, None, None, None, "install")
+                continue
+
+            # Build the isolated import root described in the docstring:
+            # a filtered copy of ``benchmarks/`` and nothing else. The
+            # heavy, sweep-irrelevant artifacts (the executed notebook,
+            # bytecode caches, macOS cruft) are skipped to keep the
+            # per-version copy cheap.
+            import_dir = Path(tmp) / "iso"
+            import_dir.mkdir()
+            shutil.copytree(
+                benchmarks_src,
+                import_dir / "benchmarks",
+                ignore=shutil.ignore_patterns("__pycache__", "*.ipynb", ".DS_Store"),
+            )
+
+            # No PYTHONPATH manipulation: the copied ``benchmarks`` under
+            # cwd=import_dir carries the harness without pulling the
+            # repo's ``linopy/`` into the import path. Bytecode the
+            # subprocess writes lands in this throwaway copy, never the
+            # working tree, so no PYTHONDONTWRITEBYTECODE is needed.
+            env = os.environ.copy()
+            env.pop("PYTHONPATH", None)
+
+            # Preflight: confirm the venv's linopy is what gets imported
+            # under cwd=import_dir. If a future change reintroduces the
+            # dev-linopy shadow bug, this fails loudly here rather than
+            # silently corrupting every snapshot in the sweep.
+            preflight = subprocess.run(
+                [
+                    str(vpy),
+                    "-c",
+                    (
+                        "import linopy; "
+                        f"assert {str(venv)!r} in linopy.__file__, "
+                        "f'isolation leak: linopy resolved to "
+                        "{linopy.__file__}, not the venv'"
+                    ),
+                ],
+                cwd=str(import_dir),
+                env=env,
+                capture_output=True,
+                text=True,
+                check=False,
+            )
+            if preflight.returncode != 0:
+                typer.secho(
+                    f"isolation preflight failed: {version}",
+                    fg=typer.colors.RED,
+                    err=True,
+                )
+                typer.echo(preflight.stderr.strip(), err=True)
+                yield _ProvisionedVenv(version, None, None, None, "isolation")
+                continue
+
+            yield _ProvisionedVenv(version, vpy, env, import_dir, None)
+
+
+def run_sweep(
+    versions: list[str],
+    *,
+    output_dir: Path,
+    test_target: str,
+    smoke_args: list[str],
+    long: bool = False,
+    quick: bool = False,
+    rounds: int | None = None,
+    model: str | None = None,
+    filter_expr: str | None = None,
+    smoke: bool = False,
+    as_of: str | None = None,
+    extra_args: list[str] | None = None,
+) -> None:
+    """
+    Timing sweep: run the benchmark suite in each per-version venv.
+
+    ``test_target`` is the pytest target the caller resolved from
+    ``--phase`` (or ``benchmarks/``); ``smoke_args`` is the shared smoke
+    invocation; ``extra_args`` are trailing args forwarded to pytest. The
+    pytest-benchmark JSON snapshot lands in
+    ``<output_dir>/linopy-<version>.json``.
+
+    ``model`` and ``filter_expr`` are joined with ``and`` into one pytest
+    ``-k`` expression. A version counts as failed when provisioning
+    fails, when the smoke run exits non-zero, or when the timing run does
+    not write a fresh snapshot.
+
+    Raises ``typer.Exit(code=2)`` on contradictory options and
+    ``typer.Exit(code=1)`` if any version failed.
+    """
+    extra_args = extra_args or []
+
+    if quick and long:
+        typer.secho(
+            "--quick and --long are mutually exclusive",
+            fg=typer.colors.RED,
+            err=True,
+        )
+        raise typer.Exit(code=2)
+
+    if smoke and (long or rounds is not None):
+        typer.secho(
+            "--smoke can't be combined with --long or --rounds "
+            "(no timings are recorded in smoke mode).",
+            fg=typer.colors.RED,
+            err=True,
+        )
+        raise typer.Exit(code=2)
+
+    if not smoke:
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+    # The ``-k`` selection is identical for smoke and timing runs and for
+    # every version, so build it once.
+    k_parts = [p for p in (model, filter_expr) if p]
+    select_args = ["-k", " and ".join(k_parts)] if k_parts else []
+
+    failed: list[str] = []
+    for prov in _provision_venvs(versions, "linopy-bench-", as_of=as_of):
+        if prov.failed_at:
+            failed.append(prov.version)
+            continue
+        # ``failed_at is None`` guarantees these are populated (see
+        # ``_ProvisionedVenv``); narrow for the type checker.
+        assert prov.python is not None and prov.import_dir is not None
+
+        if smoke:
+            # Smoke mode: reuse the same pytest args as the top-level
+            # ``smoke`` command. No JSON snapshot, return code is the
+            # signal.
+            pytest_cmd = [
+                str(prov.python),
+                "-m",
+                "pytest",
+                *smoke_args,
+                *select_args,
+                *extra_args,
+            ]
+
+            typer.secho(f"$ {' '.join(pytest_cmd)}", fg=typer.colors.BRIGHT_BLACK)
+            r = subprocess.run(
+                pytest_cmd, env=prov.env, cwd=str(prov.import_dir), check=False
+            )
+            if r.returncode != 0:
+                typer.secho(
+                    f"smoke failed: {prov.version}", fg=typer.colors.RED, err=True
+                )
+                failed.append(prov.version)
+            else:
+                typer.secho(f"smoke ok: {prov.version}", fg=typer.colors.GREEN)
+            continue
+
+        # Resolved to an absolute path because pytest runs with
+        # cwd=import_dir, not the caller's cwd.
+        snapshot = (
+            output_dir / f"linopy-{_snapshot_label(prov.version)}.json"
+        ).resolve()
+        # Remember the mtime of any snapshot left by an earlier sweep so a
+        # run that dies before writing is not reported as "saved".
+        previous_mtime = snapshot.stat().st_mtime_ns if snapshot.exists() else None
+        pytest_cmd = [
+            str(prov.python),
+            "-m",
+            "pytest",
+            test_target,
+            "--benchmark-only",
+            "--benchmark-json",
+            str(snapshot),
+        ]
+        if quick:
+            pytest_cmd.append("--quick")
+        elif long:
+            pytest_cmd.append("--long")
+        if rounds is not None:
+            pytest_cmd.extend(
+                [f"--benchmark-min-rounds={rounds}", "--benchmark-max-time=0"]
+            )
+        pytest_cmd.extend(select_args)
+        pytest_cmd.extend(extra_args)
+
+        typer.secho(f"$ {' '.join(pytest_cmd)}", fg=typer.colors.BRIGHT_BLACK)
+        # The exit code is deliberately not inspected: a fresh snapshot
+        # is the success signal.
+        subprocess.run(pytest_cmd, env=prov.env, cwd=str(prov.import_dir), check=False)
+
+        fresh = snapshot.exists() and snapshot.stat().st_mtime_ns != previous_mtime
+        if fresh:
+            typer.secho(f"saved {snapshot}", fg=typer.colors.GREEN)
+        else:
+            typer.secho(
+                f"no snapshot produced for {prov.version}",
+                fg=typer.colors.RED,
+                err=True,
+            )
+            failed.append(prov.version)
+
+    if failed:
+        typer.secho(f"\nFailed versions: {failed}", fg=typer.colors.RED, err=True)
+        raise typer.Exit(code=1)
+
+
+def run_memory_sweep(
+    versions: list[str],
+    *,
+    output_dir: Path,
+    quick: bool = False,
+    phases: list[str] | None = None,
+    repeats: int = 1,
+    as_of: str | None = None,
+) -> None:
+    """
+    Memory sweep: invoke ``memory save`` in each per-version venv.
+
+    Mirrors :func:`run_sweep` but tracks peak RSS. Each version's
+    snapshot lands at ``<output_dir>/linopy-<version>.json``.
+
+    A version counts as failed when provisioning fails or when
+    ``memory save`` does not produce a snapshot in this run.
+
+    Raises ``typer.Exit(code=2)`` on an unknown phase and
+    ``typer.Exit(code=1)`` if any version failed.
+    """
+    from benchmarks.memory import MEMORY_PHASES
+
+    if phases:
+        unknown = [p for p in phases if p not in MEMORY_PHASES]
+        if unknown:
+            typer.secho(
+                f"unknown phase(s): {unknown}; valid options: {list(MEMORY_PHASES)}",
+                fg=typer.colors.RED,
+                err=True,
+            )
+            raise typer.Exit(code=2)
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+
+    failed: list[str] = []
+    for prov in _provision_venvs(versions, "linopy-mem-", as_of=as_of):
+        if prov.failed_at:
+            failed.append(prov.version)
+            continue
+        # ``failed_at is None`` guarantees these are populated (see
+        # ``_ProvisionedVenv``); narrow for the type checker.
+        assert prov.python is not None and prov.import_dir is not None
+
+        label = f"linopy-{_snapshot_label(prov.version)}"
+        mem_cmd = [
+            str(prov.python),
+            "-m",
+            "benchmarks",
+            "memory",
+            "save",
+            label,
+        ]
+        if quick:
+            mem_cmd.append("--quick")
+        for ph in phases or []:
+            mem_cmd.extend(["--phase", ph])
+        if repeats > 1:
+            mem_cmd.extend(["--repeats", str(repeats)])
+
+        typer.secho(f"$ {' '.join(mem_cmd)}", fg=typer.colors.BRIGHT_BLACK)
+        subprocess.run(mem_cmd, env=prov.env, cwd=str(prov.import_dir), check=False)
+
+        # ``memory save`` writes to ``.benchmarks/memory/<label>.json``
+        # relative to its cwd — here, the isolated import_dir. Move it
+        # under the user's chosen output_dir.
+        default_path = prov.import_dir / ".benchmarks" / "memory" / f"{label}.json"
+        target = output_dir / f"{label}.json"
+        # import_dir is a fresh tempdir, so the file existing there means
+        # this run wrote it; a stale ``target`` from an earlier sweep must
+        # not count as success.
+        produced = default_path.exists()
+        if produced and default_path.resolve() != target.resolve():
+            target.parent.mkdir(parents=True, exist_ok=True)
+            # The tempdir and output_dir may be on different filesystems,
+            # where a plain rename fails; ``shutil.move`` falls back to
+            # copy + delete.
+            shutil.move(default_path, target)
+
+        if produced and target.exists():
+            typer.secho(f"saved {target}", fg=typer.colors.GREEN)
+        else:
+            typer.secho(
+                f"no snapshot produced for {prov.version}",
+                fg=typer.colors.RED,
+                err=True,
+            )
+            failed.append(prov.version)
+
+    if failed:
+        typer.secho(f"\nFailed versions: {failed}", fg=typer.colors.RED, err=True)
+        raise typer.Exit(code=1)
diff --git a/benchmarks/test_bench.py b/benchmarks/test_bench.py
new file mode 100644
index 00000000..78c94739
--- /dev/null
+++ b/benchmarks/test_bench.py
@@ -0,0 +1,107 @@
+"""
+Tests for the ad-hoc ``bench`` helper.
+
+The contract under test is the *seam*: a ``bench`` result must round-trip
+into ``snapshot.load_long_df`` exactly like a real snapshot, and its
+in-process ``to_df`` must line up column-for-column with the loaded frame.
+These are the only non-obvious behaviours — the timing math itself is not
+asserted beyond "finite and positive", since wall-clock values aren't
+reproducible.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+import pytest
+
+import linopy
+from benchmarks import REGISTRY, bench
+from benchmarks.phases import touch_matrices
+from benchmarks.snapshot import load_long_df
+
+
+def _tiny() -> int:
+    """Cheap deterministic workload used as the timed callable in these tests."""
+    return sum(range(1000))
+
+
+def _alloc() -> int:
+    """Workload that allocates a large list, for the memory-path test."""
+    # Allocate ~16 MB so the memray peak is unambiguously above zero;
+    # ``_tiny`` allocates nothing measurable.
+    data = [0] * 2_000_000
+    return len(data)
+
+
+def test_timing_snapshot_round_trips_into_loader(tmp_path: Path) -> None:
+    """A synthesized id is split back into phase / model / size on load."""
+    path = tmp_path / "t.json"
+    timing = bench.time(_tiny, rounds=3)
+    timing.to_snapshot(path, model="basic", size=100, phase="build")
+
+    frame, unit = load_long_df([path])
+    assert unit == "s"
+    assert len(frame) == 1
+    only = frame.iloc[0]
+    assert only["phase"] == "build"
+    assert only["model"] == "basic"
+    assert only["size"] == 100
+    assert only["value"] > 0
+
+
+def test_compare_writes_n_entries(tmp_path: Path) -> None:
+    """``compare`` puts N cases in one snapshot, which loads as N rows."""
+    path = tmp_path / "cmp.json"
+    cases = dict.fromkeys(("a", "b", "c"), _tiny)
+    results = bench.compare(cases, kind="time", rounds=2)
+    results.to_snapshot(path)
+
+    frame, unit = load_long_df([path])
+    assert unit == "s"
+    assert len(frame) == 3
+    assert set(frame["test_id"]) == {"a", "b", "c"}
+
+
+def test_to_df_columns_match_loader(tmp_path: Path) -> None:
+    """In-process ``to_df`` has the loader's columns, in the same order."""
+    path = tmp_path / "t.json"
+    timing = bench.time(_tiny, rounds=2)
+    timing.to_snapshot(path, model="basic", size=10, phase="build")
+
+    from_disk, _ = load_long_df([path])
+    in_process = timing.to_df()
+    assert list(in_process.columns) == list(from_disk.columns)
+
+
+def test_memory_path_round_trips(tmp_path: Path) -> None:
+    """A memory result reports MiB and survives a save/load round trip."""
+    pytest.importorskip("memray")
+    path = tmp_path / "m.json"
+    measured = bench.memory(_alloc)
+    assert measured.peak_mib > 0
+    measured.to_snapshot(path, model="basic", size=10, phase="build")
+
+    frame, unit = load_long_df([path])
+    assert unit == "MiB"
+    first = frame.iloc[0]
+    assert first["value"] > 0
+
+
+def test_phase_verb_on_custom_model() -> None:
+    """Headline use case: time a phase verb against a hand-built model."""
+    model = linopy.Model()
+    var = model.add_variables(lower=0, name="x")
+    model.add_constraints(var >= 1)
+    model.add_objective(var)
+
+    timing = bench.time(touch_matrices, model, rounds=2)
+    assert timing.stats["min"] > 0
+    assert timing.stats["rounds"] == 2
+
+
+def test_registry_builder_times() -> None:
+    """A registry builder is an ordinary callable and needs no special handling."""
+    builder = REGISTRY["basic"].build
+    timing = bench.time(builder, 50, rounds=2)
+    assert timing.stats["min"] > 0
+
+
+def test_partial_id_spec_rejected(tmp_path: Path) -> None:
+    """Supplying only part of (model, size, phase) is ambiguous and errors."""
+    timing = bench.time(_tiny, rounds=1)
+    destination = tmp_path / "x.json"
+    with pytest.raises(ValueError, match="given together"):
+        timing.to_snapshot(destination, model="basic")
diff --git a/benchmarks/test_build.py b/benchmarks/test_build.py
index f657715e..5bb3430b 100644
--- a/benchmarks/test_build.py
+++ b/benchmarks/test_build.py
@@ -2,52 +2,22 @@
 
 from __future__ import annotations
 
-import pytest
-
-from benchmarks.conftest import skip_if_quick
-from benchmarks.models import (
-    BASIC_SIZES,
-    EXPR_SIZES,
-    KNAPSACK_SIZES,
-    SPARSE_SIZES,
-    build_basic,
-    build_expression_arithmetic,
-    build_knapsack,
-    build_sparse_network,
-)
-from benchmarks.models.pypsa_scigrid import SIZES as PYPSA_SIZES
-
-
-@pytest.mark.parametrize("n", BASIC_SIZES, ids=[f"n={n}" for n in BASIC_SIZES])
-def test_build_basic(benchmark, n, request):
-    skip_if_quick(request, "basic", n)
-    benchmark(build_basic, n)
-
-
-@pytest.mark.parametrize("n", KNAPSACK_SIZES, ids=[f"n={n}" for n in KNAPSACK_SIZES])
-def test_build_knapsack(benchmark, n, request):
-    skip_if_quick(request, "knapsack", n)
-    benchmark(build_knapsack, n)
-
-
-@pytest.mark.parametrize("n", EXPR_SIZES, ids=[f"n={n}" for n in EXPR_SIZES])
-def test_build_expression_arithmetic(benchmark, n, request):
-    skip_if_quick(request, "expression_arithmetic", n)
-    benchmark(build_expression_arithmetic, n)
+from collections.abc import Callable
 
+import pytest
 
-@pytest.mark.parametrize("n", SPARSE_SIZES, ids=[f"n={n}" for n in SPARSE_SIZES])
-def test_build_sparse_network(benchmark, n, request):
-    skip_if_quick(request, "sparse_network", n)
-    benchmark(build_sparse_network, n)
+from benchmarks.conftest import maybe_skip
+from benchmarks.registry import BUILD, ModelSpec, iter_params, param_ids
 
+_PARAMS = iter_params(BUILD)
 
-@pytest.mark.parametrize(
-    "snapshots", PYPSA_SIZES, ids=[f"snapshots={s}" for s in PYPSA_SIZES]
-)
-def test_build_pypsa_scigrid(benchmark, snapshots, request):
-    pytest.importorskip("pypsa")
-    skip_if_quick(request, "pypsa_scigrid", snapshots)
-    from benchmarks.models.pypsa_scigrid import build_pypsa_scigrid
 
-    benchmark(build_pypsa_scigrid, snapshots)
+@pytest.mark.parametrize("spec,size", _PARAMS, ids=param_ids(_PARAMS))
+def test_build(
+    benchmark: Callable[..., object],
+    spec: ModelSpec,
+    size: int,
+    request: pytest.FixtureRequest,
+) -> None:
+    maybe_skip(request, spec, size)  # presumably the size-tier gate — see conftest
+    benchmark(spec.build, size)  # timed call: build the model at this size
diff --git a/benchmarks/test_lp_write.py b/benchmarks/test_lp_write.py
index 6442ccd6..f31c1284 100644
--- a/benchmarks/test_lp_write.py
+++ b/benchmarks/test_lp_write.py
@@ -2,62 +2,27 @@
 
 from __future__ import annotations
 
-import pytest
-
-from benchmarks.conftest import skip_if_quick
-from benchmarks.models import (
-    BASIC_SIZES,
-    EXPR_SIZES,
-    KNAPSACK_SIZES,
-    SPARSE_SIZES,
-    build_basic,
-    build_expression_arithmetic,
-    build_knapsack,
-    build_sparse_network,
-)
-from benchmarks.models.pypsa_scigrid import SIZES as PYPSA_SIZES
-
-
-@pytest.mark.parametrize("n", BASIC_SIZES, ids=[f"n={n}" for n in BASIC_SIZES])
-def test_lp_write_basic(benchmark, n, request, tmp_path):
-    skip_if_quick(request, "basic", n)
-    m = build_basic(n)
-    lp_file = tmp_path / "model.lp"
-    benchmark(m.to_file, lp_file, progress=False)
-
-
-@pytest.mark.parametrize("n", KNAPSACK_SIZES, ids=[f"n={n}" for n in KNAPSACK_SIZES])
-def test_lp_write_knapsack(benchmark, n, request, tmp_path):
-    skip_if_quick(request, "knapsack", n)
-    m = build_knapsack(n)
-    lp_file = tmp_path / "model.lp"
-    benchmark(m.to_file, lp_file, progress=False)
-
-
-@pytest.mark.parametrize("n", EXPR_SIZES, ids=[f"n={n}" for n in EXPR_SIZES])
-def test_lp_write_expression_arithmetic(benchmark, n, request, tmp_path):
-    skip_if_quick(request, "expression_arithmetic", n)
-    m = build_expression_arithmetic(n)
-    lp_file = tmp_path / "model.lp"
-    benchmark(m.to_file, lp_file, progress=False)
+from collections.abc import Callable
+from pathlib import Path
 
+import pytest
 
-@pytest.mark.parametrize("n", SPARSE_SIZES, ids=[f"n={n}" for n in SPARSE_SIZES])
-def test_lp_write_sparse_network(benchmark, n, request, tmp_path):
-    skip_if_quick(request, "sparse_network", n)
-    m = build_sparse_network(n)
-    lp_file = tmp_path / "model.lp"
-    benchmark(m.to_file, lp_file, progress=False)
+from benchmarks.conftest import maybe_skip
+from benchmarks.phases import write_lp
+from benchmarks.registry import LP_WRITE, ModelSpec, iter_params, param_ids
 
+_PARAMS = iter_params(LP_WRITE)
 
-@pytest.mark.parametrize(
-    "snapshots", PYPSA_SIZES, ids=[f"snapshots={s}" for s in PYPSA_SIZES]
-)
-def test_lp_write_pypsa_scigrid(benchmark, snapshots, request, tmp_path):
-    pytest.importorskip("pypsa")
-    skip_if_quick(request, "pypsa_scigrid", snapshots)
-    from benchmarks.models.pypsa_scigrid import build_pypsa_scigrid
 
-    m = build_pypsa_scigrid(snapshots)
+@pytest.mark.parametrize("spec,size", _PARAMS, ids=param_ids(_PARAMS))
+def test_lp_write(
+    benchmark: Callable[..., object],
+    spec: ModelSpec,
+    size: int,
+    request: pytest.FixtureRequest,
+    tmp_path: Path,
+) -> None:
+    maybe_skip(request, spec, size)  # presumably the size-tier gate — see conftest
+    m = spec.build(size)  # setup: built before the timed call
     lp_file = tmp_path / "model.lp"
-    benchmark(m.to_file, lp_file, progress=False)
+    benchmark(write_lp, m, lp_file)  # timed call: the LP write only
diff --git a/benchmarks/test_matrices.py b/benchmarks/test_matrices.py
index 352844fb..f985aec3 100644
--- a/benchmarks/test_matrices.py
+++ b/benchmarks/test_matrices.py
@@ -2,48 +2,24 @@
 
 from __future__ import annotations
 
-import pytest
-
-from benchmarks.conftest import skip_if_quick
-from benchmarks.models import (
-    BASIC_SIZES,
-    EXPR_SIZES,
-    SPARSE_SIZES,
-    build_basic,
-    build_expression_arithmetic,
-    build_sparse_network,
-)
-
+from collections.abc import Callable
 
-def _access_matrices(m):
-    """Access all matrix properties to force computation."""
-    matrices = m.matrices
-    _ = matrices.A
-    _ = matrices.b
-    _ = matrices.c
-    _ = matrices.lb
-    _ = matrices.ub
-    _ = matrices.sense
-    _ = matrices.vlabels
-    _ = matrices.clabels
-
-
-@pytest.mark.parametrize("n", BASIC_SIZES, ids=[f"n={n}" for n in BASIC_SIZES])
-def test_matrices_basic(benchmark, n, request):
-    skip_if_quick(request, "basic", n)
-    m = build_basic(n)
-    benchmark(_access_matrices, m)
+import pytest
 
+from benchmarks.conftest import maybe_skip
+from benchmarks.phases import touch_matrices
+from benchmarks.registry import MATRICES, ModelSpec, iter_params, param_ids
 
-@pytest.mark.parametrize("n", EXPR_SIZES, ids=[f"n={n}" for n in EXPR_SIZES])
-def test_matrices_expression_arithmetic(benchmark, n, request):
-    skip_if_quick(request, "expression_arithmetic", n)
-    m = build_expression_arithmetic(n)
-    benchmark(_access_matrices, m)
+_PARAMS = iter_params(MATRICES)
 
 
-@pytest.mark.parametrize("n", SPARSE_SIZES, ids=[f"n={n}" for n in SPARSE_SIZES])
-def test_matrices_sparse_network(benchmark, n, request):
-    skip_if_quick(request, "sparse_network", n)
-    m = build_sparse_network(n)
-    benchmark(_access_matrices, m)
+@pytest.mark.parametrize("spec,size", _PARAMS, ids=param_ids(_PARAMS))
+def test_matrices(
+    benchmark: Callable[..., object],
+    spec: ModelSpec,
+    size: int,
+    request: pytest.FixtureRequest,
+) -> None:
+    maybe_skip(request, spec, size)  # presumably the size-tier gate — see conftest
+    m = spec.build(size)  # setup: built before the timed call
+    benchmark(touch_matrices, m)  # timed call: touch_matrices on the built model
diff --git a/benchmarks/test_memory_id_alignment.py b/benchmarks/test_memory_id_alignment.py
new file mode 100644
index 00000000..8478f174
--- /dev/null
+++ b/benchmarks/test_memory_id_alignment.py
@@ -0,0 +1,72 @@
+"""
+Guard test for the timing ↔ memory test-id seam.
+
+``memory.py`` hand-rolls f-strings to label each measurement with the
+same node id pytest-benchmark produces (e.g.
+``benchmarks/test_matrices.py::test_matrices[basic-n=10]``). If a
+benchmark test function gets renamed and the matching f-string in
+``memory.py`` isn't updated, ``plot`` would silently end up with
+non-overlapping timing and memory sets — no error, just missing data.
+
+This test exercises both sides once and asserts every memory-emitted
+id is present in pytest's collection.
+"""
+
+from __future__ import annotations
+
+import re
+import subprocess
+import sys
+from pathlib import Path
+
+from benchmarks.memory import MEMORY_PHASES, _measurements
+from benchmarks.registry import REGISTRY
+
+
+def _collect_benchmark_ids() -> set[str]:
+    """Return the parametrized node ids pytest collects under ``benchmarks/``."""
+    repo_root = Path(__file__).resolve().parents[1]
+    # ``--collect-only`` (alias ``--co``) lists tests without running them.
+    result = subprocess.run(
+        [
+            sys.executable,
+            "-m",
+            "pytest",
+            "benchmarks/",
+            "--collect-only",
+            "-q",
+            "--no-header",
+        ],
+        capture_output=True,
+        text=True,
+        check=True,
+        cwd=repo_root,
+    )
+    # ``-q --collect-only`` emits one node id per line; trailing summary
+    # lines like "N tests collected" don't match the pattern and are dropped.
+    return {
+        line.strip()
+        for line in result.stdout.splitlines()
+        if re.match(r"^benchmarks/.*::.*\[.*\]$", line.strip())
+    }
+
+
+def test_memory_node_ids_match_pytest_collection() -> None:
+    """Every node id ``memory.py`` emits must appear in pytest's collection."""
+    collected = _collect_benchmark_ids()
+    assert collected, "pytest collected zero benchmark node ids — sanity broken"
+
+    # ``basic`` at its smallest size is cheap and declares every default
+    # phase, so it exercises every node-id format ``_measurements`` emits.
+    spec = REGISTRY["basic"]
+    size = spec.sizes[0]
+
+    mem_ids = {
+        tid for phase in MEMORY_PHASES for tid, _ in _measurements(phase, spec, size)
+    }
+
+    missing = sorted(mem_ids - collected)
+    assert not missing, (
+        "memory.py emits node ids that pytest doesn't collect "
+        "(test rename drift?):\n" + "\n".join(f"  {m}" for m in missing)
+    )
diff --git a/benchmarks/test_netcdf.py b/benchmarks/test_netcdf.py
new file mode 100644
index 00000000..a47203e0
--- /dev/null
+++ b/benchmarks/test_netcdf.py
@@ -0,0 +1,49 @@
+"""
+Benchmarks for the netCDF persistence round-trip.
+
+We track ``to_netcdf`` and ``read_netcdf`` separately because the cost split
+matters in practice: distributed workflows tend to do many reads of a single
+written artifact.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+from pathlib import Path
+
+import pytest
+
+from benchmarks.conftest import maybe_skip
+from benchmarks.phases import read_netcdf, write_netcdf
+from benchmarks.registry import NETCDF, ModelSpec, iter_params, param_ids
+
+_PARAMS = iter_params(NETCDF)
+
+
+@pytest.mark.parametrize("spec,size", _PARAMS, ids=param_ids(_PARAMS))
+def test_netcdf_write(
+    benchmark: Callable[..., object],
+    spec: ModelSpec,
+    size: int,
+    request: pytest.FixtureRequest,
+    tmp_path: Path,
+) -> None:
+    maybe_skip(request, spec, size)  # presumably the size-tier gate — see conftest
+    m = spec.build(size)  # setup: built before the timed call
+    out = tmp_path / "model.nc"
+    benchmark(write_netcdf, m, out)  # timed call: the netCDF write only
+
+
+@pytest.mark.parametrize("spec,size", _PARAMS, ids=param_ids(_PARAMS))
+def test_netcdf_read(
+    benchmark: Callable[..., object],
+    spec: ModelSpec,
+    size: int,
+    request: pytest.FixtureRequest,
+    tmp_path: Path,
+) -> None:
+    maybe_skip(request, spec, size)  # presumably the size-tier gate — see conftest
+    m = spec.build(size)  # setup: built before the timed call
+    out = tmp_path / "model.nc"
+    write_netcdf(m, out)  # untimed setup: produce the file that gets read back
+    benchmark(read_netcdf, out)  # timed call: the netCDF read only
diff --git a/benchmarks/test_pypsa_carbon_management.py b/benchmarks/test_pypsa_carbon_management.py
index 7f29a52e..a57763e2 100644
--- a/benchmarks/test_pypsa_carbon_management.py
+++ b/benchmarks/test_pypsa_carbon_management.py
@@ -1,43 +1,57 @@
-import pypsa
+from __future__ import annotations
+
+from collections.abc import Callable
+from typing import Any
+
 import pytest
 
 import linopy as lp
 
+# pypsa is an optional benchmark dep. Skip the whole module if it's missing
+# so the rest of the suite stays collectable without it.
+pypsa = pytest.importorskip("pypsa")
+
 
 @pytest.fixture(scope="module")
-def network():
+def network() -> Any:  # loaded once per module; presumably a pypsa.Network
     return pypsa.examples.carbon_management()
 
 
-def test_create_model_frozen(benchmark, network):
+def test_create_model_frozen(benchmark: Callable[..., object], network: Any) -> None:
     benchmark(network.optimize.create_model, freeze_constraints=True)
 
 
-def test_create_model_mutable(benchmark, network):
+def test_create_model_mutable(benchmark: Callable[..., object], network: Any) -> None:
     benchmark(network.optimize.create_model, freeze_constraints=False)
 
 
 @pytest.fixture(scope="module")
-def model_frozen(network):
+def model_frozen(network: Any) -> Any:  # built once per module, shared by tests
     return network.optimize.create_model(freeze_constraints=True)
 
 
 @pytest.fixture(scope="module")
-def model_mutable(network):
+def model_mutable(network: Any) -> Any:  # built once per module, shared by tests
     return network.optimize.create_model(freeze_constraints=False)
 
 
-def test_to_highspy_frozen(benchmark, model_frozen):
+def test_to_highspy_frozen(benchmark: Callable[..., object], model_frozen: Any) -> None:
     benchmark(lp.io.to_highspy, model_frozen)
 
 
-def test_to_highspy_mutable(benchmark, model_mutable):
+def test_to_highspy_mutable(
+    benchmark: Callable[..., object], model_mutable: Any
+) -> None:
     benchmark(lp.io.to_highspy, model_mutable)
 
 
-def test_to_highspy_mutable_no_names(benchmark, model_mutable):
+def test_to_highspy_mutable_no_names(
+    benchmark: Callable[..., object], model_mutable: Any
+) -> None:
     benchmark(lp.io.to_highspy, model_mutable, set_names=False)
 
 
-def test_to_highspy_frozen_no_names(benchmark, model_frozen):
+def test_to_highspy_frozen_no_names(
+    benchmark: Callable[..., object], model_frozen: Any
+) -> None:
     benchmark(lp.io.to_highspy, model_frozen, set_names=False)
diff --git a/benchmarks/test_solver_handoff.py b/benchmarks/test_solver_handoff.py
new file mode 100644
index 00000000..702d8a21
--- /dev/null
+++ b/benchmarks/test_solver_handoff.py
@@ -0,0 +1,56 @@
+"""
+Benchmarks for solver handoff (model -> native solver instance).
+
+Times each ``linopy.io.to_<solver>`` wrapper. These wrappers delegate to the
+same direct-API build path as the new stateful Solver API
+(``Solver.from_name(name, model, io_api="direct")``), so the numbers serve
+double duty: regression tracking for the wrappers, *and* for the underlying
+``Solver._build_direct`` paths. They've also been available for many releases
+— using them keeps the suite runnable on older linopy versions.
+
+The actual ``Solver.solve()`` runtime (i.e. solver-side algorithm time) is
+intentionally not benchmarked.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+
+import pytest
+
+from benchmarks.conftest import maybe_skip
+from benchmarks.phases import SOLVER_HANDOFFS
+from benchmarks.registry import ModelSpec, iter_params
+from linopy.solvers import available_solvers
+
+
+def _make_params() -> list[object]:
+    """Build one ``pytest.param`` per (solver handoff, spec, size) combination."""
+    # Solver-major order: each handoff expands over its phase's (spec, size) grid.
+    return [
+        pytest.param(
+            solver_name,
+            wrapper,
+            spec,
+            size,
+            id=f"{solver_name}-{spec.name}-n={size}",
+        )
+        for solver_name, phase, wrapper in SOLVER_HANDOFFS
+        for spec, size in iter_params(phase)
+    ]
+
+
+@pytest.mark.parametrize("solver_name,wrapper,spec,size", _make_params())
+def test_solver_handoff(
+    benchmark: Callable[..., object],
+    solver_name: str,
+    wrapper: Callable[..., object],
+    spec: ModelSpec,
+    size: int,
+    request: pytest.FixtureRequest,
+) -> None:
+    if solver_name not in available_solvers:
+        pytest.skip(f"{solver_name} not installed")
+    maybe_skip(request, spec, size)  # presumably the size-tier gate — see conftest
+    model = spec.build(size)  # setup: built before the timed call
+    benchmark(wrapper, model)  # timed call: model -> native solver instance
diff --git a/benchmarks/test_sweep.py b/benchmarks/test_sweep.py
new file mode 100644
index 00000000..3531aebb
--- /dev/null
+++ b/benchmarks/test_sweep.py
@@ -0,0 +1,33 @@
+"""Unit tests for sweep helpers (no venvs spun up)."""
+
+from __future__ import annotations
+
+import pytest
+
+from benchmarks.sweep import _snapshot_label
+
+
+@pytest.mark.parametrize(
+    "spec,expected",
+    [
+        # a plain release string is already a valid label
+        ("0.6.1", "0.6.1"),
+        ("0.5.0a1", "0.5.0a1"),
+        # git spec pinned to a sha -> just the sha (reproducible filename)
+        ("git+https://github.com/PyPSA/linopy.git@2993b95", "2993b95"),
+        # git spec pointing at a branch -> just the branch name
+        ("git+https://github.com/PyPSA/linopy.git@main", "main"),
+        # PEP 508 local file url -> sanitised, with no slashes left over
+        ("linopy @ file:///home/me/linopy", "file-home-me-linopy"),
+    ],
+)
+def test_snapshot_label(spec: str, expected: str) -> None:
+    """Each install spec maps to its expected label, a safe single path segment."""
+    got = _snapshot_label(spec)
+    assert got == expected
+    assert got and "/" not in got and " " not in got
+
+
+def test_snapshot_label_never_empty() -> None:
+    # "@@@" sanitises to nothing, so the helper must fall back to the "spec" stub.
+    assert _snapshot_label("@@@") == "spec"
diff --git a/benchmarks/walkthrough.md b/benchmarks/walkthrough.md
new file mode 100644
index 00000000..72d75267
--- /dev/null
+++ b/benchmarks/walkthrough.md
@@ -0,0 +1,318 @@
+---
+jupytext:
+  text_representation:
+    extension: .md
+    format_name: myst
+    format_version: 0.13
+    jupytext_version: 1.19.3
+kernelspec:
+  display_name: Python 3 (ipykernel)
+  language: python
+  name: python3
+---
+
+# Linopy benchmarks — CLI walkthrough
+
+> ⚠️ **This file is the source. Don't edit the `.ipynb` directly.**
+> Run `python -m benchmarks notebook --build` to (re)generate
+> `walkthrough.ipynb` from this `.md`, then open the `.ipynb` in
+> JupyterLab / PyCharm / VSCode to view and run cells. To change the
+> walkthrough's content, edit the `.md`, then re-run `--build`. The
+> `.ipynb` is gitignored.
+
+Internal performance tracking for `linopy`. This notebook shows the
+typer CLI working end-to-end: introspect what's registered, run a
+timing snapshot, diff two snapshots, render the comparison views
+inline.
+
+For what this notebook deliberately doesn't duplicate:
+
+- **Install + size tiers** → [`benchmarks/README.md`](README.md)
+- **Every CLI flag** → `python -m benchmarks --help` (rich-rendered);
+  `--help` on any subcommand drills in.
+
+## What's measured
+
+| Phase            | Test file                         | Measures                                                       |
+| ---------------- | --------------------------------- | -------------------------------------------------------------- |
+| `build`          | `test_build.py`                   | constructing variables / expressions / constraints / objective |
+| `matrices`       | `test_matrices.py`                | `A`, `b`, `c`, bounds, labels, `Q` for QP                      |
+| `lp_write`       | `test_lp_write.py`                | `model.to_file(...)` — LP / MPS serialization                  |
+| `netcdf`         | `test_netcdf.py`                  | `to_netcdf` / `read_netcdf` round-trip                         |
+| `solver_handoff` | `test_solver_handoff.py`          | `lp.io.to_highspy` / `to_gurobipy` / `to_mosek` / `to_xpress`  |
+| end-to-end       | `test_pypsa_carbon_management.py` | fixed PyPSA model → highspy; sweeps `freeze_constraints`       |
+
+Solver algorithm runtime is intentionally out of scope.
+
+## Setup
+
+Locate the repo so the shell cells below can run `python -m benchmarks`
+regardless of where Jupyter was launched, and pick a tempdir for the
+snapshot/plot files we'll produce.
+
+```{code-cell} ipython3
+import os
+import sys
+import tempfile
+from pathlib import Path
+
+# CI sets LINOPY_REPO_ROOT; locally we walk up from cwd.
+_root = os.environ.get("LINOPY_REPO_ROOT") or next(
+    (
+        str(p) for p in [Path.cwd().resolve(), *Path.cwd().resolve().parents]
+        if (p / "benchmarks" / "registry.py").exists()
+    ),
+    None,
+)
+if _root is None:
+    raise RuntimeError(
+        "Could not locate linopy repo root. Set LINOPY_REPO_ROOT or launch "
+        "Jupyter from somewhere inside the repo."
+    )
+
+# Subshells launched by ``!``-cells inherit cwd, env, and PYTHONPATH.
+os.chdir(_root)
+_pypath = [_root, os.environ.get("PYTHONPATH", "")]  # drop the entry if unset
+os.environ["PYTHONPATH"] = os.pathsep.join(p for p in _pypath if p)  # ";" on Windows
+# ``FORCE_COLOR`` keeps typer's ``--help`` panels coloured in the notebook:
+# Rich/click drop colour when stdout isn't a TTY, and the ``!`` pipe isn't.
+os.environ["FORCE_COLOR"] = "1"
+
+_tmp = Path(tempfile.mkdtemp(prefix="bench-walkthrough-"))
+baseline = _tmp / "baseline.json"
+candidate = _tmp / "candidate.json"
+scatter_html = _tmp / "scatter.html"
+compare_html = _tmp / "compare.html"
+
+print(f"repo root: {_root}")
+print(f"tempdir:   {_tmp}")
+```
+
+## Introspect the registry
+
+`list` enumerates registered specs. `--details` shows the feature tags
+and size range each spec covers, so you can pick a focused target.
+
+```{code-cell} ipython3
+!python -m benchmarks list --details
+```
+
+`show <name>` drills into one spec — every attribute the registry
+exposes, including which phases it's eligible for and the
+`quick_threshold` / `long_threshold` gating its sizes.
+
+```{code-cell} ipython3
+!python -m benchmarks show basic
+```
+
+`filter` narrows by feature tag (`quadratic`, `integer`, `sos`, …) or
+phase tag — useful when you only care about a subset of the suite.
+
+```{code-cell} ipython3
+!python -m benchmarks filter --feature quadratic
+```
+
+## Run a timing snapshot
+
+`run` is the main timing entry point. Below we run twice with
+`--quick --phase build` (~10 s each) to get a baseline / candidate
+pair we can diff. On a real PR you'd run once on `master` and once on
+your branch.
+
+```{code-cell} ipython3
+!python -m benchmarks run --quick --phase build --json {baseline}
+```
+
+```{code-cell} ipython3
+!python -m benchmarks run --quick --phase build --json {candidate}
+```
+
+The diff between two `--quick` runs of the same code is just
+measurement noise — that's expected. On a real PR the numbers below
+would actually move.
+
+## Diff snapshots
+
+### Text table — `compare`
+
+`compare` wraps `pytest-benchmark compare` with opinionated defaults:
+group by full test name, sort by `min`, show min + IQR. One mini-table
+per test with the baseline + candidate rows and a relative-speedup
+factor flagging the slower one. Scales to 30+ tests; the output just gets long.
+
+```{code-cell} ipython3
+!python -m benchmarks compare {baseline} {candidate}
+```
+
+### Scatter view — exploratory plot
+
+x = baseline cost on a log axis, y = ratio (candidate / baseline),
+colour = absolute Δ. **Top-right = slow tests that got slower** —
+the "fix this" zone. Top-left = cheap tests with big ratio swings
+(noise, not real change). Bottom-right = already-slow tests that
+didn't move. Plotting both resolves the absolute-vs-relative tension:
+either axis alone has a blind spot.
+
+```{code-cell} ipython3
+!python -m benchmarks plot --view scatter {baseline} {candidate} -o {scatter_html}
+
+from IPython.display import HTML
+HTML(scatter_html.read_text())
+```
+
+### Compare view — sorted-Δ bar chart
+
+The "did this PR regress anything, ranked by impact" picture. Bars
+sorted by absolute time delta by default (`--sort relative` switches
+to percent). Diverging colour around zero.
+
+```{code-cell} ipython3
+!python -m benchmarks plot --view compare {baseline} {candidate} -o {compare_html}
+HTML(compare_html.read_text())
+```
+
+### In Python — load straight from file
+
+The CLI views above all sit on one function, `load_long_df`, which reads
+snapshot JSON files (timing *or* memory) into a tidy frame — `snapshot`,
+`test_id`, `phase`, `model`, `size`, `value` — plus the unit. Re-exported
+from the package so you can do your own analysis without pulling in
+plotly:
+
+```{code-cell} ipython3
+from benchmarks import load_long_df
+
+df, unit = load_long_df([baseline, candidate])
+print(f"unit: {unit}")
+df.head()
+```
+
+Pivot to one column per snapshot and the comparison is a couple of pandas
+lines — the same baseline-vs-candidate diff the `compare` view draws,
+here as a DataFrame you can sort, filter, or feed onward:
+
+```{code-cell} ipython3
+wide = df.pivot_table(
+    index=["phase", "model", "size"], columns="snapshot", values="value"
+)
+wide["ratio"] = wide["candidate"] / wide["baseline"]
+wide.sort_values("ratio", ascending=False)
+```
+
+(Two `--quick` runs of the same code, so the ratios are ~1 ± noise; on a
+real PR they'd move. The same frame feeds the plot views — pass the files
+to `python -m benchmarks plot` for the rendered version.)
+
+## Memory snapshots
+
+`memory save <label>` runs benchmarks under `memray.Tracker` and
+writes peak allocations (MiB) per `(phase, spec, size)` to
+`.benchmarks/memory/<label>.json`. The model is built **outside** the
+tracked region so peak reflects only the phase work, not model
+construction.
+
+```{code-cell} ipython3
+!python -m benchmarks memory save baseline_mem --quick --phase build
+```
+
+```{code-cell} ipython3
+!python -m benchmarks memory save candidate_mem --quick --phase build
+```
+
+`memory compare` prints a per-test table of the two labels with
+percent change — same shape as the timing `compare`, different
+metric. Tests present in only one snapshot show `—` for the missing
+column.
+
+```{code-cell} ipython3
+!python -m benchmarks memory compare baseline_mem candidate_mem
+```
+
+For cross-version memory tracking (analogous to `sweep` for timing),
+use `memory sweep <v1> <v2> ...` — same per-version venv shape, same
+peak-allocation metric.
+
+## Benchmarking custom things — the `bench` API
+
+The CLI measures the fixed registry grid. When you want to time or
+memory-profile *something the registry doesn't have* — a builder called
+with odd arguments, a phase verb on a model you built by hand, a one-off
+lambda — reach for `benchmarks.bench`. It measures in-process on the
+**current** tree and hands back a result you can inspect or drop into a
+snapshot the `plot` / `compare` machinery already reads. (It can't feed
+`sweep`, which runs pytest in per-version subprocesses — promote a model
+to `benchmarks/models/` to sweep it.)
+
+`bench.time` times any callable with the suite's min-of-N convention. It
+is *not* pytest-benchmark's calibrated timer, so compare `bench` numbers
+only to other `bench` numbers:
+
+```{code-cell} ipython3
+from benchmarks import REGISTRY, bench
+
+bench.time(REGISTRY["basic"].build, 100, rounds=5)
+```
+
+Any callable works — including a phase verb applied to a model the
+registry has never heard of. `bench.memory` profiles peak allocations through
+the same `memray` path the `memory` command uses:
+
+```{code-cell} ipython3
+import linopy
+from benchmarks.phases import touch_matrices
+
+m = linopy.Model()
+x = m.add_variables(coords=[range(2000)], dims=["i"], name="x")
+m.add_constraints(x >= 1)
+m.add_objective(x.sum())
+
+bench.memory(touch_matrices, m)
+```
+
+`bench.compare` runs several callables and collects a `ResultSet`.
+`to_snapshot` writes it in the on-disk shape `load_long_df` reads — the
+seam every plot view sits on — so in-process results round-trip through
+the existing tooling without a detour:
+
+```{code-cell} ipython3
+from benchmarks import load_long_df
+
+rs = bench.compare(
+    {
+        "listcomp": lambda: [i * i for i in range(10_000)],
+        "map": lambda: list(map(lambda i: i * i, range(10_000))),
+    },
+    rounds=20,
+)
+
+bench_snap = _tmp / "bench.json"
+rs.to_snapshot(bench_snap)
+
+df, unit = load_long_df([bench_snap])
+print(f"unit: {unit}")
+df
+```
+
+Those label-keyed ids land in the `other` bucket. For a size-`scaling`
+plot, write each result with `model=` / `size=` / `phase=` so the id
+parses into those columns — `plot` then treats it like any suite
+snapshot:
+
+    bench.time(REGISTRY["basic"].build, 100).to_snapshot(
+        snap, model="basic", size=100, phase="build"
+    )
+
+## Extending the suite
+
+Add a new model:
+
+1. Drop `benchmarks/models/<name>.py` with a `build_<name>(size) -> linopy.Model`.
+2. Build a `ModelSpec`, call `register(...)` at module scope, declare
+   realistic `quick_threshold` / `long_threshold` so the smoke run
+   stays fast.
+3. Import it in `benchmarks/models/__init__.py` so registration fires
+   on first import.
+
+Every phase test whose phase the spec declares picks it up
+automatically via `iter_params(phase)`. The first introspection
+section of this notebook will list your new spec on the next run.
diff --git a/pyproject.toml b/pyproject.toml
index 19d0abb3..49071402 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -82,11 +82,49 @@ dev = [
     "highspy",
     "jupyter",
 ]
+# Every direct dep that affects measurement is pinned exactly so the
+# environment stays stable over time on the same machine — deltas
+# between two runs then reflect linopy changes, not a numpy/scipy/pytest
+# upgrade. Absolute numbers are still machine-dependent (CPU / cache /
+# memory bandwidth).
+#
+# ``sweep`` installs these into each per-version venv, so the same pin
+# set drives every linopy version in a sweep call — only ``linopy``
+# varies. Transitive deps resolve fresh per venv; uv's deterministic
+# resolution gives identical results across versions within one sweep.
+#
+# ``highspy`` follows the project-wide ``!=1.14.0`` exclusion (see the
+# ``solvers`` extra).
 benchmarks = [
-    "pytest-benchmark",
-    "pypsa",
-    "highspy>=1.7.1",
-    "pytest-memray",
+    # Perf-relevant deps measured directly by the suite. Individual
+    # Dependabot PRs → CodSpeed attributes deltas to specific bumps.
+    "highspy==1.13.1",
+    "netcdf4==1.7.4",
+    # Perf-sensitive runtime deps. Pinned here rather than in
+    # ``[project.dependencies]`` so downstream linopy consumers keep their
+    # loose resolve while the benchmark environment is fixed.
+    #
+    # ``numpy`` is held at the last 1.x release: linopy <0.5.1 declares
+    # ``numpy<2.0``, and we want ``sweep`` to cover the older versions.
+    # Bump to a 2.x release once we drop pre-0.5.1 from sweep coverage —
+    # ``sweep --smoke`` is the right tool to re-verify when that happens.
+    "numpy==1.26.4",
+    "scipy==1.16.3",
+    "xarray==2025.1.2",
+    "pandas==2.3.3",
+    "polars==1.35.2",
+    "dask==2025.11.0",
+    # Measurement scaffolding + CLI/notebook tooling. Pinned for
+    # reproducibility but grouped in dependabot.yml so bumps batch into
+    # one PR instead of cluttering review.
+    "pytest==9.0.3",
+    "pytest-benchmark==5.2.3",
+    "pytest-memray==1.8.0",
+    "pytest-codspeed==5.0.3",
+    "jupytext==1.19.3",
+    "nbconvert==7.17.1",
+    "typer==0.26.2",
+    "plotly==6.7.0",
 ]
 solvers = [
     "gurobipy",