diff --git a/.github/dependabot.yml b/.github/dependabot.yml index f8f779b5..e7750e98 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -12,3 +12,30 @@ updates: github-actions: patterns: - '*' + +# Pinned ``[benchmarks]`` extra in pyproject.toml. One PR per dep bump +# → CodSpeed CI runs and attributes any perf delta to that specific +# bump. Keeps the cross-version ``sweep`` baseline (lockfile-pinned) +# stable while still surfacing upstream perf changes per-PR with +# eyes-open review. Loose ``[project.dependencies]`` (numpy, scipy, ...) +# have no version specifier so Dependabot leaves them alone — only the +# ``==`` pins in ``[benchmarks]`` produce PRs. +- package-ecosystem: pip + directory: / + schedule: + interval: monthly + open-pull-requests-limit: 5 + groups: + # Measurement scaffolding + CLI/notebook tooling. Perf-irrelevant — + # they don't move CodSpeed signal, so batching into one PR cuts + # review noise. Perf-relevant deps (numpy, xarray, highspy, …) stay + # un-grouped so each gets its own attributed CodSpeed delta. + benchmark-tooling: + patterns: + - pytest + - pytest-benchmark + - pytest-memray + - pytest-codspeed + - nbconvert + - typer + - plotly diff --git a/.github/workflows/benchmark-smoke.yml b/.github/workflows/benchmark-smoke.yml new file mode 100644 index 00000000..59f6462f --- /dev/null +++ b/.github/workflows/benchmark-smoke.yml @@ -0,0 +1,81 @@ +name: Benchmark smoke + +# Runs the internal benchmark suite under --quick --benchmark-disable so every +# model spec is built and every phase fires at least once, but no timings are +# recorded. The goal is "did refactor X break a model spec?" — not regression +# tracking, which is done out-of-CI on dedicated hardware. 
on:
  push:
    branches: [master]
  pull_request:
    # '**' rather than '*': in GitHub's filter syntax '*' does not match '/',
    # so PRs targeting e.g. ``release/1.x`` would silently skip this workflow.
    branches: ['**']

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  smoke:
    name: Benchmark smoke (quick)
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v6
        with:
          fetch-depth: 0  # setuptools_scm

      - name: Set up Python 3.12
        uses: actions/setup-python@v6
        with:
          python-version: "3.12"

      - name: Install package and benchmark dependencies
        run: |
          python -m pip install uv
          # [dev] for pytest + netcdf4; [benchmarks] for pytest-benchmark + pypsa.
          uv pip install --system -e ".[dev,benchmarks]"

      - name: Run benchmark smoke
        run: |
          python -m benchmarks smoke

      - name: Execute walkthrough notebook
        # Catches doc rot — walkthrough.md is the canonical CLI walkthrough
        # for the suite and must stay runnable end-to-end.
        run: |
          python -m benchmarks notebook

  codspeed:
    name: CodSpeed (micro regression detection)
    runs-on: ubuntu-latest
    # Cachegrind is ~10–20× slower than native, so we restrict to ``--quick``
    # (smallest size per spec) and skip PyPSA end-to-end. The signal we want
    # here is "did this PR change the instruction count of the hot paths?";
    # full wall-clock cross-version comparison stays in ``sweep``.
    steps:
      - uses: actions/checkout@v6
        with:
          fetch-depth: 0  # setuptools_scm

      - name: Set up Python 3.12
        uses: actions/setup-python@v6
        with:
          python-version: "3.12"

      - name: Install pinned benchmark environment
        # Install from the pinned ``[benchmarks]`` extra (not the lockfile)
        # so Dependabot can auto-detect pyproject.toml and propose bumps
        # to top-level deps. Each bump → one attributed CodSpeed delta.
        # The full transitive lockfile is reserved for cross-version
        # ``sweep`` reproducibility, where machine variance matters more.
        run: |
          python -m pip install uv
          uv pip install --system -e ".[dev,benchmarks]"

      - name: Run benchmarks under CodSpeed
        uses: CodSpeedHQ/action@v3
        with:
          token: ${{ secrets.CODSPEED_TOKEN }}
          run: |
            pytest benchmarks/ --quick --codspeed
+> `benchmarks/` (plural) is this internal suite. -```bash -pip install -e ".[benchmarks]" -``` - -## Running benchmarks +## Install ```bash -# Quick smoke test (small sizes only) -pytest benchmarks/ --quick - -# Full timing benchmarks -pytest benchmarks/test_build.py benchmarks/test_lp_write.py benchmarks/test_matrices.py - -# Run a specific model -pytest benchmarks/test_build.py -k basic -``` - -## Comparing timing between branches - -```bash -# Save baseline results on master -git checkout master -pytest benchmarks/test_build.py --benchmark-save=master - -# Switch to feature branch and compare -git checkout my-feature -pytest benchmarks/test_build.py --benchmark-save=my-feature --benchmark-compare=0001_master - -# Compare saved results without re-running -pytest-benchmark compare 0001_master 0002_my-feature --columns=median,iqr +uv sync --extra dev --extra benchmarks +source .venv/bin/activate ``` -Results are stored in `.benchmarks/` (gitignored). +`pypsa` is optional — `pypsa_scigrid` and +`test_pypsa_carbon_management.py` skip gracefully without it. Install +when you need them: `uv pip install pypsa`. -## Memory benchmarks +The `[benchmarks]` extra in `pyproject.toml` pins every direct dep that +affects measurement (`numpy`, `scipy`, `xarray`, `pandas`, `polars`, +`dask`, etc.). `sweep` installs these into each per-version venv, so +"same deps, only linopy varies" comes for free without a separate +lockfile — bump the pins in pyproject and the next sweep picks them up. -`memory.py` runs each test in a separate process with pytest-memray to get accurate per-test peak memory (including C/numpy allocations). Results are saved as JSON and can be compared across branches. - -By default, only the build phase (`test_build.py`) is measured. Unlike timing benchmarks where `benchmark()` isolates the measured function, memray tracks all allocations within a test — including model construction in setup. 
This means LP write and matrix tests would report build + phase memory combined, making the phase-specific contribution impossible to isolate. Since model construction dominates memory usage, measuring build alone gives the most actionable numbers. +## Open the walkthrough ```bash -# Save baseline on master -git checkout master -python benchmarks/memory.py save master - -# Save feature branch -git checkout my-feature -python benchmarks/memory.py save my-feature - -# Compare -python benchmarks/memory.py compare master my-feature - -# Quick mode (smaller sizes, faster) -python benchmarks/memory.py save master --quick - -# Measure a specific phase (includes build overhead) -python benchmarks/memory.py save master --test-path benchmarks/test_lp_write.py +python -m benchmarks notebook --build # (re)generate walkthrough.ipynb +jupyter lab benchmarks/walkthrough.ipynb # ...or PyCharm / VSCode ``` -Results are stored in `.benchmarks/memory/` (gitignored). Requires Linux or macOS (memray is not available on Windows). - -> **Note:** Small tests (~5 MiB) are near the import-overhead floor and may show noise of ~1 MiB between runs. Focus on larger tests for meaningful memory comparisons. Do not combine `--memray` with timing benchmarks — memray adds ~2x overhead that invalidates timing results. 
- -## Models - -| Model | Description | Sizes | -|-------|-------------|-------| -| `basic` | Dense N*N model, 2*N^2 vars/cons | 10 — 1600 | -| `knapsack` | N binary variables, 1 constraint | 100 — 1M | -| `expression_arithmetic` | Broadcasting, scaling, summation across dims | 10 — 1000 | -| `sparse_network` | Ring network with mismatched bus/line coords | 10 — 1000 | -| `pypsa_scigrid` | Real power system (requires `pypsa`) | 10 — 200 snapshots | - -## Phases - -| Phase | File | What it measures | -|-------|------|------------------| -| Build | `test_build.py` | Model construction (add_variables, add_constraints, add_objective) | -| LP write | `test_lp_write.py` | Writing the model to an LP file | -| Matrices | `test_matrices.py` | Generating sparse matrices (A, b, c, bounds) from the model | - -## Adding a new model +The `.md` is the source of truth; the `.ipynb` is a disposable, +gitignored build artifact. Edit the `.md`, re-run `--build`, re-open. +Same workflow in any editor. -1. Create `benchmarks/models/my_model.py` with a `build_my_model(n)` function and a `SIZES` list -2. Add parametrized tests in the relevant `test_*.py` files -3. Add a quick threshold in `conftest.py` +CI executes the walkthrough end-to-end on every PR +(`python -m benchmarks notebook`) so the examples can't silently rot. diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py index 6bf202cc..2f476484 100644 --- a/benchmarks/__init__.py +++ b/benchmarks/__init__.py @@ -1 +1,114 @@ -"""Linopy benchmark suite — run with ``pytest benchmarks/`` (use ``--quick`` for smaller sizes).""" +""" +Linopy benchmark suite. + +Run with ``pytest benchmarks/`` (use ``--quick`` for smaller sizes). + +This package also exposes a **reusable model registry** for any test, profiling +session, or example that wants ready-made linopy models of varying sizes and +features. 
Each entry exposes a ``build(size) -> linopy.Model`` callable plus +metadata:: + + from benchmarks import REGISTRY, QUADRATIC + + # Look up by name + model = REGISTRY["basic"].build(100) + + # Iterate / filter + for spec in REGISTRY.values(): + m = spec.build(spec.sizes[0]) + ... + + from benchmarks import filter_by + qp_specs = filter_by(has_feature=QUADRATIC) +""" + +from __future__ import annotations + +from typing import TYPE_CHECKING + +if TYPE_CHECKING: + from pathlib import Path + + import pandas as pd + + from benchmarks.snapshot import Metric + +# Importing the models package triggers each module's ``register(...)`` call. +from benchmarks import bench, models # noqa: F401, E402 + + +def load_long_df( + snapshots: list[Path], metric: Metric = "min" +) -> tuple[pd.DataFrame, str]: + """ + Load one or more benchmark JSON snapshots into a tidy DataFrame. + + Thin re-export of :func:`benchmarks.snapshot.load_long_df` so callers + can do their own analysis without importing the plotting module + (which pulls in plotly). Returns ``(df, unit)`` where ``df`` has one + row per ``(snapshot, test_id)`` with columns ``snapshot, test_id, + phase, model, size, value``, and ``unit`` is ``"s"`` (timing) or + ``"MiB"`` (memory). 
+ """ + from benchmarks.snapshot import load_long_df as _impl + + return _impl(snapshots, metric) + + +from benchmarks.registry import ( # noqa: F401, E402 — re-export + ALL_FEATURES, + ALL_PHASES, + BINARY, + BUILD, + CONTINUOUS, + DEFAULT_PHASES, + INTEGER, + LP_WRITE, + MASKED, + MATRICES, + NETCDF, + PIECEWISE, + QUADRATIC, + REGISTRY, + SOS, + TO_GUROBIPY, + TO_HIGHSPY, + TO_MOSEK, + TO_XPRESS, + ModelSpec, + filter_by, + get, + iter_params, + param_ids, + register, +) + +__all__ = [ + "ALL_FEATURES", + "ALL_PHASES", + "BINARY", + "BUILD", + "CONTINUOUS", + "DEFAULT_PHASES", + "INTEGER", + "LP_WRITE", + "MASKED", + "MATRICES", + "ModelSpec", + "NETCDF", + "PIECEWISE", + "QUADRATIC", + "REGISTRY", + "SOS", + "TO_GUROBIPY", + "TO_HIGHSPY", + "TO_MOSEK", + "TO_XPRESS", + "bench", + "filter_by", + "get", + "iter_params", + "load_long_df", + "param_ids", + "register", +] diff --git a/benchmarks/__main__.py b/benchmarks/__main__.py new file mode 100644 index 00000000..34a28439 --- /dev/null +++ b/benchmarks/__main__.py @@ -0,0 +1,5 @@ +"""Allow ``python -m benchmarks ``.""" + +from benchmarks.cli import app + +app() diff --git a/benchmarks/bench.py b/benchmarks/bench.py new file mode 100644 index 00000000..682523f2 --- /dev/null +++ b/benchmarks/bench.py @@ -0,0 +1,356 @@ +""" +Ad-hoc benchmarking of arbitrary callables on the *current* linopy tree. 
+ +Where the pytest suite measures the fixed registry grid and ``sweep`` +measures across installed linopy versions, ``bench`` is for the +interactive middle: time or memory-profile any callable — a registry +builder, a phase verb applied to a model you built by hand, or a one-off +lambda — get a result object back, and either inspect it as a DataFrame +or drop it into a snapshot the existing ``plot`` / ``compare`` machinery +already understands:: + + from benchmarks import bench, REGISTRY + + r = bench.time(REGISTRY["basic"].build, 100) + r # rich repr in a notebook + r.to_snapshot("a.json", model="basic", size=100, phase="build") + + bench.compare({"v1": f1, "v2": f2}).to_snapshot("cmp.json") + +This plugs into the *output* side of the pipeline (snapshot JSON read by +``snapshot.load_long_df``), not into ``sweep``: a sweep runs pytest inside +per-version venvs as subprocesses, so it can only measure importable +registry models — an in-process callable can't cross that boundary. To +sweep a custom model across versions, promote it to ``benchmarks/models/``. + +**Methodology.** Timing is built on :class:`timeit.Timer`: an +``autorange`` calibration picks the inner iteration count (so timer +resolution doesn't dominate fast callables), then the per-iteration time +is sampled across rounds with the suite's min-of-N convention (the +fastest sample approximates the no-noise floor). It is *not* +pytest-benchmark's calibrated timer, so absolute numbers are not +interchangeable with suite snapshots — compare ``bench`` to ``bench`` and +suite to suite. 
+""" + +from __future__ import annotations + +from collections.abc import Callable, Sequence +from dataclasses import dataclass, field +from pathlib import Path +from statistics import mean, median, stdev +from timeit import Timer +from typing import TYPE_CHECKING, Any, Literal + +from benchmarks.snapshot import ( + parse_test_id, + synth_test_id, + write_memory_snapshot, + write_timing_snapshot, +) + +if TYPE_CHECKING: + import pandas as pd + +__all__ = [ + "MemoryResult", + "ResultSet", + "TimingResult", + "compare", + "memory", + "time", +] + +# Floor / cap on the auto-tuned round count when ``rounds`` is unset. +# The floor guarantees a meaningful min-of-N even for slow callables that +# blow past ``min_time`` in one shot; the cap stops a microsecond callable +# from spinning forever. +_ROUND_FLOOR = 5 +_ROUND_CAP = 10_000 + + +def _fn_name(fn: Callable[..., object]) -> str: + """Best-effort label for a callable (``functools.partial`` has no name).""" + return getattr(fn, "__name__", None) or repr(fn) + + +def _row(test_id: str, value: float) -> dict[str, object]: + """One ``load_long_df``-shaped row for an in-process result.""" + phase, model, size = parse_test_id(test_id) + return { + "snapshot": test_id, + "test_id": test_id, + "phase": phase, + "model": model, + "size": size, + "value": value, + } + + +def _frame(rows: list[dict[str, object]]) -> pd.DataFrame: + """Build a DataFrame with the exact column set/dtype of ``load_long_df``.""" + import pandas as pd + + df = pd.DataFrame( + rows, columns=["snapshot", "test_id", "phase", "model", "size", "value"] + ) + df["size"] = df["size"].astype("Int64") + return df + + +# --- Result types ---------------------------------------------------------- + + +@dataclass(frozen=True) +class TimingResult: + """One timed callable: per-round stats with ``min`` as the headline.""" + + label: str + stats: dict[str, float] + unit: Literal["s"] = "s" + + def to_snapshot( + self, + path: str | Path, + *, + model: str | None = 
None, + size: int | None = None, + phase: str | None = None, + ) -> Path: + """Write a pytest-benchmark-shaped timing snapshot (seconds).""" + test_id = synth_test_id(self.label, model=model, size=size, phase=phase) + return write_timing_snapshot(path, [(test_id, dict(self.stats))]) + + def to_df(self) -> pd.DataFrame: + """``load_long_df``-shaped frame (one row, ``value`` = min seconds).""" + return _frame([_row(self.label, self.stats["min"])]) + + def __repr__(self) -> str: + return ( + f"TimingResult({self.label!r}, min={self.stats['min']:.4g}s, " + f"rounds={int(self.stats['rounds'])}x{int(self.stats.get('iterations', 1))})" + ) + + def _repr_html_(self) -> str: + rows = [ + ("min", f"{self.stats['min']:.4g} s"), + ("median", f"{self.stats['median']:.4g} s"), + ("mean", f"{self.stats['mean']:.4g} s"), + ("max", f"{self.stats['max']:.4g} s"), + ("stddev", f"{self.stats['stddev']:.4g} s"), + ("rounds", int(self.stats["rounds"])), + ("iterations", int(self.stats.get("iterations", 1))), + ] + return _html_table("TimingResult", self.label, rows) + + +@dataclass(frozen=True) +class MemoryResult: + """One memory-profiled callable: peak RSS in MiB.""" + + label: str + peak_mib: float + unit: Literal["MiB"] = "MiB" + + def to_snapshot( + self, + path: str | Path, + *, + model: str | None = None, + size: int | None = None, + phase: str | None = None, + ) -> Path: + """Write a memory.py-shaped snapshot (peak MiB).""" + test_id = synth_test_id(self.label, model=model, size=size, phase=phase) + return write_memory_snapshot(path, self.label, {test_id: self.peak_mib}) + + def to_df(self) -> pd.DataFrame: + """``load_long_df``-shaped frame (one row, ``value`` = peak MiB).""" + return _frame([_row(self.label, self.peak_mib)]) + + def __repr__(self) -> str: + return f"MemoryResult({self.label!r}, peak={self.peak_mib:.1f} MiB)" + + def _repr_html_(self) -> str: + return _html_table( + "MemoryResult", self.label, [("peak", f"{self.peak_mib:.1f} MiB")] + ) + + 
@dataclass(frozen=True)
class ResultSet:
    """
    Several results of one kind (all timing, or all memory).

    ``to_snapshot`` writes every result into a single file keyed by its
    label — the natural "compare these N variants" case. For
    size-parametrized ``scaling`` plots, write each result individually
    with ``model``/``size``/``phase`` instead.

    ``results`` defaults to an empty list, so every method here must cope
    with an empty set.
    """

    results: list[TimingResult | MemoryResult] = field(default_factory=list)
    unit: Literal["s", "MiB"] = "s"

    def to_snapshot(self, path: str | Path) -> Path:
        """Write all results into one snapshot, each keyed by its label."""
        if self.unit == "s":
            return write_timing_snapshot(
                path,
                [
                    (r.label, dict(r.stats))
                    for r in self.results
                    if isinstance(r, TimingResult)
                ],
            )
        peaks = {
            r.label: r.peak_mib for r in self.results if isinstance(r, MemoryResult)
        }
        return write_memory_snapshot(path, "compare", peaks)

    def to_df(self) -> pd.DataFrame:
        """
        Concatenate the per-result frames (shares ``load_long_df`` columns).

        An empty set yields an empty frame with the same columns instead of
        letting ``pd.concat`` raise on an empty list.
        """
        if not self.results:
            return _frame([])

        import pandas as pd

        return pd.concat([r.to_df() for r in self.results], ignore_index=True)

    def __repr__(self) -> str:
        labels = ", ".join(r.label for r in self.results)
        return f"ResultSet(unit={self.unit!r}, [{labels}])"

    def _repr_html_(self) -> str:
        rows = [
            (
                r.label,
                f"{r.stats['min']:.4g} s"
                if isinstance(r, TimingResult)
                else f"{r.peak_mib:.1f} MiB",
            )
            for r in self.results
        ]
        return _html_table("ResultSet", self.unit, rows)
" + ) + + +# --- Entry points ---------------------------------------------------------- + + +def time( + fn: Callable[..., object], + /, + *args: object, + rounds: int | None = None, + warmup: int = 1, + min_time: float = 0.5, + label: str | None = None, + **kwargs: object, +) -> TimingResult: + """ + Time ``fn(*args, **kwargs)`` and return a :class:`TimingResult`. + + Built on :class:`timeit.Timer`: an ``autorange`` calibration first + picks the inner iteration count so timer resolution doesn't dominate + for fast callables (the bespoke "one call per round" loop this + replaced was unstable in exactly that regime). Each round then runs + that many calibrated iterations; the per-iteration time is the + sample. ``warmup`` rounds are discarded to prime caches. + + With ``rounds`` set, run exactly that many rounds; otherwise + auto-tune — keep going until cumulative timed wall-clock reaches + ``min_time`` (floor of 5 rounds, hard cap). The headline number is + ``stats["min"]``; ``stats["iterations"]`` records the calibrated + inner count. + + This is *not* pytest-benchmark's calibrated timer — ``bench`` numbers + are only comparable to other ``bench`` numbers, not to suite + snapshots. + """ + timer = Timer(lambda: fn(*args, **kwargs)) + + # Calibrate inner iterations so a single round is long enough that + # ``perf_counter`` granularity is negligible (timeit targets ~0.2 s). 
+ number, _ = timer.autorange() + + for _ in range(max(0, warmup)): + timer.timeit(number) + + samples: list[float] = [] # per-iteration seconds + if rounds is not None: + samples = [ + t / number for t in timer.repeat(repeat=max(1, rounds), number=number) + ] + else: + total = 0.0 + while True: + t = timer.timeit(number) + samples.append(t / number) + total += t + if len(samples) >= _ROUND_FLOOR and total >= min_time: + break + if len(samples) >= _ROUND_CAP: + break + + stats = { + "min": min(samples), + "max": max(samples), + "mean": mean(samples), + "median": median(samples), + "stddev": stdev(samples) if len(samples) > 1 else 0.0, + "rounds": float(len(samples)), + "iterations": float(number), + } + return TimingResult(label=label or _fn_name(fn), stats=stats) + + +def memory( + fn: Callable[..., object], + /, + *args: object, + repeats: int = 1, + label: str | None = None, + **kwargs: object, +) -> MemoryResult: + """ + Peak-RSS profile ``fn(*args, **kwargs)`` and return a :class:`MemoryResult`. + + Thin wrapper over :func:`benchmarks.memory.measure_peak`; ``repeats > 1`` + keeps the minimum peak. Raises on Windows (no ``memray``). + """ + from benchmarks.memory import measure_peak + + peak = measure_peak(lambda: fn(*args, **kwargs), repeats=repeats) + return MemoryResult(label=label or _fn_name(fn), peak_mib=peak) + + +def compare( + cases: dict[str, Callable[[], object]], + *, + kind: Literal["time", "memory"] = "time", + **opts: Any, +) -> ResultSet: + """ + Run each zero-arg callable in ``cases`` and collect a :class:`ResultSet`. + + ``kind`` selects timing (default) or memory; ``opts`` are forwarded to + :func:`time` / :func:`memory` (e.g. ``rounds=``, ``repeats=``). The + dict key becomes each case's label. 
+ """ + if kind == "time": + results: list[TimingResult | MemoryResult] = [ + time(fn, label=name, **opts) for name, fn in cases.items() + ] + return ResultSet(results=results, unit="s") + if kind == "memory": + results = [memory(fn, label=name, **opts) for name, fn in cases.items()] + return ResultSet(results=results, unit="MiB") + raise ValueError(f"kind must be 'time' or 'memory', got {kind!r}") diff --git a/benchmarks/cli.py b/benchmarks/cli.py new file mode 100644 index 00000000..9fde533f --- /dev/null +++ b/benchmarks/cli.py @@ -0,0 +1,882 @@ +""" +linopy benchmark CLI — one entry point for the suite. + +Run with:: + + python -m benchmarks [options] + +The CLI is a thin layer over pytest for the timing / smoke commands, plus +direct dispatch for registry introspection and memory snapshots. +""" + +from __future__ import annotations + +import os +import subprocess +import sys +import tempfile +from pathlib import Path +from typing import Annotated, Literal + +import typer + +from benchmarks import ( + REGISTRY, + filter_by, + get, +) +from benchmarks.memory import compare as memory_compare +from benchmarks.memory import save as memory_save +from benchmarks.plotting import FacetBy, Metric, PlotView, SortMode +from benchmarks.snapshot import discover_snapshots +from benchmarks.sweep import run_memory_sweep, run_sweep + +app = typer.Typer( + help=( + "Linopy internal benchmark suite — a thin layer over pytest plus " + "registry introspection and memory snapshots." 
@app.command("list")
def list_(
    details: Annotated[
        bool,
        typer.Option("--details", "-d", help="Show features and size range."),
    ] = False,
) -> None:
    """
    List the registered model specs.

    By default emits one name per line — suitable for piping into other
    tools. Pass ``--details`` for a small table that also shows the
    features tags and the size range.
    """
    if not details:
        for name in sorted(REGISTRY):
            typer.echo(name)
        return

    # Sort by name so the table order matches the plain listing above.
    rows = [
        (
            spec.name,
            ",".join(sorted(spec.features)),
            f"{spec.sizes[0]}..{spec.sizes[-1]}",
        )
        for spec in sorted(REGISTRY.values(), key=lambda s: s.name)
    ]
    # Seed each width with its header label: keeps the header aligned when
    # every value is shorter than it, and keeps ``max`` from raising on an
    # empty registry.
    name_w = max([len("name"), *(len(r[0]) for r in rows)])
    feat_w = max([len("features"), *(len(r[1]) for r in rows)])
    # ``secho`` strips colour automatically when stdout isn't a TTY, so
    # piping ``list --details | grep`` still gets plain text.
    typer.secho(
        f"{'name':<{name_w}} {'features':<{feat_w}} sizes",
        dim=True,
    )
    typer.secho("-" * (name_w + feat_w + 20), dim=True)
    for name, feats, sizes in rows:
        typer.secho(f"{name:<{name_w}}", fg=typer.colors.CYAN, nl=False)
        typer.echo(f" {feats:<{feat_w}} {sizes}")
+ """ + if feature is None and phase is None: + typer.secho("pass --feature and/or --phase", fg=typer.colors.RED, err=True) + raise typer.Exit(code=2) + matches = filter_by(has_feature=feature, has_phase=phase) + for spec in matches: + typer.echo(repr(spec)) + + +# --- Execution commands ---------------------------------------------------- + + +def _run_pytest(args: list[str]) -> None: + """Invoke pytest as a subprocess and propagate its exit code.""" + cmd = [sys.executable, "-m", "pytest", *args] + typer.secho(f"$ {' '.join(cmd)}", fg=typer.colors.BRIGHT_BLACK) + result = subprocess.run(cmd, check=False) + if result.returncode != 0: + raise typer.Exit(code=result.returncode) + + +@app.command( + context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, +) +def smoke(ctx: typer.Context) -> None: + """ + Quick smoke run — what CI uses on every PR. + + Equivalent to ``pytest benchmarks/ --quick --benchmark-disable -q``. + Every model builds at one size and every phase fires once, no timings + recorded. Typical wall-clock: ~20s. 
@app.command(
    context_settings={"allow_extra_args": True, "ignore_unknown_options": True},
)
def run(
    ctx: typer.Context,
    long: Annotated[
        bool,
        typer.Option(
            "--long",
            help="Include the slowest sizes (above each spec's long_threshold).",
        ),
    ] = False,
    phase: Annotated[
        PhaseName | None,
        typer.Option(help="Restrict to one phase's test file."),
    ] = None,
    model: Annotated[
        str | None,
        typer.Option(help="Restrict to one model (passed as pytest ``-k``)."),
    ] = None,
    filter_expr: Annotated[
        str | None,
        typer.Option(
            "--filter",
            "-k",
            help="Arbitrary pytest ``-k`` expression (AND-ed with ``--model``).",
        ),
    ] = None,
    json_out: Annotated[
        Path | None,
        typer.Option("--json", help="Save pytest-benchmark JSON to this path."),
    ] = None,
    rounds: Annotated[
        int | None,
        typer.Option(
            "--rounds",
            help=(
                "Force pytest-benchmark to run exactly N rounds per test "
                "(passes ``--benchmark-min-rounds=N --benchmark-max-time=0``). "
                "Default: pytest-benchmark auto-tunes per test (5–40+ rounds "
                "depending on cost). Use a fixed N for uniform measurement "
                "across versions in a sweep."
            ),
        ),
    ] = None,
) -> None:
    """
    Default timing run. Records timings with pytest-benchmark.

    Without ``--long``, sizes above each spec's ``long_threshold`` are
    skipped — keeps the wall-clock around 45s instead of several minutes.
    Add ``--long`` for the full sweep including the heaviest sizes
    (knapsack at 1M, basic at 1600, pypsa_scigrid at >50).

    Any trailing arguments are forwarded to pytest verbatim, e.g.::

        python -m benchmarks run --long -- --tb=short -x

    To skip timing entirely (e.g. just verifying everything runs at a
    bigger size), use ``smoke`` instead, or pass ``--benchmark-disable``
    as a trailing arg.
    """
    # Target: one phase's test file, or the whole suite directory.
    args: list[str] = []
    args.append(_PHASE_TEST_FILE[phase] if phase is not None else "benchmarks/")
    if long:
        args.append("--long")
    args.append("--benchmark-only")
    if json_out is not None:
        args.extend(["--benchmark-json", str(json_out)])
    if rounds is not None:
        args.extend([f"--benchmark-min-rounds={rounds}", "--benchmark-max-time=0"])

    k_parts = [p for p in (model, filter_expr) if p]
    if len(k_parts) > 1:
        # Parenthesise before AND-ing: pytest's ``-k`` gives ``and`` higher
        # precedence than ``or``, so a bare join would turn
        # ``--model basic --filter "a or b"`` into ``(basic and a) or b``.
        k_parts = [f"({p})" for p in k_parts]
    if k_parts:
        args.extend(["-k", " and ".join(k_parts)])

    # Trailing CLI arguments go last so they reach pytest unchanged.
    args.extend(ctx.args)
    _run_pytest(args)
+ cmd = [ + sys.executable, + "-m", + "jupytext", + "--to", + "ipynb", + str(nb), + ] + typer.secho(f"$ {' '.join(cmd)}", fg=typer.colors.BRIGHT_BLACK) + result = subprocess.run(cmd, check=False) + if result.returncode != 0: + raise typer.Exit(code=result.returncode) + ipynb = nb.with_suffix(".ipynb") + typer.secho(f"built: {ipynb} (regenerable from {nb})", fg=typer.colors.GREEN) + typer.echo(f"Open it: jupyter lab {ipynb} # or PyCharm / VSCode / …") + return + + with tempfile.TemporaryDirectory() as tmp: + # Jupytext sets the kernel cwd to the output directory (the + # tempdir here), so forward the repo root via + # ``LINOPY_REPO_ROOT`` for the walkthrough's first cell to find + # ``benchmarks/``. + env = {**os.environ, "LINOPY_REPO_ROOT": str(Path.cwd().resolve())} + cmd = [ + sys.executable, + "-m", + "jupytext", + "--to", + "notebook", + "--execute", + "--output", + str(Path(tmp) / "executed.ipynb"), + str(nb), + ] + typer.secho(f"$ {' '.join(cmd)}", fg=typer.colors.BRIGHT_BLACK) + result = subprocess.run(cmd, env=env, check=False) + if result.returncode != 0: + raise typer.Exit(code=result.returncode) + + +# --- Sweep across linopy versions ------------------------------------------ + + +@app.command( + context_settings={"allow_extra_args": True, "ignore_unknown_options": True}, +) +def sweep( + ctx: typer.Context, + versions: Annotated[ + list[str], + typer.Argument(help="linopy versions, e.g. 
0.4.0 0.5.0 (or any pip spec)."), + ], + output_dir: Annotated[ + Path, + typer.Option("--output-dir", "-o", help="Where to save snapshot JSONs."), + ] = Path(".benchmarks/sweep"), + long: Annotated[ + bool, typer.Option("--long", help="Include the slowest sizes.") + ] = False, + quick: Annotated[ + bool, + typer.Option("--quick", help="Use only the smallest sizes (faster sweep)."), + ] = False, + phase: Annotated[ + PhaseName | None, + typer.Option(help="Restrict each version's run to one phase's test file."), + ] = None, + model: Annotated[ + str | None, + typer.Option(help="Restrict to one model (passed as pytest ``-k``)."), + ] = None, + filter_expr: Annotated[ + str | None, + typer.Option( + "--filter", + "-k", + help="Arbitrary pytest ``-k`` expression (AND-ed with ``--model``).", + ), + ] = None, + rounds: Annotated[ + int | None, + typer.Option( + "--rounds", + help=( + "Force pytest-benchmark to run exactly N rounds per test in " + "every version (uniform measurement across the sweep). " + "Default: pytest-benchmark auto-tunes per test." + ), + ), + ] = None, + smoke: Annotated[ + bool, + typer.Option( + "--smoke", + help=( + "Run the smoke suite in each version's venv instead of the " + "full timing run. Same pytest invocation as the top-level " + "``smoke`` command — every model/phase fires once at the " + "quickest size, no timings, ~20 s per version. Useful before " + "bumping a perf-sensitive pin to check the combination is " + "viable across every linopy version you'd sweep against." + ), + ), + ] = False, + as_of: Annotated[ + str | None, + typer.Option( + "--as-of", + help=( + "Freeze every dep's resolution to releases on or before this " + "date (``YYYY-MM-DD`` or ISO 8601). Passes ``--exclude-newer`` " + "to uv. Use a consistent value across invocations for " + "cross-time-reproducible sweeps — direct pins alone keep " + "results stable within one call but transitives can drift " + "between calls." 
+ ), + ), + ] = None, +) -> None: + """ + Run the benchmark suite against several linopy versions. + + Uses ``uv`` to build a fresh venv per version (near-instant) and to + install the benchmark infra + target linopy in a single resolution + pass. The pytest-benchmark JSON snapshot lands in + ``/linopy-.json``. + + Versions are accepted in two forms: + + - Plain releases: ``0.4.0``, ``0.5.0a1`` — expanded to ``linopy==X``. + - Pip specs verbatim: ``git+https://github.com/PyPSA/linopy.git@`` + or ``linopy @ file:///path/to/checkout``. + + The current (repo-tip) benchmark code runs against each linopy + version, so the measurement layer is constant. ``_API_AVAILABLE`` + gates in the ``sos`` / ``piecewise`` specs let older linopy versions + skip those phases gracefully. + + Filter knobs (``--phase``, ``--model``, ``--filter``) mirror ``run`` + and apply to every version's pytest invocation. Trailing arguments + after ``--`` are forwarded to pytest verbatim: + + python -m benchmarks sweep 0.6.7 --phase build --model basic + python -m benchmarks sweep 0.6.7 -- --tb=short -x + + Wall-clock: roughly 1-2 minutes per version (venv + install + + benchmarks). uv's wheel cache makes repeated runs much faster. 
+ """ + test_target = _PHASE_TEST_FILE[phase] if phase is not None else "benchmarks/" + run_sweep( + versions, + output_dir=output_dir, + test_target=test_target, + smoke_args=_SMOKE_PYTEST_ARGS, + long=long, + quick=quick, + rounds=rounds, + model=model, + filter_expr=filter_expr, + smoke=smoke, + as_of=as_of, + extra_args=ctx.args, + ) + + +# --- Compare timing snapshots --------------------------------------------- + + +def _suggest_snapshots(reason: str) -> None: + """Print an error + a hint listing whatever snapshots we can find.""" + typer.secho(reason, fg=typer.colors.RED, err=True) + found = discover_snapshots() + if found: + typer.echo("\nAvailable snapshots under .benchmarks/:", err=True) + for p in found: + typer.echo(f" {p}", err=True) + else: + typer.echo( + "\nNo snapshots found under .benchmarks/. Generate one with:\n" + " python -m benchmarks run --json .benchmarks/