Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
68 commits
Select commit Hold shift + click to select a range
8c908af
docs: update benchmark readme
FBumann May 27, 2026
413f1c6
benchmarks: reusable model registry, new model types, new phases, CI …
FBumann May 28, 2026
a6cc83b
benchmarks: add --long flag, gate super-long sizes by default
FBumann May 28, 2026
300abb5
benchmarks: make --quick truly quick (35s → 18s)
FBumann May 28, 2026
c725c68
benchmarks: add registry-usage notebook + execute in CI
FBumann May 28, 2026
99483f8
benchmarks: switch walkthrough to .ipynb, add reprs to ModelSpec
FBumann May 28, 2026
751aa78
benchmarks: typer-based CLI as the single entry point
FBumann May 28, 2026
8b124e2
benchmarks: pin typer==0.26.2, use ctx.args for pytest pass-through
FBumann May 28, 2026
86fd036
benchmarks: pin test infra + add transitive lockfile
FBumann May 28, 2026
9be18e1
benchmarks: add ``sweep`` subcommand for cross-version perf runs
FBumann May 28, 2026
51f418d
benchmarks: collapse README to a pointer, kill duplication
FBumann May 28, 2026
c0f3fee
benchmarks: make pypsa optional, expand notebook into proper guide
FBumann May 28, 2026
0522a75
benchmarks: sweep gains --phase / --model / --filter + pytest pass-th…
FBumann May 28, 2026
7bb464e
benchmarks: add ``compare`` subcommand wrapping pytest-benchmark compare
FBumann May 28, 2026
83bdeda
benchmarks: compare lists snapshots as relative paths (easier to copy…
FBumann May 28, 2026
8e378b5
benchmarks: tighter defaults for ``compare`` (median/iqr, sorted by n…
FBumann May 28, 2026
f67721b
benchmarks: compare gains ``--group-by=fullname`` default + ctx.args …
FBumann May 28, 2026
3ac333b
benchmarks: revert compare to manual arg-split + acknowledge typer wart
FBumann May 28, 2026
919e061
benchmarks: add ``plot`` subcommand (compare / sweep / scaling views)
FBumann May 28, 2026
c921b78
benchmarks: move plotting to benchmarks/plotting.py + text_auto + hov…
FBumann May 28, 2026
4c6f328
benchmarks: switch primary metric to ``min``, allow ``--metric`` over…
FBumann May 28, 2026
d703cb1
benchmarks: plot compare sorts/bars by absolute time delta by default
FBumann May 28, 2026
69693c0
benchmarks: add ``scatter`` plot view for two-snapshot exploration
FBumann May 28, 2026
2f08aa6
benchmarks: scatter view handles N snapshots via plotly animation_frame
FBumann May 28, 2026
321d2d9
benchmarks: scatter — include baseline as frame 0, clip colour to p95…
FBumann May 28, 2026
a0d4b7a
benchmarks: scatter — center y-axis symmetrically around 1.0
FBumann May 28, 2026
45700e7
benchmarks: address review — row height, scaling-from-params, mismatc…
FBumann May 28, 2026
ad7aa53
benchmarks: plot returns Figure, default output → .benchmarks/plots/<…
FBumann May 28, 2026
7c7bab2
benchmarks: plot renderers return (Figure, n_tests) — drop trace intr…
FBumann May 28, 2026
6a8a16d
benchmarks: notebook plot demo uses the CLI + tqdm progress
FBumann May 28, 2026
09dad9d
benchmarks: notebook plot demo accepts the full CLI command string
FBumann May 28, 2026
2ece2c1
benchmarks: memory tracks all phases via memray.Tracker; README accur…
FBumann May 28, 2026
ea4bc76
benchmarks: plot subcommand auto-detects memory snapshots alongside t…
FBumann May 28, 2026
cccd476
benchmarks: compare view drops unchanged tests (esp. memory)
FBumann May 28, 2026
d34824a
benchmarks: fix compare y-axis collision; revert unchanged-row filter
FBumann May 28, 2026
d88f235
benchmarks: compare view renders value text outside bars
FBumann May 28, 2026
abb3f14
benchmarks: compare bars keep alphabetical test-id order
FBumann May 28, 2026
914efbf
benchmarks: plot gains ``--facets {phase,model}`` for compare + scatter
FBumann May 28, 2026
eb687f1
benchmarks: faceted compare/scatter share one x + y axis label
FBumann May 28, 2026
5a08e79
benchmarks: notebook showcases ``--facets phase`` after compare/scatter
FBumann May 28, 2026
e24451a
benchmarks: faceted compare — per-facet rows, shared y-tick labels pe…
FBumann May 28, 2026
f4917dd
benchmarks: scatter as default compare view + expose load_long_df
FBumann May 28, 2026
2993b95
benchmarks: memory sweep + --rounds/--repeats overrides + centralized…
FBumann May 28, 2026
ac1df53
benchmarks: CodSpeed CI + Dependabot perf attribution loop
FBumann May 28, 2026
0e6ec41
benchmarks: drop lockfile, relocate walkthrough, Jupytext --build flow
FBumann May 28, 2026
3981cad
benchmarks: add CLI walkthrough as Jupytext MyST notebook
FBumann May 28, 2026
59eadb3
benchmarks: bump pinned jupytext to 1.19.3 (matches installed)
FBumann May 28, 2026
cbf517a
benchmarks: sweep --smoke for cross-version sanity checks
FBumann May 28, 2026
4ba6fb4
benchmarks: small cleanups (dead __iter__, naming, stale comments)
FBumann May 28, 2026
d86b111
benchmarks: delete unused SOLVER_BUILD phase + collapse models re-exp…
FBumann May 28, 2026
b153239
benchmarks: share phase verbs via benchmarks/phases.py + guard the seam
FBumann May 28, 2026
754e0ec
benchmarks: extract _provision_venvs helper to dedupe sweep plumbing
FBumann May 28, 2026
7d3e474
benchmarks: bump pinned numpy 1.26.4 → 2.4.6
FBumann May 29, 2026
2621a7b
benchmarks: relax numpy pin to <2.0 for wider sweep coverage
FBumann May 29, 2026
2656178
benchmarks: pin numpy back to ==1.26.4 (last 1.x)
FBumann May 29, 2026
e7f9c5b
benchmarks: fix sweep silently measuring dev linopy + getattr SOLVER_…
FBumann May 29, 2026
b35fafe
benchmarks: pin xarray to 2025.1.2 to extend sweep coverage to 0.4.4
FBumann May 29, 2026
11f56d2
benchmarks: shim write_lp for linopy <0.4.1, extending sweep floor to…
FBumann May 29, 2026
3091c64
benchmarks: add --as-of <DATE> for cross-time-reproducible sweeps
FBumann May 29, 2026
e74ae1e
benchmarks: harden the sweep isolation seam (preflight + no bytecode)
FBumann May 29, 2026
55612f5
benchmarks: copy harness into sweep venvs instead of symlinking
FBumann May 29, 2026
c031153
benchmarks: add ad-hoc `bench` helper for arbitrary callables
FBumann May 29, 2026
3df647c
benchmarks: make the suite mypy-clean
FBumann May 29, 2026
c5f23ec
benchmarks: extract snapshot.py + calibrate bench.time
FBumann May 29, 2026
2839145
benchmarks: split sweep orchestration out of cli.py
FBumann May 29, 2026
4502fed
benchmarks: drop the "Other CLI surfaces" table from the walkthrough
FBumann May 29, 2026
99f4f56
benchmarks: show load_long_df from-file diff in the walkthrough
FBumann May 29, 2026
927750f
benchmarks: label sweep snapshots by ref/sha for git/file specs
FBumann May 29, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 27 additions & 0 deletions .github/dependabot.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,30 @@ updates:
github-actions:
patterns:
- '*'

# Pinned ``[benchmarks]`` extra in pyproject.toml. One PR per dep bump
# → CodSpeed CI runs and attributes any perf delta to that specific
# bump. The cross-version ``sweep`` installs the same pinned extra into
# each per-version venv, so its baseline only moves when one of these
# PRs is merged, while upstream perf changes still surface per-PR with
# eyes-open review. Loose ``[project.dependencies]`` (numpy, scipy, ...)
# have no version specifier so Dependabot leaves them alone — only the
# ``==`` pins in ``[benchmarks]`` produce PRs.
- package-ecosystem: pip
directory: /
schedule:
interval: monthly
open-pull-requests-limit: 5
groups:
# Measurement scaffolding + CLI/notebook tooling. Perf-irrelevant —
# they don't move CodSpeed signal, so batching into one PR cuts
# review noise. Perf-relevant deps (numpy, xarray, highspy, …) stay
# un-grouped so each gets its own attributed CodSpeed delta.
benchmark-tooling:
patterns:
- pytest
- pytest-benchmark
- pytest-memray
- pytest-codspeed
- nbconvert
- typer
- plotly
81 changes: 81 additions & 0 deletions .github/workflows/benchmark-smoke.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
name: Benchmark smoke

# Runs the internal benchmark suite under --quick --benchmark-disable so every
# model spec is built and every phase fires at least once, but no timings are
# recorded. The goal is "did refactor X break a model spec?" — not regression
# tracking, which is done out-of-CI on dedicated hardware.

on:
push:
branches: [ master ]
pull_request:
branches: [ '*' ]

concurrency:
group: ${{ github.workflow }}-${{ github.ref }}
cancel-in-progress: true

jobs:
smoke:
name: Benchmark smoke (quick)
runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v6
with:
fetch-depth: 0 # setuptools_scm

- name: Set up Python 3.12
uses: actions/setup-python@v6
with:
python-version: "3.12"

- name: Install package and benchmark dependencies
run: |
python -m pip install uv
# [dev] for pytest + netcdf4; [benchmarks] for pytest-benchmark and the
# other pinned benchmark deps. pypsa is optional and installed separately.
uv pip install --system -e ".[dev,benchmarks]"

- name: Run benchmark smoke
run: |
python -m benchmarks smoke

- name: Execute walkthrough notebook
# Catches doc rot — walkthrough.md is the canonical CLI walkthrough
# for the suite and must stay runnable end-to-end.
run: |
python -m benchmarks notebook

codspeed:
name: CodSpeed (micro regression detection)
runs-on: ubuntu-latest
# Cachegrind is ~10–20× slower than native, so we restrict to ``--quick``
# (smallest size per spec) and skip PyPSA end-to-end. The signal we want
# here is "did this PR change the instruction count of the hot paths?";
# full wall-clock cross-version comparison stays in ``sweep``.
steps:
- uses: actions/checkout@v6
with:
fetch-depth: 0 # setuptools_scm

- name: Set up Python 3.12
uses: actions/setup-python@v6
with:
python-version: "3.12"

- name: Install pinned benchmark environment
# Install from the pinned ``[benchmarks]`` extra in pyproject.toml
# so Dependabot can auto-detect the pins and propose bumps to
# top-level deps. Each bump → one attributed CodSpeed delta.
# Cross-version ``sweep`` runs install the same pinned extra into
# each per-version venv, so no separate lockfile is needed.
run: |
python -m pip install uv
uv pip install --system -e ".[dev,benchmarks]"

- name: Run benchmarks under CodSpeed
uses: CodSpeedHQ/action@v3
with:
token: ${{ secrets.CODSPEED_TOKEN }}
run: |
pytest benchmarks/ --quick --codspeed
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,12 @@ benchmark/scripts/__pycache__
benchmark/scripts/benchmarks-pypsa-eur/__pycache__
benchmark/scripts/leftovers/

# Benchmarks (internal suite in benchmarks/) — the .md walkthrough is
# canonical; ``python -m benchmarks notebook --build`` regenerates the
# .ipynb sibling as a throwaway viewing/running artifact.
benchmarks/walkthrough.ipynb
benchmarks/.ipynb_checkpoints/

# IDE
.idea/

Expand Down
106 changes: 28 additions & 78 deletions benchmarks/README.md
Original file line number Diff line number Diff line change
@@ -1,94 +1,44 @@
# Internal Performance Benchmarks

Measures linopy's own performance (build time, LP write speed, memory usage) across problem sizes using [pytest-benchmark](https://pytest-benchmark.readthedocs.io/) and [pytest-memray](https://pytest-memray.readthedocs.io/). Use these to check whether a code change introduces a regression or improvement.
End-to-end performance tracking for `linopy` — build → solver handoff
→ netCDF (de)serialization → fixed PyPSA model. Solver algorithm
runtime is out of scope.

> **Note:** The `benchmark/` directory (singular) contains *external* benchmarks comparing linopy against other modeling frameworks. This directory (`benchmarks/`) is for *internal* performance tracking only.
**The walkthrough is load-bearing.** Phase coverage, CLI introspection,
the two-snapshot regression workflow with inline Plotly views, and
how to extend the suite live in [`walkthrough.md`](walkthrough.md).
This README only covers install and how to open the walkthrough.

## Setup
> `benchmark/` (singular) is the legacy external-framework suite.
> `benchmarks/` (plural) is this internal suite.

```bash
pip install -e ".[benchmarks]"
```

## Running benchmarks
## Install

```bash
# Quick smoke test (small sizes only)
pytest benchmarks/ --quick

# Full timing benchmarks
pytest benchmarks/test_build.py benchmarks/test_lp_write.py benchmarks/test_matrices.py

# Run a specific model
pytest benchmarks/test_build.py -k basic
```

## Comparing timing between branches

```bash
# Save baseline results on master
git checkout master
pytest benchmarks/test_build.py --benchmark-save=master

# Switch to feature branch and compare
git checkout my-feature
pytest benchmarks/test_build.py --benchmark-save=my-feature --benchmark-compare=0001_master

# Compare saved results without re-running
pytest-benchmark compare 0001_master 0002_my-feature --columns=median,iqr
uv sync --extra dev --extra benchmarks
source .venv/bin/activate
```

Results are stored in `.benchmarks/` (gitignored).
`pypsa` is optional — `pypsa_scigrid` and
`test_pypsa_carbon_management.py` skip gracefully without it. Install
when you need them: `uv pip install pypsa`.

## Memory benchmarks
The `[benchmarks]` extra in `pyproject.toml` pins every direct dep that
affects measurement (`numpy`, `scipy`, `xarray`, `pandas`, `polars`,
`dask`, etc.). `sweep` installs these into each per-version venv, so
"same deps, only linopy varies" comes for free without a separate
lockfile — bump the pins in pyproject and the next sweep picks them up.

`memory.py` runs each test in a separate process with pytest-memray to get accurate per-test peak memory (including C/numpy allocations). Results are saved as JSON and can be compared across branches.

By default, only the build phase (`test_build.py`) is measured. Unlike timing benchmarks where `benchmark()` isolates the measured function, memray tracks all allocations within a test — including model construction in setup. This means LP write and matrix tests would report build + phase memory combined, making the phase-specific contribution impossible to isolate. Since model construction dominates memory usage, measuring build alone gives the most actionable numbers.
## Open the walkthrough

```bash
# Save baseline on master
git checkout master
python benchmarks/memory.py save master

# Save feature branch
git checkout my-feature
python benchmarks/memory.py save my-feature

# Compare
python benchmarks/memory.py compare master my-feature

# Quick mode (smaller sizes, faster)
python benchmarks/memory.py save master --quick

# Measure a specific phase (includes build overhead)
python benchmarks/memory.py save master --test-path benchmarks/test_lp_write.py
python -m benchmarks notebook --build # (re)generate walkthrough.ipynb
jupyter lab benchmarks/walkthrough.ipynb # ...or PyCharm / VSCode
```

Results are stored in `.benchmarks/memory/` (gitignored). Requires Linux or macOS (memray is not available on Windows).

> **Note:** Small tests (~5 MiB) are near the import-overhead floor and may show noise of ~1 MiB between runs. Focus on larger tests for meaningful memory comparisons. Do not combine `--memray` with timing benchmarks — memray adds ~2x overhead that invalidates timing results.

## Models

| Model | Description | Sizes |
|-------|-------------|-------|
| `basic` | Dense N*N model, 2*N^2 vars/cons | 10 — 1600 |
| `knapsack` | N binary variables, 1 constraint | 100 — 1M |
| `expression_arithmetic` | Broadcasting, scaling, summation across dims | 10 — 1000 |
| `sparse_network` | Ring network with mismatched bus/line coords | 10 — 1000 |
| `pypsa_scigrid` | Real power system (requires `pypsa`) | 10 — 200 snapshots |

## Phases

| Phase | File | What it measures |
|-------|------|------------------|
| Build | `test_build.py` | Model construction (add_variables, add_constraints, add_objective) |
| LP write | `test_lp_write.py` | Writing the model to an LP file |
| Matrices | `test_matrices.py` | Generating sparse matrices (A, b, c, bounds) from the model |

## Adding a new model
The `.md` is the source of truth; the `.ipynb` is a disposable,
gitignored build artifact. Edit the `.md`, re-run `--build`, re-open.
Same workflow in any editor.

1. Create `benchmarks/models/my_model.py` with a `build_my_model(n)` function and a `SIZES` list
2. Add parametrized tests in the relevant `test_*.py` files
3. Add a quick threshold in `conftest.py`
CI executes the walkthrough end-to-end on every PR
(`python -m benchmarks notebook`) so the examples can't silently rot.
115 changes: 114 additions & 1 deletion benchmarks/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,114 @@
"""Linopy benchmark suite — run with ``pytest benchmarks/`` (use ``--quick`` for smaller sizes)."""
"""
Linopy benchmark suite.

Run with ``pytest benchmarks/`` (use ``--quick`` for smaller sizes).

This package also exposes a **reusable model registry** for any test, profiling
session, or example that wants ready-made linopy models of varying sizes and
features. Each entry exposes a ``build(size) -> linopy.Model`` callable plus
metadata::

from benchmarks import REGISTRY, QUADRATIC

# Look up by name
model = REGISTRY["basic"].build(100)

# Iterate / filter
for spec in REGISTRY.values():
m = spec.build(spec.sizes[0])
...

from benchmarks import filter_by
qp_specs = filter_by(has_feature=QUADRATIC)
"""

from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
from pathlib import Path

import pandas as pd

from benchmarks.snapshot import Metric

# Importing the models package triggers each module's ``register(...)`` call.
from benchmarks import bench, models # noqa: F401, E402


def load_long_df(
    snapshots: list[Path], metric: Metric = "min"
) -> tuple[pd.DataFrame, str]:
    """
    Read one or more benchmark JSON snapshots into a tidy DataFrame.

    This is a package-level forwarder to
    :func:`benchmarks.snapshot.load_long_df`, offered so that callers can
    run their own analysis without importing the plotting module (which
    pulls in plotly).

    Both arguments are passed through unchanged. The result is the pair
    ``(df, unit)``: ``df`` holds one row per ``(snapshot, test_id)`` with
    the columns ``snapshot, test_id, phase, model, size, value``, and
    ``unit`` is ``"s"`` for timing snapshots or ``"MiB"`` for memory
    snapshots.
    """
    # Imported inside the function, so the lookup happens at call time.
    from benchmarks.snapshot import load_long_df as _load_snapshots

    loaded = _load_snapshots(snapshots, metric)
    return loaded


from benchmarks.registry import ( # noqa: F401, E402 — re-export
ALL_FEATURES,
ALL_PHASES,
BINARY,
BUILD,
CONTINUOUS,
DEFAULT_PHASES,
INTEGER,
LP_WRITE,
MASKED,
MATRICES,
NETCDF,
PIECEWISE,
QUADRATIC,
REGISTRY,
SOS,
TO_GUROBIPY,
TO_HIGHSPY,
TO_MOSEK,
TO_XPRESS,
ModelSpec,
filter_by,
get,
iter_params,
param_ids,
register,
)

# Public API of the ``benchmarks`` package: the names re-exported from
# ``benchmarks.registry`` plus ``bench`` and ``load_long_df``.
# Entries are kept in ASCII sort order (uppercase names before lowercase).
__all__ = [
"ALL_FEATURES",
"ALL_PHASES",
"BINARY",
"BUILD",
"CONTINUOUS",
"DEFAULT_PHASES",
"INTEGER",
"LP_WRITE",
"MASKED",
"MATRICES",
"ModelSpec",
"NETCDF",
"PIECEWISE",
"QUADRATIC",
"REGISTRY",
"SOS",
"TO_GUROBIPY",
"TO_HIGHSPY",
"TO_MOSEK",
"TO_XPRESS",
"bench",
"filter_by",
"get",
"iter_params",
"load_long_df",
"param_ids",
"register",
]
5 changes: 5 additions & 0 deletions benchmarks/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""Allow ``python -m benchmarks <command>``."""

# ``app`` is the CLI application object defined in ``benchmarks.cli``.
from benchmarks.cli import app

# No ``if __name__ == "__main__"`` guard: Python executes this module when
# the package is run with ``python -m benchmarks``, so the CLI starts directly.
app()
Loading
Loading