From e61437a1ff5b7d7daf10eff5c300461a618147df Mon Sep 17 00:00:00 2001 From: maskedsyntax Date: Fri, 10 Apr 2026 00:49:44 +0530 Subject: [PATCH 1/2] add benchmarking module for inference latency measurement Adds cx.benchmark() and cx.compare_benchmarks() for measuring real inference latency with warmup, percentiles, and throughput. Includes a 'comprexx bench' CLI command and README/CHANGELOG entries. --- CHANGELOG.md | 5 + README.md | 26 ++++ comprexx/__init__.py | 10 ++ comprexx/benchmark/__init__.py | 10 ++ comprexx/benchmark/runner.py | 250 +++++++++++++++++++++++++++++++++ comprexx/cli/main.py | 25 ++++ tests/unit/test_benchmark.py | 93 ++++++++++++ 7 files changed, 419 insertions(+) create mode 100644 comprexx/benchmark/runner.py create mode 100644 tests/unit/test_benchmark.py diff --git a/CHANGELOG.md b/CHANGELOG.md index b29625d..9709820 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- **Benchmarking module** (`comprexx.benchmark`): `cx.benchmark()` measures real + inference latency with configurable warmup/iters, reporting mean, median, std, + p50/p90/p99, min/max, and throughput. `cx.compare_benchmarks()` returns a + before/after comparison with speedup and latency/throughput deltas. Quantized + models are automatically run on CPU. New `comprexx bench` CLI command. - GitHub Actions CI workflow running `pytest` on Python 3.10, 3.11, 3.12 plus a `ruff check` lint job. - `CHANGELOG.md` with history for v0.1.0 and v0.2.0. diff --git a/README.md b/README.md index 02df0c4..5bac748 100644 --- a/README.md +++ b/README.md @@ -121,6 +121,29 @@ pipeline = cx.Pipeline([ The `perturbation` can be `"prune"` (zero the smallest weights) or `"noise"` (add Gaussian noise scaled by weight std). Each layer is snapshotted and restored in place, so no deep copies of the model are made. 
+### Benchmark inference latency
+
+Param counts and FLOPs tell you how small a model got. They don't tell you how fast it runs. `cx.benchmark` measures real latency:
+
+```python
+result = cx.benchmark(model, input_shape=(1, 3, 224, 224), warmup=10, iters=50)
+print(result.summary())
+```
+
+You get mean, median, std, p50/p90/p99, min/max, and throughput in inferences per second. To see what compression actually bought you, run `compare_benchmarks` on the baseline and compressed models:
+
+```python
+cmp = cx.compare_benchmarks(
+    baseline_model, compressed_model,
+    input_shape=(1, 3, 224, 224),
+    iters=50,
+)
+print(cmp.summary())
+print(f"{cmp.speedup:.2f}x faster")
+```
+
+Warmup iterations are excluded from measurements so caches and JIT settle first. Quantized models are automatically run on CPU regardless of the `device` argument.
+
 ### Export to ONNX
 
 ```python
@@ -177,6 +200,9 @@ comprexx analyze model.pt --input-shape "1,3,224,224" --json
 comprexx compress model.pt --recipe recipe.yaml --input-shape "1,3,224,224"
 comprexx compress model.pt --recipe recipe.yaml --input-shape "1,3,224,224" --dry-run
 
+# Benchmark
+comprexx bench model.pt --input-shape "1,3,224,224" --iters 50
+
 # Export
 comprexx export model.pt --format onnx --input-shape "1,3,224,224"
 ```
diff --git a/comprexx/__init__.py b/comprexx/__init__.py
index fa15f19..aaf8b68 100644
--- a/comprexx/__init__.py
+++ b/comprexx/__init__.py
@@ -9,6 +9,12 @@
     SensitivityReport,
     analyze_sensitivity,
 )
+from comprexx.benchmark.runner import (
+    BenchmarkComparison,
+    BenchmarkResult,
+    benchmark,
+)
+from comprexx.benchmark.runner import compare as compare_benchmarks
 from comprexx.core.exceptions import (
     AccuracyGuardTriggered,
     CalibrationError,
@@ -27,6 +33,8 @@
 __all__ = [
     "AccuracyGuard",
     "AccuracyGuardTriggered",
+    "BenchmarkComparison",
+    "BenchmarkResult",
     "CalibrationError",
     "CompressionReport",
     "ComprexxError",
@@ -43,6 +51,8 @@
     "UnsupportedLayerError",
     "analyze",
     "analyze_sensitivity",
+    "benchmark",
+    
"compare_benchmarks", "load_recipe", "stages", ] diff --git a/comprexx/benchmark/__init__.py b/comprexx/benchmark/__init__.py index e69de29..b736ea3 100644 --- a/comprexx/benchmark/__init__.py +++ b/comprexx/benchmark/__init__.py @@ -0,0 +1,10 @@ +"""Inference benchmarking.""" + +from comprexx.benchmark.runner import ( + BenchmarkComparison, + BenchmarkResult, + benchmark, + compare, +) + +__all__ = ["BenchmarkComparison", "BenchmarkResult", "benchmark", "compare"] diff --git a/comprexx/benchmark/runner.py b/comprexx/benchmark/runner.py new file mode 100644 index 0000000..f658a9f --- /dev/null +++ b/comprexx/benchmark/runner.py @@ -0,0 +1,250 @@ +"""Latency/throughput benchmarking for PyTorch models. + +Measures real inference performance so compression reports can show actual +speedups, not just parameter-count reductions. +""" + +from __future__ import annotations + +import json +import statistics +import time +from dataclasses import asdict, dataclass, field +from typing import Callable, Optional + +import torch +import torch.nn as nn + + +@dataclass +class BenchmarkResult: + """Latency statistics from a benchmark run.""" + + device: str + dtype: str + batch_size: int + warmup: int + iters: int + mean_ms: float + median_ms: float + std_ms: float + min_ms: float + max_ms: float + p50_ms: float + p90_ms: float + p99_ms: float + throughput_ips: float + samples_ms: list[float] = field(default_factory=list) + + def to_dict(self) -> dict: + return asdict(self) + + def to_json(self) -> str: + return json.dumps(self.to_dict(), indent=2) + + def summary(self) -> str: + return ( + f"Benchmark ({self.device}, batch={self.batch_size}, iters={self.iters})\n" + f" Mean: {self.mean_ms:.3f} ms\n" + f" Median: {self.median_ms:.3f} ms\n" + f" Std: {self.std_ms:.3f} ms\n" + f" p50/p90/p99:{self.p50_ms:.3f} / {self.p90_ms:.3f} / {self.p99_ms:.3f} ms\n" + f" Min/Max: {self.min_ms:.3f} / {self.max_ms:.3f} ms\n" + f" Throughput: {self.throughput_ips:.1f} inferences/sec" + ) + + 
+@dataclass +class BenchmarkComparison: + """Before/after benchmark comparison.""" + + baseline: BenchmarkResult + compressed: BenchmarkResult + + @property + def speedup(self) -> float: + if self.compressed.mean_ms == 0: + return float("inf") + return self.baseline.mean_ms / self.compressed.mean_ms + + @property + def latency_reduction_pct(self) -> float: + if self.baseline.mean_ms == 0: + return 0.0 + return (1 - self.compressed.mean_ms / self.baseline.mean_ms) * 100 + + @property + def throughput_gain_pct(self) -> float: + if self.baseline.throughput_ips == 0: + return 0.0 + return (self.compressed.throughput_ips / self.baseline.throughput_ips - 1) * 100 + + def to_dict(self) -> dict: + return { + "baseline": self.baseline.to_dict(), + "compressed": self.compressed.to_dict(), + "speedup": self.speedup, + "latency_reduction_pct": self.latency_reduction_pct, + "throughput_gain_pct": self.throughput_gain_pct, + } + + def to_json(self) -> str: + return json.dumps(self.to_dict(), indent=2) + + def summary(self) -> str: + return ( + f"Benchmark Comparison ({self.baseline.device})\n" + f" Baseline: {self.baseline.mean_ms:.3f} ms " + f"({self.baseline.throughput_ips:.1f} ips)\n" + f" Compressed: {self.compressed.mean_ms:.3f} ms " + f"({self.compressed.throughput_ips:.1f} ips)\n" + f" Speedup: {self.speedup:.2f}x " + f"({self.latency_reduction_pct:+.1f}% latency, " + f"{self.throughput_gain_pct:+.1f}% throughput)" + ) + + +def _make_input( + input_shape: tuple[int, ...] 
| list[tuple[int, ...]], + device: torch.device, + dtype: torch.dtype, +) -> torch.Tensor | tuple[torch.Tensor, ...]: + if isinstance(input_shape, list): + return tuple(torch.randn(*s, device=device, dtype=dtype) for s in input_shape) + return torch.randn(*input_shape, device=device, dtype=dtype) + + +def _sync(device: torch.device) -> None: + if device.type == "cuda": + torch.cuda.synchronize() + + +def _percentile(values: list[float], pct: float) -> float: + if not values: + return 0.0 + s = sorted(values) + k = (len(s) - 1) * (pct / 100.0) + lo = int(k) + hi = min(lo + 1, len(s) - 1) + frac = k - lo + return s[lo] * (1 - frac) + s[hi] * frac + + +def benchmark( + model: nn.Module, + input_shape: tuple[int, ...] | list[tuple[int, ...]], + device: str = "cpu", + dtype: torch.dtype = torch.float32, + warmup: int = 10, + iters: int = 50, + input_fn: Optional[Callable[[], torch.Tensor | tuple[torch.Tensor, ...]]] = None, +) -> BenchmarkResult: + """Benchmark a model's inference latency. + + Args: + model: Model to benchmark (set to eval mode internally). + input_shape: Single tensor shape, or list of shapes for multi-input models. + device: "cpu" or "cuda". + dtype: Input tensor dtype. Quantized models ignore this. + warmup: Warmup iterations (not measured) to stabilize caches/JIT. + iters: Measured iterations. + input_fn: Optional callable returning a fresh input per call. Overrides + `input_shape` when provided. 
+ """ + if iters <= 0: + raise ValueError("iters must be positive") + + dev = torch.device(device) + model = model.eval() + + # Quantized models must run on CPU + is_quantized = any( + "quantized" in type(m).__module__ for m in model.modules() + ) + if is_quantized and dev.type != "cpu": + dev = torch.device("cpu") + + try: + model = model.to(dev) + except (RuntimeError, NotImplementedError): + # Some quantized modules refuse .to() transfers; fall through + pass + + def _gen_input(): + if input_fn is not None: + x = input_fn() + else: + x = _make_input(input_shape, dev, dtype) + return x + + sample = _gen_input() + batch_size = ( + sample[0].shape[0] if isinstance(sample, tuple) else sample.shape[0] + ) + + with torch.inference_mode(): + # Warmup + for _ in range(warmup): + x = _gen_input() + if isinstance(x, tuple): + model(*x) + else: + model(x) + _sync(dev) + + # Measure + samples_ms: list[float] = [] + for _ in range(iters): + x = _gen_input() + _sync(dev) + t0 = time.perf_counter() + if isinstance(x, tuple): + model(*x) + else: + model(x) + _sync(dev) + samples_ms.append((time.perf_counter() - t0) * 1000.0) + + mean_ms = statistics.fmean(samples_ms) + median_ms = statistics.median(samples_ms) + std_ms = statistics.pstdev(samples_ms) if len(samples_ms) > 1 else 0.0 + throughput = (batch_size * 1000.0 / mean_ms) if mean_ms > 0 else 0.0 + + return BenchmarkResult( + device=str(dev), + dtype=str(dtype).replace("torch.", ""), + batch_size=batch_size, + warmup=warmup, + iters=iters, + mean_ms=mean_ms, + median_ms=median_ms, + std_ms=std_ms, + min_ms=min(samples_ms), + max_ms=max(samples_ms), + p50_ms=_percentile(samples_ms, 50), + p90_ms=_percentile(samples_ms, 90), + p99_ms=_percentile(samples_ms, 99), + throughput_ips=throughput, + samples_ms=samples_ms, + ) + + +def compare( + baseline_model: nn.Module, + compressed_model: nn.Module, + input_shape: tuple[int, ...] 
| list[tuple[int, ...]], + device: str = "cpu", + dtype: torch.dtype = torch.float32, + warmup: int = 10, + iters: int = 50, +) -> BenchmarkComparison: + """Benchmark baseline and compressed models and return a comparison.""" + base = benchmark( + baseline_model, input_shape, device=device, dtype=dtype, + warmup=warmup, iters=iters, + ) + comp = benchmark( + compressed_model, input_shape, device=device, dtype=dtype, + warmup=warmup, iters=iters, + ) + return BenchmarkComparison(baseline=base, compressed=comp) diff --git a/comprexx/cli/main.py b/comprexx/cli/main.py index 335ecb4..055af9c 100644 --- a/comprexx/cli/main.py +++ b/comprexx/cli/main.py @@ -177,5 +177,30 @@ def export_cmd( raise typer.Exit(1) +@app.command() +def bench( + model_source: str = typer.Argument(..., help="Model path or Python module path"), + input_shape: str = typer.Option(..., "--input-shape", help="Input shape, e.g. '1,3,224,224'"), + device: str = typer.Option("cpu", help="Device (cpu or cuda)"), + warmup: int = typer.Option(10, "--warmup", help="Warmup iterations"), + iters: int = typer.Option(50, "--iters", help="Measured iterations"), + json_output: bool = typer.Option(False, "--json", help="Output as JSON"), +): + """Benchmark a model's inference latency.""" + from comprexx.benchmark.runner import benchmark + + shape = _parse_input_shape(input_shape) + model = _load_model(model_source) + + with console.status("Benchmarking..."): + result = benchmark(model, input_shape=shape, device=device, + warmup=warmup, iters=iters) + + if json_output: + console.print(result.to_json()) + else: + console.print(Panel(result.summary(), title="Comprexx Benchmark")) + + if __name__ == "__main__": app() diff --git a/tests/unit/test_benchmark.py b/tests/unit/test_benchmark.py new file mode 100644 index 0000000..8055eba --- /dev/null +++ b/tests/unit/test_benchmark.py @@ -0,0 +1,93 @@ +"""Tests for the benchmark module.""" + +import pytest +import torch + +import comprexx as cx +from 
comprexx.benchmark.runner import BenchmarkComparison, BenchmarkResult, _percentile +from tests.fixtures.models import tiny_cnn, tiny_transformer + + +class TestBenchmark: + def test_basic_run(self): + model = tiny_cnn() + result = cx.benchmark(model, input_shape=(1, 3, 32, 32), warmup=2, iters=5) + assert isinstance(result, BenchmarkResult) + assert result.iters == 5 + assert result.warmup == 2 + assert result.batch_size == 1 + assert result.mean_ms > 0 + assert result.throughput_ips > 0 + assert len(result.samples_ms) == 5 + + def test_batch_size_from_shape(self): + model = tiny_cnn() + result = cx.benchmark(model, input_shape=(4, 3, 32, 32), warmup=1, iters=3) + assert result.batch_size == 4 + + def test_percentiles_ordered(self): + model = tiny_cnn() + result = cx.benchmark(model, input_shape=(1, 3, 32, 32), warmup=1, iters=10) + assert result.p50_ms <= result.p90_ms <= result.p99_ms + assert result.min_ms <= result.mean_ms <= result.max_ms + + def test_invalid_iters(self): + model = tiny_cnn() + with pytest.raises(ValueError): + cx.benchmark(model, input_shape=(1, 3, 32, 32), iters=0) + + def test_transformer_multi_input_shape(self): + model = tiny_transformer() + # (batch, seq, features) + result = cx.benchmark(model, input_shape=(2, 8, 64), warmup=1, iters=3) + assert result.batch_size == 2 + assert result.mean_ms > 0 + + def test_custom_input_fn(self): + model = tiny_cnn() + + def fn(): + return torch.randn(1, 3, 32, 32) + + result = cx.benchmark(model, input_shape=(1, 3, 32, 32), input_fn=fn, + warmup=1, iters=3) + assert result.mean_ms > 0 + + def test_summary_and_serialization(self): + model = tiny_cnn() + result = cx.benchmark(model, input_shape=(1, 3, 32, 32), warmup=1, iters=3) + s = result.summary() + assert "Mean" in s and "ms" in s + d = result.to_dict() + assert d["iters"] == 3 + j = result.to_json() + assert "mean_ms" in j + + +class TestCompare: + def test_compare_same_model(self): + m = tiny_cnn() + cmp = cx.compare_benchmarks(m, m, 
input_shape=(1, 3, 32, 32), + warmup=1, iters=5) + assert isinstance(cmp, BenchmarkComparison) + # Same model: speedup should be in a sane range + assert 0.2 < cmp.speedup < 5.0 + + def test_compare_summary(self): + m = tiny_cnn() + cmp = cx.compare_benchmarks(m, m, input_shape=(1, 3, 32, 32), + warmup=1, iters=3) + s = cmp.summary() + assert "Baseline" in s and "Compressed" in s + assert "speedup" in cmp.to_dict() + + +class TestPercentile: + def test_basic(self): + vals = [1.0, 2.0, 3.0, 4.0, 5.0] + assert _percentile(vals, 50) == 3.0 + assert _percentile(vals, 0) == 1.0 + assert _percentile(vals, 100) == 5.0 + + def test_empty(self): + assert _percentile([], 50) == 0.0 From 91ac88233ca63e95ca5218ee7b40ae29e05a8e4b Mon Sep 17 00:00:00 2001 From: maskedsyntax Date: Fri, 10 Apr 2026 17:21:26 +0530 Subject: [PATCH 2/2] add example notebooks for ResNet18 edge deploy and BERT-tiny quantization Two Jupyter notebooks demonstrating end-to-end workflows: - ResNet18: profile, fuse, prune, quantize, benchmark, export to ONNX - BERT-tiny: low-rank decomposition + INT4 weight quant with benchmarks --- CHANGELOG.md | 2 + README.md | 7 + examples/bert_tiny_quantize.ipynb | 233 ++++++++++++++++++++++++++++ examples/resnet18_edge_deploy.ipynb | 209 +++++++++++++++++++++++++ 4 files changed, 451 insertions(+) create mode 100644 examples/bert_tiny_quantize.ipynb create mode 100644 examples/resnet18_edge_deploy.ipynb diff --git a/CHANGELOG.md b/CHANGELOG.md index 9709820..b40431a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 p50/p90/p99, min/max, and throughput. `cx.compare_benchmarks()` returns a before/after comparison with speedup and latency/throughput deltas. Quantized models are automatically run on CPU. New `comprexx bench` CLI command. 
+- Example notebooks: ResNet18 edge deployment (prune + quantize + ONNX export) + and BERT-tiny quantization (low-rank decomposition + INT4 weight quant). - GitHub Actions CI workflow running `pytest` on Python 3.10, 3.11, 3.12 plus a `ruff check` lint job. - `CHANGELOG.md` with history for v0.1.0 and v0.2.0. diff --git a/README.md b/README.md index 5bac748..4514ba6 100644 --- a/README.md +++ b/README.md @@ -229,6 +229,13 @@ And for picking what to compress: |------|-------------| | Sensitivity analysis | `cx.analyze_sensitivity()` probes each Conv2d/Linear layer with a prune or noise perturbation, re-runs your `eval_fn`, and ranks layers by metric drop. Can also suggest `exclude_layers` above a chosen threshold. | +## Examples + +Check out the example notebooks in [`examples/`](./examples/): + +- [ResNet18 edge deployment](./examples/resnet18_edge_deploy.ipynb): profile, fuse, prune, quantize, benchmark, and export a ResNet18 to ONNX. +- [BERT-tiny quantization](./examples/bert_tiny_quantize.ipynb): low-rank decomposition + INT4 weight quantization on a small transformer, with latency benchmarks. + ## License Apache 2.0 diff --git a/examples/bert_tiny_quantize.ipynb b/examples/bert_tiny_quantize.ipynb new file mode 100644 index 0000000..9b07d21 --- /dev/null +++ b/examples/bert_tiny_quantize.ipynb @@ -0,0 +1,233 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BERT-tiny Quantization with Comprexx\n", + "\n", + "This notebook shows how to compress a small transformer model:\n", + "\n", + "1. Profile the model\n", + "2. Apply low-rank decomposition to shrink Linear layers\n", + "3. Apply weight-only INT4 quantization\n", + "4. 
Benchmark before/after\n", + "\n", + "We use a minimal 2-layer transformer so this runs in seconds on CPU.\n", + "\n", + "Install: `pip install comprexx`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": "import torch.nn as nn\n\nimport comprexx as cx" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Define a small transformer\n", + "\n", + "2-layer encoder, d_model=128, 4 heads, feedforward dim=512. Small enough for a notebook, large enough to show compression working." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class TinyBERT(nn.Module):\n", + " def __init__(self, vocab_size=1000, d_model=128, nhead=4, num_layers=2, num_classes=4):\n", + " super().__init__()\n", + " self.embedding = nn.Embedding(vocab_size, d_model)\n", + " encoder_layer = nn.TransformerEncoderLayer(\n", + " d_model=d_model, nhead=nhead, dim_feedforward=512, batch_first=True,\n", + " )\n", + " self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)\n", + " self.classifier = nn.Linear(d_model, num_classes)\n", + "\n", + " def forward(self, x):\n", + " x = self.embedding(x)\n", + " x = self.encoder(x)\n", + " # Mean pooling over sequence dim\n", + " x = x.mean(dim=1)\n", + " return self.classifier(x)\n", + "\n", + "model = TinyBERT()\n", + "model.eval()\n", + "print(f\"Model: {sum(p.numel() for p in model.parameters()):,} parameters\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Profile the model\n", + "\n", + "We pass token IDs as input, but the profiler needs a float tensor. We'll profile using the embedding output shape and note that the embedding table is counted in params." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# For profiling, we use a float input that skips the embedding.\n",
+    "# The full model takes integer token IDs, so we profile the encoder+classifier separately.\n",
+    "class EncoderClassifier(nn.Module):\n",
+    "    \"\"\"Wraps encoder + classifier with float input for profiling.\"\"\"\n",
+    "    def __init__(self, encoder, classifier):\n",
+    "        super().__init__()\n",
+    "        self.encoder = encoder\n",
+    "        self.classifier = classifier\n",
+    "\n",
+    "    def forward(self, x):\n",
+    "        x = self.encoder(x)\n",
+    "        return self.classifier(x.mean(dim=1))\n",
+    "\n",
+    "profiling_model = EncoderClassifier(model.encoder, model.classifier)\n",
+    "profile = cx.analyze(profiling_model, input_shape=(1, 32, 128))  # (batch, seq_len, d_model)\n",
+    "print(profile.summary())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Low-rank decomposition\n",
+    "\n",
+    "The feedforward layers inside the transformer encoder are 128x512 and 512x128. SVD can factorize these into pairs of smaller layers. We keep enough singular values to retain 90% of the spectral energy, matching `energy_threshold=0.9` below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from comprexx.stages.base import StageContext\n",
+    "\n",
+    "stage_lr = cx.stages.LowRankDecomposition(mode=\"energy\", energy_threshold=0.9)\n",
+    "ctx = StageContext(input_shape=(1, 32, 128), device=\"cpu\")\n",
+    "\n",
+    "model_lr, report_lr = stage_lr.apply(profiling_model, ctx)\n",
+    "print(report_lr.summary())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4. Weight-only INT4 quantization\n",
+    "\n",
+    "After SVD, we quantize remaining Linear weights to INT4 (group size 64, symmetric). Activations stay in float32."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stage_wq = cx.stages.WeightOnlyQuant(bits=4, group_size=64, symmetric=True)\n", + "\n", + "model_quant, report_wq = stage_wq.apply(model_lr, ctx)\n", + "print(report_wq.summary())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Or use a Pipeline for the same thing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline = cx.Pipeline([\n", + " cx.stages.LowRankDecomposition(mode=\"energy\", energy_threshold=0.9),\n", + " cx.stages.WeightOnlyQuant(bits=4, group_size=64),\n", + "])\n", + "\n", + "result = pipeline.run(profiling_model, input_shape=(1, 32, 128))\n", + "print(result.report.summary())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Benchmark latency" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cmp = cx.compare_benchmarks(\n", + " profiling_model, result.model,\n", + " input_shape=(1, 32, 128),\n", + " warmup=10,\n", + " iters=50,\n", + ")\n", + "print(cmp.summary())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Dynamic PTQ as an alternative\n", + "\n", + "If weight-only quantization isn't giving you enough speedup, dynamic PTQ quantizes both weights and activations to INT8 at runtime. It's a simpler path that works well for inference on CPU." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline_ptq = cx.Pipeline([\n", + " cx.stages.LowRankDecomposition(mode=\"energy\", energy_threshold=0.9),\n", + " cx.stages.PTQDynamic(),\n", + "])\n", + "\n", + "result_ptq = pipeline_ptq.run(profiling_model, input_shape=(1, 32, 128))\n", + "print(result_ptq.report.summary())\n", + "\n", + "cmp_ptq = cx.compare_benchmarks(\n", + " profiling_model, result_ptq.model,\n", + " input_shape=(1, 32, 128),\n", + " warmup=10,\n", + " iters=50,\n", + ")\n", + "print(\"\\n\" + cmp_ptq.summary())" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/examples/resnet18_edge_deploy.ipynb b/examples/resnet18_edge_deploy.ipynb new file mode 100644 index 0000000..75d670d --- /dev/null +++ b/examples/resnet18_edge_deploy.ipynb @@ -0,0 +1,209 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ResNet18 Edge Deployment with Comprexx\n", + "\n", + "This notebook walks through compressing a ResNet18 for edge deployment:\n", + "\n", + "1. Profile the original model\n", + "2. Fuse Conv+BN layers (free compression)\n", + "3. Prune 40% of filters by L1 norm\n", + "4. Quantize to INT8 via dynamic PTQ\n", + "5. Benchmark before/after latency\n", + "6. Export to ONNX\n", + "\n", + "Install: `pip install \"comprexx[onnx]\"`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "import torch\n", + "\n", + "import comprexx as cx" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Load and profile the model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = torch.hub.load(\"pytorch/vision\", \"resnet18\", weights=None)\n", + "model.eval()\n", + "\n", + "input_shape = (1, 3, 224, 224)\n", + "profile = cx.analyze(model, input_shape)\n", + "print(profile.summary())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Build the compression pipeline\n", + "\n", + "Three stages, applied in order:\n", + "- **Operator fusion**: fold BatchNorm into Conv2d. Zero accuracy cost, fewer params.\n", + "- **Structured pruning**: zero out the least important 40% of conv filters globally.\n", + "- **PTQ dynamic**: quantize Linear layers to INT8 at runtime." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline = cx.Pipeline([\n", + " cx.stages.OperatorFusion(),\n", + " cx.stages.StructuredPruning(sparsity=0.4, criteria=\"l1_norm\", scope=\"global\"),\n", + " cx.stages.PTQDynamic(),\n", + "])\n", + "\n", + "result = pipeline.run(model, input_shape=input_shape)\n", + "print(result.report.summary())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Verify the compressed model still works" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "compressed = result.model\n", + "\n", + "with torch.no_grad():\n", + " x = torch.randn(*input_shape)\n", + " out_original = model(x)\n", + " out_compressed = compressed(x)\n", + "\n", + "print(f\"Original output shape: {out_original.shape}\")\n", + "print(f\"Compressed output shape: {out_compressed.shape}\")\n", + "print(f\"Max abs difference: {(out_original - out_compressed).abs().max():.4f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. 
Benchmark inference latency" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "comparison = cx.compare_benchmarks(\n", + " model, compressed,\n", + " input_shape=input_shape,\n", + " warmup=10,\n", + " iters=50,\n", + ")\n", + "print(comparison.summary())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Export to ONNX\n", + "\n", + "The exporter runs `torch.onnx.export`, validates the output against PyTorch, and writes a manifest with compression metadata." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "exporter = cx.ONNXExporter()\n", + "exporter.export(\n", + " result.model,\n", + " input_shape=input_shape,\n", + " output_path=\"resnet18_compressed.onnx\",\n", + ")\n", + "\n", + "onnx_size = os.path.getsize(\"resnet18_compressed.onnx\") / 1e6\n", + "print(f\"ONNX file size: {onnx_size:.1f} MB\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Same pipeline as a YAML recipe\n", + "\n", + "You can define this exact pipeline in YAML and run it from the CLI." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "recipe_yaml = \"\"\"\n", + "name: resnet18-edge\n", + "description: Pruned and quantized ResNet18 for edge deployment\n", + "\n", + "stages:\n", + " - technique: operator_fusion\n", + "\n", + " - technique: structured_pruning\n", + " sparsity: 0.4\n", + " criteria: l1_norm\n", + " scope: global\n", + "\n", + " - technique: ptq_dynamic\n", + " format: int8\n", + "\"\"\"\n", + "\n", + "print(recipe_yaml)\n", + "print(\"# Save as resnet18-edge.yaml, then run:\")\n", + "print(\n", + " \"# comprexx compress torchvision.models.resnet18\"\n", + " \" --recipe resnet18-edge.yaml --input-shape 1,3,224,224\"\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file