From e61437a1ff5b7d7daf10eff5c300461a618147df Mon Sep 17 00:00:00 2001 From: maskedsyntax Date: Fri, 10 Apr 2026 00:49:44 +0530 Subject: [PATCH 1/2] add benchmarking module for inference latency measurement Adds cx.benchmark() and cx.compare_benchmarks() for measuring real inference latency with warmup, percentiles, and throughput. Includes a 'comprexx bench' CLI command and README/CHANGELOG entries. --- CHANGELOG.md | 5 + README.md | 26 ++++ comprexx/__init__.py | 10 ++ comprexx/benchmark/__init__.py | 10 ++ comprexx/benchmark/runner.py | 250 +++++++++++++++++++++++++++++++++ comprexx/cli/main.py | 25 ++++ tests/unit/test_benchmark.py | 93 ++++++++++++ 7 files changed, 419 insertions(+) create mode 100644 comprexx/benchmark/runner.py create mode 100644 tests/unit/test_benchmark.py diff --git a/CHANGELOG.md b/CHANGELOG.md index b29625d..9709820 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Added +- **Benchmarking module** (`comprexx.benchmark`): `cx.benchmark()` measures real + inference latency with configurable warmup/iters, reporting mean, median, std, + p50/p90/p99, min/max, and throughput. `cx.compare_benchmarks()` returns a + before/after comparison with speedup and latency/throughput deltas. Quantized + models are automatically run on CPU. New `comprexx bench` CLI command. - GitHub Actions CI workflow running `pytest` on Python 3.10, 3.11, 3.12 plus a `ruff check` lint job. - `CHANGELOG.md` with history for v0.1.0 and v0.2.0. diff --git a/README.md b/README.md index 02df0c4..5bac748 100644 --- a/README.md +++ b/README.md @@ -121,6 +121,29 @@ pipeline = cx.Pipeline([ The `perturbation` can be `"prune"` (zero the smallest weights) or `"noise"` (add Gaussian noise scaled by weight std). Each layer is snapshotted and restored in place, so no deep copies of the model are made. 
+### Benchmark inference latency
+
+Param counts and FLOPs tell you how small a model got. They don't tell you how fast it runs. `cx.benchmark` measures real latency:
+
+```python
+result = cx.benchmark(model, input_shape=(1, 3, 224, 224), warmup=10, iters=50)
+print(result.summary())
+```
+
+You get mean, median, std, p50/p90/p99, min/max, and throughput in inferences per second. To see what compression actually bought you, run `compare_benchmarks` on the baseline and compressed models:
+
+```python
+cmp = cx.compare_benchmarks(
+    baseline_model, compressed_model,
+    input_shape=(1, 3, 224, 224),
+    iters=50,
+)
+print(cmp.summary())
+print(f"{cmp.speedup:.2f}x faster")
+```
+
+Warmup iterations are excluded from measurements so caches and JIT settle first. Quantized models are automatically run on CPU regardless of the `device` argument.
+
 ### Export to ONNX
 
 ```python
@@ -177,6 +200,9 @@ comprexx analyze model.pt --input-shape "1,3,224,224" --json
 comprexx compress model.pt --recipe recipe.yaml --input-shape "1,3,224,224"
 comprexx compress model.pt --recipe recipe.yaml --input-shape "1,3,224,224" --dry-run
 
+# Benchmark
+comprexx bench model.pt --input-shape "1,3,224,224" --iters 50
+
 # Export
 comprexx export model.pt --format onnx --input-shape "1,3,224,224"
 ```
diff --git a/comprexx/__init__.py b/comprexx/__init__.py
index fa15f19..aaf8b68 100644
--- a/comprexx/__init__.py
+++ b/comprexx/__init__.py
@@ -9,6 +9,12 @@
     SensitivityReport,
     analyze_sensitivity,
 )
+from comprexx.benchmark.runner import (
+    BenchmarkComparison,
+    BenchmarkResult,
+    benchmark,
+)
+from comprexx.benchmark.runner import compare as compare_benchmarks
 from comprexx.core.exceptions import (
     AccuracyGuardTriggered,
     CalibrationError,
@@ -27,6 +33,8 @@
 __all__ = [
     "AccuracyGuard",
     "AccuracyGuardTriggered",
+    "BenchmarkComparison",
+    "BenchmarkResult",
     "CalibrationError",
     "CompressionReport",
     "ComprexxError",
@@ -43,6 +51,8 @@
     "UnsupportedLayerError",
     "analyze",
     "analyze_sensitivity",
+    "benchmark",
+    
"compare_benchmarks", "load_recipe", "stages", ] diff --git a/comprexx/benchmark/__init__.py b/comprexx/benchmark/__init__.py index e69de29..b736ea3 100644 --- a/comprexx/benchmark/__init__.py +++ b/comprexx/benchmark/__init__.py @@ -0,0 +1,10 @@ +"""Inference benchmarking.""" + +from comprexx.benchmark.runner import ( + BenchmarkComparison, + BenchmarkResult, + benchmark, + compare, +) + +__all__ = ["BenchmarkComparison", "BenchmarkResult", "benchmark", "compare"] diff --git a/comprexx/benchmark/runner.py b/comprexx/benchmark/runner.py new file mode 100644 index 0000000..f658a9f --- /dev/null +++ b/comprexx/benchmark/runner.py @@ -0,0 +1,250 @@ +"""Latency/throughput benchmarking for PyTorch models. + +Measures real inference performance so compression reports can show actual +speedups, not just parameter-count reductions. +""" + +from __future__ import annotations + +import json +import statistics +import time +from dataclasses import asdict, dataclass, field +from typing import Callable, Optional + +import torch +import torch.nn as nn + + +@dataclass +class BenchmarkResult: + """Latency statistics from a benchmark run.""" + + device: str + dtype: str + batch_size: int + warmup: int + iters: int + mean_ms: float + median_ms: float + std_ms: float + min_ms: float + max_ms: float + p50_ms: float + p90_ms: float + p99_ms: float + throughput_ips: float + samples_ms: list[float] = field(default_factory=list) + + def to_dict(self) -> dict: + return asdict(self) + + def to_json(self) -> str: + return json.dumps(self.to_dict(), indent=2) + + def summary(self) -> str: + return ( + f"Benchmark ({self.device}, batch={self.batch_size}, iters={self.iters})\n" + f" Mean: {self.mean_ms:.3f} ms\n" + f" Median: {self.median_ms:.3f} ms\n" + f" Std: {self.std_ms:.3f} ms\n" + f" p50/p90/p99:{self.p50_ms:.3f} / {self.p90_ms:.3f} / {self.p99_ms:.3f} ms\n" + f" Min/Max: {self.min_ms:.3f} / {self.max_ms:.3f} ms\n" + f" Throughput: {self.throughput_ips:.1f} inferences/sec" + ) + + 
+@dataclass +class BenchmarkComparison: + """Before/after benchmark comparison.""" + + baseline: BenchmarkResult + compressed: BenchmarkResult + + @property + def speedup(self) -> float: + if self.compressed.mean_ms == 0: + return float("inf") + return self.baseline.mean_ms / self.compressed.mean_ms + + @property + def latency_reduction_pct(self) -> float: + if self.baseline.mean_ms == 0: + return 0.0 + return (1 - self.compressed.mean_ms / self.baseline.mean_ms) * 100 + + @property + def throughput_gain_pct(self) -> float: + if self.baseline.throughput_ips == 0: + return 0.0 + return (self.compressed.throughput_ips / self.baseline.throughput_ips - 1) * 100 + + def to_dict(self) -> dict: + return { + "baseline": self.baseline.to_dict(), + "compressed": self.compressed.to_dict(), + "speedup": self.speedup, + "latency_reduction_pct": self.latency_reduction_pct, + "throughput_gain_pct": self.throughput_gain_pct, + } + + def to_json(self) -> str: + return json.dumps(self.to_dict(), indent=2) + + def summary(self) -> str: + return ( + f"Benchmark Comparison ({self.baseline.device})\n" + f" Baseline: {self.baseline.mean_ms:.3f} ms " + f"({self.baseline.throughput_ips:.1f} ips)\n" + f" Compressed: {self.compressed.mean_ms:.3f} ms " + f"({self.compressed.throughput_ips:.1f} ips)\n" + f" Speedup: {self.speedup:.2f}x " + f"({self.latency_reduction_pct:+.1f}% latency, " + f"{self.throughput_gain_pct:+.1f}% throughput)" + ) + + +def _make_input( + input_shape: tuple[int, ...] 
| list[tuple[int, ...]], + device: torch.device, + dtype: torch.dtype, +) -> torch.Tensor | tuple[torch.Tensor, ...]: + if isinstance(input_shape, list): + return tuple(torch.randn(*s, device=device, dtype=dtype) for s in input_shape) + return torch.randn(*input_shape, device=device, dtype=dtype) + + +def _sync(device: torch.device) -> None: + if device.type == "cuda": + torch.cuda.synchronize() + + +def _percentile(values: list[float], pct: float) -> float: + if not values: + return 0.0 + s = sorted(values) + k = (len(s) - 1) * (pct / 100.0) + lo = int(k) + hi = min(lo + 1, len(s) - 1) + frac = k - lo + return s[lo] * (1 - frac) + s[hi] * frac + + +def benchmark( + model: nn.Module, + input_shape: tuple[int, ...] | list[tuple[int, ...]], + device: str = "cpu", + dtype: torch.dtype = torch.float32, + warmup: int = 10, + iters: int = 50, + input_fn: Optional[Callable[[], torch.Tensor | tuple[torch.Tensor, ...]]] = None, +) -> BenchmarkResult: + """Benchmark a model's inference latency. + + Args: + model: Model to benchmark (set to eval mode internally). + input_shape: Single tensor shape, or list of shapes for multi-input models. + device: "cpu" or "cuda". + dtype: Input tensor dtype. Quantized models ignore this. + warmup: Warmup iterations (not measured) to stabilize caches/JIT. + iters: Measured iterations. + input_fn: Optional callable returning a fresh input per call. Overrides + `input_shape` when provided. 
+ """ + if iters <= 0: + raise ValueError("iters must be positive") + + dev = torch.device(device) + model = model.eval() + + # Quantized models must run on CPU + is_quantized = any( + "quantized" in type(m).__module__ for m in model.modules() + ) + if is_quantized and dev.type != "cpu": + dev = torch.device("cpu") + + try: + model = model.to(dev) + except (RuntimeError, NotImplementedError): + # Some quantized modules refuse .to() transfers; fall through + pass + + def _gen_input(): + if input_fn is not None: + x = input_fn() + else: + x = _make_input(input_shape, dev, dtype) + return x + + sample = _gen_input() + batch_size = ( + sample[0].shape[0] if isinstance(sample, tuple) else sample.shape[0] + ) + + with torch.inference_mode(): + # Warmup + for _ in range(warmup): + x = _gen_input() + if isinstance(x, tuple): + model(*x) + else: + model(x) + _sync(dev) + + # Measure + samples_ms: list[float] = [] + for _ in range(iters): + x = _gen_input() + _sync(dev) + t0 = time.perf_counter() + if isinstance(x, tuple): + model(*x) + else: + model(x) + _sync(dev) + samples_ms.append((time.perf_counter() - t0) * 1000.0) + + mean_ms = statistics.fmean(samples_ms) + median_ms = statistics.median(samples_ms) + std_ms = statistics.pstdev(samples_ms) if len(samples_ms) > 1 else 0.0 + throughput = (batch_size * 1000.0 / mean_ms) if mean_ms > 0 else 0.0 + + return BenchmarkResult( + device=str(dev), + dtype=str(dtype).replace("torch.", ""), + batch_size=batch_size, + warmup=warmup, + iters=iters, + mean_ms=mean_ms, + median_ms=median_ms, + std_ms=std_ms, + min_ms=min(samples_ms), + max_ms=max(samples_ms), + p50_ms=_percentile(samples_ms, 50), + p90_ms=_percentile(samples_ms, 90), + p99_ms=_percentile(samples_ms, 99), + throughput_ips=throughput, + samples_ms=samples_ms, + ) + + +def compare( + baseline_model: nn.Module, + compressed_model: nn.Module, + input_shape: tuple[int, ...] 
| list[tuple[int, ...]], + device: str = "cpu", + dtype: torch.dtype = torch.float32, + warmup: int = 10, + iters: int = 50, +) -> BenchmarkComparison: + """Benchmark baseline and compressed models and return a comparison.""" + base = benchmark( + baseline_model, input_shape, device=device, dtype=dtype, + warmup=warmup, iters=iters, + ) + comp = benchmark( + compressed_model, input_shape, device=device, dtype=dtype, + warmup=warmup, iters=iters, + ) + return BenchmarkComparison(baseline=base, compressed=comp) diff --git a/comprexx/cli/main.py b/comprexx/cli/main.py index 335ecb4..055af9c 100644 --- a/comprexx/cli/main.py +++ b/comprexx/cli/main.py @@ -177,5 +177,30 @@ def export_cmd( raise typer.Exit(1) +@app.command() +def bench( + model_source: str = typer.Argument(..., help="Model path or Python module path"), + input_shape: str = typer.Option(..., "--input-shape", help="Input shape, e.g. '1,3,224,224'"), + device: str = typer.Option("cpu", help="Device (cpu or cuda)"), + warmup: int = typer.Option(10, "--warmup", help="Warmup iterations"), + iters: int = typer.Option(50, "--iters", help="Measured iterations"), + json_output: bool = typer.Option(False, "--json", help="Output as JSON"), +): + """Benchmark a model's inference latency.""" + from comprexx.benchmark.runner import benchmark + + shape = _parse_input_shape(input_shape) + model = _load_model(model_source) + + with console.status("Benchmarking..."): + result = benchmark(model, input_shape=shape, device=device, + warmup=warmup, iters=iters) + + if json_output: + console.print(result.to_json()) + else: + console.print(Panel(result.summary(), title="Comprexx Benchmark")) + + if __name__ == "__main__": app() diff --git a/tests/unit/test_benchmark.py b/tests/unit/test_benchmark.py new file mode 100644 index 0000000..8055eba --- /dev/null +++ b/tests/unit/test_benchmark.py @@ -0,0 +1,93 @@ +"""Tests for the benchmark module.""" + +import pytest +import torch + +import comprexx as cx +from 
comprexx.benchmark.runner import BenchmarkComparison, BenchmarkResult, _percentile +from tests.fixtures.models import tiny_cnn, tiny_transformer + + +class TestBenchmark: + def test_basic_run(self): + model = tiny_cnn() + result = cx.benchmark(model, input_shape=(1, 3, 32, 32), warmup=2, iters=5) + assert isinstance(result, BenchmarkResult) + assert result.iters == 5 + assert result.warmup == 2 + assert result.batch_size == 1 + assert result.mean_ms > 0 + assert result.throughput_ips > 0 + assert len(result.samples_ms) == 5 + + def test_batch_size_from_shape(self): + model = tiny_cnn() + result = cx.benchmark(model, input_shape=(4, 3, 32, 32), warmup=1, iters=3) + assert result.batch_size == 4 + + def test_percentiles_ordered(self): + model = tiny_cnn() + result = cx.benchmark(model, input_shape=(1, 3, 32, 32), warmup=1, iters=10) + assert result.p50_ms <= result.p90_ms <= result.p99_ms + assert result.min_ms <= result.mean_ms <= result.max_ms + + def test_invalid_iters(self): + model = tiny_cnn() + with pytest.raises(ValueError): + cx.benchmark(model, input_shape=(1, 3, 32, 32), iters=0) + + def test_transformer_multi_input_shape(self): + model = tiny_transformer() + # (batch, seq, features) + result = cx.benchmark(model, input_shape=(2, 8, 64), warmup=1, iters=3) + assert result.batch_size == 2 + assert result.mean_ms > 0 + + def test_custom_input_fn(self): + model = tiny_cnn() + + def fn(): + return torch.randn(1, 3, 32, 32) + + result = cx.benchmark(model, input_shape=(1, 3, 32, 32), input_fn=fn, + warmup=1, iters=3) + assert result.mean_ms > 0 + + def test_summary_and_serialization(self): + model = tiny_cnn() + result = cx.benchmark(model, input_shape=(1, 3, 32, 32), warmup=1, iters=3) + s = result.summary() + assert "Mean" in s and "ms" in s + d = result.to_dict() + assert d["iters"] == 3 + j = result.to_json() + assert "mean_ms" in j + + +class TestCompare: + def test_compare_same_model(self): + m = tiny_cnn() + cmp = cx.compare_benchmarks(m, m, 
input_shape=(1, 3, 32, 32), + warmup=1, iters=5) + assert isinstance(cmp, BenchmarkComparison) + # Same model: speedup should be in a sane range + assert 0.2 < cmp.speedup < 5.0 + + def test_compare_summary(self): + m = tiny_cnn() + cmp = cx.compare_benchmarks(m, m, input_shape=(1, 3, 32, 32), + warmup=1, iters=3) + s = cmp.summary() + assert "Baseline" in s and "Compressed" in s + assert "speedup" in cmp.to_dict() + + +class TestPercentile: + def test_basic(self): + vals = [1.0, 2.0, 3.0, 4.0, 5.0] + assert _percentile(vals, 50) == 3.0 + assert _percentile(vals, 0) == 1.0 + assert _percentile(vals, 100) == 5.0 + + def test_empty(self): + assert _percentile([], 50) == 0.0 From 91ac88233ca63e95ca5218ee7b40ae29e05a8e4b Mon Sep 17 00:00:00 2001 From: maskedsyntax Date: Fri, 10 Apr 2026 17:21:26 +0530 Subject: [PATCH 2/2] add example notebooks for ResNet18 edge deploy and BERT-tiny quantization Two Jupyter notebooks demonstrating end-to-end workflows: - ResNet18: profile, fuse, prune, quantize, benchmark, export to ONNX - BERT-tiny: low-rank decomposition + INT4 weight quant with benchmarks --- CHANGELOG.md | 2 + README.md | 7 + examples/bert_tiny_quantize.ipynb | 233 ++++++++++++++++++++++++++++ examples/resnet18_edge_deploy.ipynb | 209 +++++++++++++++++++++++++ 4 files changed, 451 insertions(+) create mode 100644 examples/bert_tiny_quantize.ipynb create mode 100644 examples/resnet18_edge_deploy.ipynb diff --git a/CHANGELOG.md b/CHANGELOG.md index 9709820..b40431a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 p50/p90/p99, min/max, and throughput. `cx.compare_benchmarks()` returns a before/after comparison with speedup and latency/throughput deltas. Quantized models are automatically run on CPU. New `comprexx bench` CLI command. 
+- Example notebooks: ResNet18 edge deployment (prune + quantize + ONNX export) + and BERT-tiny quantization (low-rank decomposition + INT4 weight quant). - GitHub Actions CI workflow running `pytest` on Python 3.10, 3.11, 3.12 plus a `ruff check` lint job. - `CHANGELOG.md` with history for v0.1.0 and v0.2.0. diff --git a/README.md b/README.md index 5bac748..4514ba6 100644 --- a/README.md +++ b/README.md @@ -229,6 +229,13 @@ And for picking what to compress: |------|-------------| | Sensitivity analysis | `cx.analyze_sensitivity()` probes each Conv2d/Linear layer with a prune or noise perturbation, re-runs your `eval_fn`, and ranks layers by metric drop. Can also suggest `exclude_layers` above a chosen threshold. | +## Examples + +Check out the example notebooks in [`examples/`](./examples/): + +- [ResNet18 edge deployment](./examples/resnet18_edge_deploy.ipynb): profile, fuse, prune, quantize, benchmark, and export a ResNet18 to ONNX. +- [BERT-tiny quantization](./examples/bert_tiny_quantize.ipynb): low-rank decomposition + INT4 weight quantization on a small transformer, with latency benchmarks. + ## License Apache 2.0 diff --git a/examples/bert_tiny_quantize.ipynb b/examples/bert_tiny_quantize.ipynb new file mode 100644 index 0000000..9b07d21 --- /dev/null +++ b/examples/bert_tiny_quantize.ipynb @@ -0,0 +1,233 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# BERT-tiny Quantization with Comprexx\n", + "\n", + "This notebook shows how to compress a small transformer model:\n", + "\n", + "1. Profile the model\n", + "2. Apply low-rank decomposition to shrink Linear layers\n", + "3. Apply weight-only INT4 quantization\n", + "4. 
Benchmark before/after\n", + "\n", + "We use a minimal 2-layer transformer so this runs in seconds on CPU.\n", + "\n", + "Install: `pip install comprexx`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": "import torch.nn as nn\n\nimport comprexx as cx" + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Define a small transformer\n", + "\n", + "2-layer encoder, d_model=128, 4 heads, feedforward dim=512. Small enough for a notebook, large enough to show compression working." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "class TinyBERT(nn.Module):\n", + " def __init__(self, vocab_size=1000, d_model=128, nhead=4, num_layers=2, num_classes=4):\n", + " super().__init__()\n", + " self.embedding = nn.Embedding(vocab_size, d_model)\n", + " encoder_layer = nn.TransformerEncoderLayer(\n", + " d_model=d_model, nhead=nhead, dim_feedforward=512, batch_first=True,\n", + " )\n", + " self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)\n", + " self.classifier = nn.Linear(d_model, num_classes)\n", + "\n", + " def forward(self, x):\n", + " x = self.embedding(x)\n", + " x = self.encoder(x)\n", + " # Mean pooling over sequence dim\n", + " x = x.mean(dim=1)\n", + " return self.classifier(x)\n", + "\n", + "model = TinyBERT()\n", + "model.eval()\n", + "print(f\"Model: {sum(p.numel() for p in model.parameters()):,} parameters\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Profile the model\n", + "\n", + "We pass token IDs as input, but the profiler needs a float tensor. We'll profile using the embedding output shape and note that the embedding table is counted in params." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# For profiling, we use a float input that skips the embedding.\n",
+    "# The full model takes integer token IDs, so we profile the encoder+classifier separately.\n",
+    "class EncoderClassifier(nn.Module):\n",
+    "    \"\"\"Wraps encoder + classifier with float input for profiling.\"\"\"\n",
+    "    def __init__(self, encoder, classifier):\n",
+    "        super().__init__()\n",
+    "        self.encoder = encoder\n",
+    "        self.classifier = classifier\n",
+    "\n",
+    "    def forward(self, x):\n",
+    "        x = self.encoder(x)\n",
+    "        return self.classifier(x.mean(dim=1))\n",
+    "\n",
+    "profiling_model = EncoderClassifier(model.encoder, model.classifier)\n",
+    "profile = cx.analyze(profiling_model, input_shape=(1, 32, 128))  # (batch, seq_len, d_model)\n",
+    "print(profile.summary())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Low-rank decomposition\n",
+    "\n",
+    "The feedforward layers inside the transformer encoder are 128x512 and 512x128. SVD can factorize these into pairs of smaller layers. We keep enough singular values to retain 90% of the spectral energy, matching `energy_threshold=0.9` below."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from comprexx.stages.base import StageContext\n",
+    "\n",
+    "stage_lr = cx.stages.LowRankDecomposition(mode=\"energy\", energy_threshold=0.9)\n",
+    "ctx = StageContext(input_shape=(1, 32, 128), device=\"cpu\")\n",
+    "\n",
+    "model_lr, report_lr = stage_lr.apply(profiling_model, ctx)\n",
+    "print(report_lr.summary())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4. Weight-only INT4 quantization\n",
+    "\n",
+    "After SVD, we quantize remaining Linear weights to INT4 (group size 64, symmetric). Activations stay in float32."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "stage_wq = cx.stages.WeightOnlyQuant(bits=4, group_size=64, symmetric=True)\n", + "\n", + "model_quant, report_wq = stage_wq.apply(model_lr, ctx)\n", + "print(report_wq.summary())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Or use a Pipeline for the same thing" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline = cx.Pipeline([\n", + " cx.stages.LowRankDecomposition(mode=\"energy\", energy_threshold=0.9),\n", + " cx.stages.WeightOnlyQuant(bits=4, group_size=64),\n", + "])\n", + "\n", + "result = pipeline.run(profiling_model, input_shape=(1, 32, 128))\n", + "print(result.report.summary())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Benchmark latency" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cmp = cx.compare_benchmarks(\n", + " profiling_model, result.model,\n", + " input_shape=(1, 32, 128),\n", + " warmup=10,\n", + " iters=50,\n", + ")\n", + "print(cmp.summary())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 7. Dynamic PTQ as an alternative\n", + "\n", + "If weight-only quantization isn't giving you enough speedup, dynamic PTQ quantizes both weights and activations to INT8 at runtime. It's a simpler path that works well for inference on CPU." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline_ptq = cx.Pipeline([\n", + " cx.stages.LowRankDecomposition(mode=\"energy\", energy_threshold=0.9),\n", + " cx.stages.PTQDynamic(),\n", + "])\n", + "\n", + "result_ptq = pipeline_ptq.run(profiling_model, input_shape=(1, 32, 128))\n", + "print(result_ptq.report.summary())\n", + "\n", + "cmp_ptq = cx.compare_benchmarks(\n", + " profiling_model, result_ptq.model,\n", + " input_shape=(1, 32, 128),\n", + " warmup=10,\n", + " iters=50,\n", + ")\n", + "print(\"\\n\" + cmp_ptq.summary())" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file diff --git a/examples/resnet18_edge_deploy.ipynb b/examples/resnet18_edge_deploy.ipynb new file mode 100644 index 0000000..75d670d --- /dev/null +++ b/examples/resnet18_edge_deploy.ipynb @@ -0,0 +1,209 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# ResNet18 Edge Deployment with Comprexx\n", + "\n", + "This notebook walks through compressing a ResNet18 for edge deployment:\n", + "\n", + "1. Profile the original model\n", + "2. Fuse Conv+BN layers (free compression)\n", + "3. Prune 40% of filters by L1 norm\n", + "4. Quantize to INT8 via dynamic PTQ\n", + "5. Benchmark before/after latency\n", + "6. Export to ONNX\n", + "\n", + "Install: `pip install \"comprexx[onnx]\"`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "import torch\n", + "\n", + "import comprexx as cx" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. 
Load and profile the model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = torch.hub.load(\"pytorch/vision\", \"resnet18\", weights=None)\n", + "model.eval()\n", + "\n", + "input_shape = (1, 3, 224, 224)\n", + "profile = cx.analyze(model, input_shape)\n", + "print(profile.summary())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Build the compression pipeline\n", + "\n", + "Three stages, applied in order:\n", + "- **Operator fusion**: fold BatchNorm into Conv2d. Zero accuracy cost, fewer params.\n", + "- **Structured pruning**: zero out the least important 40% of conv filters globally.\n", + "- **PTQ dynamic**: quantize Linear layers to INT8 at runtime." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline = cx.Pipeline([\n", + " cx.stages.OperatorFusion(),\n", + " cx.stages.StructuredPruning(sparsity=0.4, criteria=\"l1_norm\", scope=\"global\"),\n", + " cx.stages.PTQDynamic(),\n", + "])\n", + "\n", + "result = pipeline.run(model, input_shape=input_shape)\n", + "print(result.report.summary())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. Verify the compressed model still works" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "compressed = result.model\n", + "\n", + "with torch.no_grad():\n", + " x = torch.randn(*input_shape)\n", + " out_original = model(x)\n", + " out_compressed = compressed(x)\n", + "\n", + "print(f\"Original output shape: {out_original.shape}\")\n", + "print(f\"Compressed output shape: {out_compressed.shape}\")\n", + "print(f\"Max abs difference: {(out_original - out_compressed).abs().max():.4f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 4. 
Benchmark inference latency" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "comparison = cx.compare_benchmarks(\n", + " model, compressed,\n", + " input_shape=input_shape,\n", + " warmup=10,\n", + " iters=50,\n", + ")\n", + "print(comparison.summary())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 5. Export to ONNX\n", + "\n", + "The exporter runs `torch.onnx.export`, validates the output against PyTorch, and writes a manifest with compression metadata." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "exporter = cx.ONNXExporter()\n", + "exporter.export(\n", + " result.model,\n", + " input_shape=input_shape,\n", + " output_path=\"resnet18_compressed.onnx\",\n", + ")\n", + "\n", + "onnx_size = os.path.getsize(\"resnet18_compressed.onnx\") / 1e6\n", + "print(f\"ONNX file size: {onnx_size:.1f} MB\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 6. Same pipeline as a YAML recipe\n", + "\n", + "You can define this exact pipeline in YAML and run it from the CLI." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "recipe_yaml = \"\"\"\n", + "name: resnet18-edge\n", + "description: Pruned and quantized ResNet18 for edge deployment\n", + "\n", + "stages:\n", + " - technique: operator_fusion\n", + "\n", + " - technique: structured_pruning\n", + " sparsity: 0.4\n", + " criteria: l1_norm\n", + " scope: global\n", + "\n", + " - technique: ptq_dynamic\n", + " format: int8\n", + "\"\"\"\n", + "\n", + "print(recipe_yaml)\n", + "print(\"# Save as resnet18-edge.yaml, then run:\")\n", + "print(\n", + " \"# comprexx compress torchvision.models.resnet18\"\n", + " \" --recipe resnet18-edge.yaml --input-shape 1,3,224,224\"\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.10.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} \ No newline at end of file