Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ externals/
user/*.o
user/vpipe-capture-mmap
user/vpipe-capture-read
user/vpipe-capture-userptr
user/vpipe-capture-m2m
user/vpipe-capture-dmabuf
user/vpipe-bench-fixture
Expand Down
27 changes: 27 additions & 0 deletions docs/benchmark.md
Original file line number Diff line number Diff line change
Expand Up @@ -129,3 +129,30 @@ reference + kernel
| Path | Copies / frame | Context switches / frame | Latency median / p95 / p99 | Cache refs / misses | Timestamp source | Notes |
| --- | --- | --- | --- | --- | --- | --- |
| explicit fence path | N/A | N/A | N/A | N/A | N/A | validated Ubuntu 25.10 lima guest (`6.17.0-22-generic`) exposes `request_fd` request support but no cited V4L2 userspace out-fence API, fence flags, or `fence_fd` field in the checked headers |

## Phase 6: Cost Decomposition

Phase 6 rows separate currently-aggregated costs into named knobs against the
heap-backed Phase 4 baseline. Each row names its column additions explicitly.

### USERPTR ingress (memory and cache cost)

The Buffer model column distinguishes the two USERPTR usage patterns that
vivid + vb2 expose; per-QBUF minor faults come from the `minor_faults_delta`
column that the harness records around each `VIDIOC_QBUF` ioctl.

| Path | Buffer model | Per-QBUF minor faults (mean / p95 / max) | Latency median / p95 / p99 | mmap_lock_hold_us | Notes |
| --- | --- | --- | --- | --- | --- |
| vivid -> USERPTR (4-buffer pool) | fixed pool of 4 anon mmaps reused indefinitely | 0 / 0 / 0 | 132.778 / 137.110 / 138.562 ms | TBD (ftrace) | `__qbuf_userptr` caches the `(userptr, length)` -> pinned-page mapping per slot; re-QBUF on the same userptr reuses the existing pin and skips GUP entirely |
| vivid -> USERPTR (fresh per QBUF) | fresh anon `mmap` per re-QBUF, allocated before the previous one is freed so the kernel cannot reuse the VA | 75 / 75 / 75 | 132.745 / 137.901 / 166.039 ms | TBD (ftrace) | 75 == `sizeimage / PAGE_SIZE`; vb2 cache miss forces a full GUP slow-path on each QBUF, charged to the calling thread; per-QBUF re-pin inflates p99 by ~22 ms over the cached row without moving the median |

Validation conditions:

- 600-frame run on Ubuntu 25.10 lima guest (`6.17.0-22-generic`, aarch64),
`vivid` 640x480 `V4L2_PIX_FMT_GREY` at 30 fps, 4 buffers
- harness: `user/vpipe-capture-userptr.c`
- summarizer: `scripts/summarize-benchmark.py phase_userptr <csv>`
- artifacts: `bench/userptr-noreset.csv`, `bench/userptr-reset.csv`
- the `mmap_lock_hold_us` cell is still TBD: it depends on the ftrace
`function_graph` capture filtered to `mmap_read_lock`/`mmap_write_lock`
around `VIDIOC_QBUF`, which the TODO calls out separately
17 changes: 17 additions & 0 deletions scripts/bench_userptr.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/usr/bin/env bash
# Run user/vpipe-capture-userptr through scripts/bench_perf.sh.
#
# Usage: bench_userptr.sh [frames] [video_device] [reset|noreset] [out_dir]
#
# Defaults: 600 frames, /dev/video0, mode "reset", output directory "bench".
# Paths are relative, so this is expected to be run from the repository root.
set -euo pipefail

frame_count="${1:-600}"
device="${2:-/dev/video0}"
qbuf_mode="${3:-reset}"
output_dir="${4:-bench}"

# Reject anything other than the two modes the harness is invoked with.
if [[ "$qbuf_mode" != "reset" && "$qbuf_mode" != "noreset" ]]; then
    echo "invalid mode '$qbuf_mode' (expected reset|noreset)" >&2
    exit 1
fi

mkdir -p "$output_dir"

# The same prefix names both the bench_perf.sh output and the harness CSV.
run_prefix="$output_dir/userptr-$qbuf_mode"

scripts/bench_perf.sh "$run_prefix" \
    user/vpipe-capture-userptr "$device" "$run_prefix.csv" "$frame_count" "$qbuf_mode"
72 changes: 63 additions & 9 deletions scripts/summarize_benchmark.py → scripts/summarize-benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ def summarize_phase1(path):
for row in rows
]
intervals = [
int(rows[i]["dequeue_monotonic_ns"]) - int(rows[i - 1]["dequeue_monotonic_ns"])
int(rows[i]["dequeue_monotonic_ns"])
- int(rows[i - 1]["dequeue_monotonic_ns"])
for i in range(1, len(rows))
]
timestamp_source = "V4L2 buffer timestamp + CLOCK_MONOTONIC"
Expand All @@ -59,7 +60,8 @@ def summarize_phase1(path):
for row in rows
]
intervals = [
int(rows[i]["read_end_monotonic_ns"]) - int(rows[i - 1]["read_end_monotonic_ns"])
int(rows[i]["read_end_monotonic_ns"])
- int(rows[i - 1]["read_end_monotonic_ns"])
for i in range(1, len(rows))
]
timestamp_source = "CLOCK_MONOTONIC"
Expand Down Expand Up @@ -100,22 +102,68 @@ def summarize_latency_csv(path, start_key, end_key):
for row in reader:
rows.append(row)

latencies = [
int(row[end_key]) - int(row[start_key])
for row in rows
]
latencies = [int(row[end_key]) - int(row[start_key]) for row in rows]
return {
"frames": len(rows),
"median_ns": statistics.median(latencies),
"p95_ns": percentile(sorted(latencies), 0.95),
"p99_ns": percentile(sorted(latencies), 0.99),
}


def summarize_phase_userptr(path):
    """Summarize a USERPTR capture CSV: latency and per-QBUF minor faults.

    Args:
        path: CSV file containing the ``buffer_index``,
            ``enqueue_monotonic_ns``, ``dequeue_monotonic_ns`` and
            ``minor_faults_delta`` columns.

    Returns:
        A dict with ``frames``, ``median_ns``, ``p95_ns``, ``p99_ns``
        (dequeue minus enqueue, over every row), ``warmup_rows_skipped`` and
        ``minor_faults_per_qbuf_mean`` / ``_p95`` / ``_max`` (over the rows
        left after warm-up). The fault statistics are 0 when no rows remain
        after warm-up.

    Raises:
        SystemExit: if a required column is missing, a value is not an
            integer, or the file has no data rows.
    """
    required = {
        "buffer_index",
        "enqueue_monotonic_ns",
        "dequeue_monotonic_ns",
        "minor_faults_delta",
    }
    # One (buffer_index, latency_ns, minor_faults) tuple per data row.
    samples = []
    with open(path, newline="") as fh:
        reader = csv.DictReader(fh)
        if reader.fieldnames is None or not required.issubset(reader.fieldnames):
            missing = sorted(required.difference(reader.fieldnames or []))
            raise SystemExit(f"{path}: missing required columns: {missing}")
        for idx, row in enumerate(reader):
            try:
                buffer_index = int(row["buffer_index"])
                enqueue_ns = int(row["enqueue_monotonic_ns"])
                dequeue_ns = int(row["dequeue_monotonic_ns"])
                faults = int(row["minor_faults_delta"])
            except (TypeError, ValueError) as exc:
                # TypeError covers short rows, where DictReader yields None.
                raise SystemExit(
                    f"{path}: non-numeric value at row {idx}: {exc}"
                ) from exc
            samples.append((buffer_index, dequeue_ns - enqueue_ns, faults))

    if not samples:
        raise SystemExit(f"{path}: no data rows")

    # Sort once and reuse for both percentiles.
    sorted_latencies = sorted(latency for _, latency, _ in samples)

    # The harness primes one USERPTR buffer per queue slot before STREAMON.
    # Those first-touch costs show up once per distinct buffer index, so skip
    # the first row seen for each buffer index. Skipping by first occurrence
    # (rather than dropping the first N rows) keeps this correct even if the
    # initial dequeues are not in strict round-robin order.
    seen_buffers = set()
    steady_faults = []
    for buffer_index, _, faults in samples:
        if buffer_index in seen_buffers:
            steady_faults.append(faults)
        else:
            seen_buffers.add(buffer_index)
    sorted_faults = sorted(steady_faults)

    return {
        "frames": len(samples),
        "median_ns": statistics.median(sorted_latencies),
        "p95_ns": percentile(sorted_latencies, 0.95),
        "p99_ns": percentile(sorted_latencies, 0.99),
        "warmup_rows_skipped": len(seen_buffers),
        "minor_faults_per_qbuf_mean": (
            statistics.mean(steady_faults) if steady_faults else 0.0
        ),
        "minor_faults_per_qbuf_p95": (
            percentile(sorted_faults, 0.95) if sorted_faults else 0
        ),
        "minor_faults_per_qbuf_max": sorted_faults[-1] if sorted_faults else 0,
    }


def main():
if len(sys.argv) != 3:
print(
"usage: summarize_benchmark.py phase1|phase2|phase_m2m|phase_fixture csv_path",
"usage: summarize-benchmark.py "
"phase1|phase2|phase_m2m|phase_fixture|phase_userptr csv_path",
file=sys.stderr,
)
return 1
Expand All @@ -136,15 +184,21 @@ def main():
summary = summarize_latency_csv(
csv_path, "enqueue_monotonic_ns", "dequeue_monotonic_ns"
)
elif mode == "phase_userptr":
summary = summarize_phase_userptr(csv_path)
else:
print(f"unsupported mode: {mode}", file=sys.stderr)
return 1

perf = load_perf(perf_path) if perf_path.exists() else {}
frames = summary["frames"]
if perf and frames:
summary["context_switches_per_frame"] = perf.get("context-switches", 0.0) / frames
summary["cache_references_per_frame"] = perf.get("cache-references", 0.0) / frames
summary["context_switches_per_frame"] = (
perf.get("context-switches", 0.0) / frames
)
summary["cache_references_per_frame"] = (
perf.get("cache-references", 0.0) / frames
)
summary["cache_misses_per_frame"] = perf.get("cache-misses", 0.0) / frames
summary["cycles_per_frame"] = perf.get("cycles", 0.0) / frames
summary["instructions_per_frame"] = perf.get("instructions", 0.0) / frames
Expand Down
2 changes: 2 additions & 0 deletions user/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ COMMON_OBJS := vpipe-common.o
BINARIES := \
vpipe-capture-mmap \
vpipe-capture-read \
vpipe-capture-userptr \
vpipe-capture-m2m \
vpipe-capture-dmabuf \
vpipe-bench-fixture \
Expand All @@ -21,6 +22,7 @@ all: $(BINARIES) $(TEST_BINARIES)

vpipe-capture-mmap: vpipe-capture-mmap.o $(COMMON_OBJS)
vpipe-capture-read: vpipe-capture-read.o $(COMMON_OBJS)
vpipe-capture-userptr: vpipe-capture-userptr.o $(COMMON_OBJS)
vpipe-capture-m2m: vpipe-capture-m2m.o $(COMMON_OBJS)
vpipe-capture-dmabuf: vpipe-capture-dmabuf.o $(COMMON_OBJS)
vpipe-bench-fixture: vpipe-bench-fixture.o $(COMMON_OBJS)
Expand Down
Loading