From 05b1e912231d7568efc1ae6f4e1d45f40c5271e9 Mon Sep 17 00:00:00 2001 From: Jim Huang Date: Thu, 7 May 2026 18:05:12 +0800 Subject: [PATCH] Add USERPTR capture harness This introduces user/vpipe-capture-userptr.c, a V4L2 USERPTR ingress harness against vivid plus vb2_vmalloc. Two modes share a 4-buffer pool: noreset reuses the slot indefinitely and shows that vb2 caches the (userptr, length) pin per slot so re-QBUF skips GUP and per-frame minor faults stay at zero. reset allocates a fresh anonymous mapping before freeing the previous one so the kernel cannot reuse the VA, forces the cache to miss, and surfaces the full GUP slow-path cost as 75 minor faults per QBUF (== sizeimage / PAGE_SIZE on the validated guest). Per-QBUF fault accounting wraps only the QBUF ioctl with getrusage RUSAGE_SELF deltas so other buffers' submissions in the same thread do not pollute the count. Each CSV row pairs a DQBUF with the matching QBUF that submitted it for both latency and fault delta. scripts/bench_userptr.sh wraps bench_perf.sh with a reset/noreset allowlist on the mode argument. scripts/summarize-benchmark.py gains a phase_userptr mode with column validation, empty-CSV handling, non-numeric row diagnostics, and a warmup-row count derived from the distinct buffer indices observed in the CSV so the script stays in sync if the harness BUFFER_COUNT changes. 
Change-Id: I537abe4bb0bb4d4e9615cc17d5bec28330da7c0f --- .gitignore | 1 + docs/benchmark.md | 27 ++ scripts/bench_userptr.sh | 17 + ...ze_benchmark.py => summarize-benchmark.py} | 72 +++- user/Makefile | 2 + user/vpipe-capture-userptr.c | 324 ++++++++++++++++++ 6 files changed, 434 insertions(+), 9 deletions(-) create mode 100755 scripts/bench_userptr.sh rename scripts/{summarize_benchmark.py => summarize-benchmark.py} (59%) create mode 100644 user/vpipe-capture-userptr.c diff --git a/.gitignore b/.gitignore index a11f8ef..8ae0c27 100644 --- a/.gitignore +++ b/.gitignore @@ -15,6 +15,7 @@ externals/ user/*.o user/vpipe-capture-mmap user/vpipe-capture-read +user/vpipe-capture-userptr user/vpipe-capture-m2m user/vpipe-capture-dmabuf user/vpipe-bench-fixture diff --git a/docs/benchmark.md b/docs/benchmark.md index c70c8ed..68c7689 100644 --- a/docs/benchmark.md +++ b/docs/benchmark.md @@ -129,3 +129,30 @@ reference + kernel | Path | Copies / frame | Context switches / frame | Latency median / p95 / p99 | Cache refs / misses | Timestamp source | Notes | | --- | --- | --- | --- | --- | --- | --- | | explicit fence path | N/A | N/A | N/A | N/A | N/A | validated Ubuntu 25.10 lima guest (`6.17.0-22-generic`) exposes `request_fd` request support but no cited V4L2 userspace out-fence API, fence flags, or `fence_fd` field in the checked headers | + +## Phase 6: Cost Decomposition + +Phase 6 rows separate currently-aggregated costs into named knobs against the +heap-backed Phase 4 baseline. Each row names its column additions explicitly. + +### USERPTR ingress (memory and cache cost) + +Buffer model column distinguishes the two USERPTR usage patterns vivid + vb2 +exposes; per-QBUF minor faults come from the `minor_faults_delta` column the +harness records around each `VIDIOC_QBUF` ioctl. 
+ +| Path | Buffer model | Per-QBUF minor faults (mean / p95 / max) | Latency median / p95 / p99 | mmap_lock_hold_us | Notes | +| --- | --- | --- | --- | --- | --- | +| vivid -> USERPTR (4-buffer pool) | fixed pool of 4 anon mmaps reused indefinitely | 0 / 0 / 0 | 132.778 / 137.110 / 138.562 ms | TBD (ftrace) | `__qbuf_userptr` caches the `(userptr, length)` -> pinned-page mapping per slot; re-QBUF on the same userptr reuses the existing pin and skips GUP entirely | +| vivid -> USERPTR (fresh per QBUF) | fresh anon `mmap` per re-QBUF, allocated before the previous one is freed so the kernel cannot reuse the VA | 75 / 75 / 75 | 132.745 / 137.901 / 166.039 ms | TBD (ftrace) | 75 == `sizeimage / PAGE_SIZE`; vb2 cache miss forces a full GUP slow-path on each QBUF, charged to the calling thread; per-QBUF re-pin inflates p99 by ~27 ms over the cached row without moving the median | + +Validation conditions: + +- 600-frame run on Ubuntu 25.10 lima guest (`6.17.0-22-generic`, aarch64), + `vivid` 640x480 `V4L2_PIX_FMT_GREY` at 30 fps, 4 buffers +- harness: `user/vpipe-capture-userptr.c` +- summarizer: `scripts/summarize-benchmark.py phase_userptr <csv_path>` +- artifacts: `bench/userptr-noreset.csv`, `bench/userptr-reset.csv` +- `mmap_lock_hold_us` cell still depends on the ftrace `function_graph` + capture filtered to `mmap_read_lock`/`mmap_write_lock` around `VIDIOC_QBUF` + that the TODO calls out separately diff --git a/scripts/bench_userptr.sh b/scripts/bench_userptr.sh new file mode 100755 index 0000000..326fabd --- /dev/null +++ b/scripts/bench_userptr.sh @@ -0,0 +1,17 @@ +#!/usr/bin/env bash +set -euo pipefail + +frames=${1:-600} +video=${2:-/dev/video0} +mode=${3:-reset} +out_dir=${4:-bench} + +case "$mode" in + reset|noreset) ;; + *) echo "invalid mode '$mode' (expected reset|noreset)" >&2; exit 1 ;; +esac + +mkdir -p "$out_dir" + +scripts/bench_perf.sh "$out_dir/userptr-$mode" \ + user/vpipe-capture-userptr "$video" "$out_dir/userptr-$mode.csv" "$frames" "$mode" diff --git
a/scripts/summarize_benchmark.py b/scripts/summarize-benchmark.py similarity index 59% rename from scripts/summarize_benchmark.py rename to scripts/summarize-benchmark.py index ab79375..aa46589 100755 --- a/scripts/summarize_benchmark.py +++ b/scripts/summarize-benchmark.py @@ -49,7 +49,8 @@ def summarize_phase1(path): for row in rows ] intervals = [ - int(rows[i]["dequeue_monotonic_ns"]) - int(rows[i - 1]["dequeue_monotonic_ns"]) + int(rows[i]["dequeue_monotonic_ns"]) + - int(rows[i - 1]["dequeue_monotonic_ns"]) for i in range(1, len(rows)) ] timestamp_source = "V4L2 buffer timestamp + CLOCK_MONOTONIC" @@ -59,7 +60,8 @@ def summarize_phase1(path): for row in rows ] intervals = [ - int(rows[i]["read_end_monotonic_ns"]) - int(rows[i - 1]["read_end_monotonic_ns"]) + int(rows[i]["read_end_monotonic_ns"]) + - int(rows[i - 1]["read_end_monotonic_ns"]) for i in range(1, len(rows)) ] timestamp_source = "CLOCK_MONOTONIC" @@ -100,22 +102,68 @@ def summarize_latency_csv(path, start_key, end_key): for row in reader: rows.append(row) - latencies = [ - int(row[end_key]) - int(row[start_key]) - for row in rows - ] + latencies = [int(row[end_key]) - int(row[start_key]) for row in rows] + return { + "frames": len(rows), + "median_ns": statistics.median(latencies), + "p95_ns": percentile(sorted(latencies), 0.95), + "p99_ns": percentile(sorted(latencies), 0.99), + } + + +def summarize_phase_userptr(path): + rows = [] + with open(path, newline="") as fh: + reader = csv.DictReader(fh) + required = { + "buffer_index", + "enqueue_monotonic_ns", + "dequeue_monotonic_ns", + "minor_faults_delta", + } + if reader.fieldnames is None or not required.issubset(reader.fieldnames): + missing = sorted(required.difference(reader.fieldnames or [])) + raise SystemExit(f"{path}: missing required columns: {missing}") + for idx, row in enumerate(reader): + try: + row["_buffer_index"] = int(row["buffer_index"]) + row["_enqueue_ns"] = int(row["enqueue_monotonic_ns"]) + row["_dequeue_ns"] = 
int(row["dequeue_monotonic_ns"]) + row["_faults"] = int(row["minor_faults_delta"]) + except (TypeError, ValueError) as exc: + raise SystemExit(f"{path}: non-numeric value at row {idx}: {exc}") + rows.append(row) + + if not rows: + raise SystemExit(f"{path}: no data rows") + + latencies = [row["_dequeue_ns"] - row["_enqueue_ns"] for row in rows] + # The harness primes one USERPTR buffer per queue slot before STREAMON. + # Those first-touch costs show up once per distinct buffer index, so skip + # one initial dequeue for each buffer seen in the capture. + warmup_rows = len({row["_buffer_index"] for row in rows}) + steady_faults = [row["_faults"] for row in rows[warmup_rows:]] return { "frames": len(rows), "median_ns": statistics.median(latencies), "p95_ns": percentile(sorted(latencies), 0.95), "p99_ns": percentile(sorted(latencies), 0.99), + "warmup_rows_skipped": min(warmup_rows, len(rows)), + "minor_faults_per_qbuf_mean": ( + statistics.mean(steady_faults) if steady_faults else 0.0 + ), + "minor_faults_per_qbuf_p95": ( + percentile(sorted(steady_faults), 0.95) if steady_faults else 0 + ), + "minor_faults_per_qbuf_max": max(steady_faults) if steady_faults else 0, } def main(): if len(sys.argv) != 3: print( - "usage: summarize_benchmark.py phase1|phase2|phase_m2m|phase_fixture csv_path", + "usage: summarize-benchmark.py " + "phase1|phase2|phase_m2m|phase_fixture|phase_userptr csv_path", file=sys.stderr, ) return 1 @@ -136,6 +184,8 @@ def main(): summary = summarize_latency_csv( csv_path, "enqueue_monotonic_ns", "dequeue_monotonic_ns" ) + elif mode == "phase_userptr": + summary = summarize_phase_userptr(csv_path) else: print(f"unsupported mode: {mode}", file=sys.stderr) return 1 @@ -143,8 +193,12 @@ def main(): perf = load_perf(perf_path) if perf_path.exists() else {} frames = summary["frames"] if perf and frames: - summary["context_switches_per_frame"] = perf.get("context-switches", 0.0) / frames - summary["cache_references_per_frame"] = perf.get("cache-references", 
0.0) / frames + summary["context_switches_per_frame"] = ( + perf.get("context-switches", 0.0) / frames + ) + summary["cache_references_per_frame"] = ( + perf.get("cache-references", 0.0) / frames + ) summary["cache_misses_per_frame"] = perf.get("cache-misses", 0.0) / frames summary["cycles_per_frame"] = perf.get("cycles", 0.0) / frames summary["instructions_per_frame"] = perf.get("instructions", 0.0) / frames diff --git a/user/Makefile b/user/Makefile index fba228e..08b587b 100644 --- a/user/Makefile +++ b/user/Makefile @@ -5,6 +5,7 @@ COMMON_OBJS := vpipe-common.o BINARIES := \ vpipe-capture-mmap \ vpipe-capture-read \ + vpipe-capture-userptr \ vpipe-capture-m2m \ vpipe-capture-dmabuf \ vpipe-bench-fixture \ @@ -21,6 +22,7 @@ all: $(BINARIES) $(TEST_BINARIES) vpipe-capture-mmap: vpipe-capture-mmap.o $(COMMON_OBJS) vpipe-capture-read: vpipe-capture-read.o $(COMMON_OBJS) +vpipe-capture-userptr: vpipe-capture-userptr.o $(COMMON_OBJS) vpipe-capture-m2m: vpipe-capture-m2m.o $(COMMON_OBJS) vpipe-capture-dmabuf: vpipe-capture-dmabuf.o $(COMMON_OBJS) vpipe-bench-fixture: vpipe-bench-fixture.o $(COMMON_OBJS) diff --git a/user/vpipe-capture-userptr.c b/user/vpipe-capture-userptr.c new file mode 100644 index 0000000..9e77b9b --- /dev/null +++ b/user/vpipe-capture-userptr.c @@ -0,0 +1,324 @@ +/* SPDX-License-Identifier: MIT */ + +#define _POSIX_C_SOURCE 200809L +/* _DEFAULT_SOURCE exposes MAP_ANONYMOUS and ru_minflt on glibc. 
*/ +#define _DEFAULT_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vpipe-common.h" + +#define BUFFER_COUNT 4 + +static int parse_uint_range(const char *s, + unsigned long min, + unsigned long max, + unsigned long *out) +{ + char *end; + unsigned long v; + + if (!s || *s == '\0' || *s == '-') + return -1; + errno = 0; + v = strtoul(s, &end, 10); + if (errno || *end != '\0' || v < min || v > max) + return -1; + *out = v; + return 0; +} + +static long minor_faults_now(void) +{ + struct rusage ru; + + if (getrusage(RUSAGE_SELF, &ru) < 0) + return -1; + return ru.ru_minflt; +} + +/* USERPTR ingress harness for the vpipe Phase 6 cost-decomposition row. + * + * vivid backs its capture queue with vb2_vmalloc, so the kernel pin path here + * is mmap-and-fault, not DMA pinning. v4l2's __qbuf_userptr caches pinned-page + * mapping per buffer slot: if the same userptr/length is QBUFed to the same + * slot, vb2 reuses the existing pin and skips GUP entirely, so a fixed buffer + * pool incurs zero per-frame faults. To measure the worst-case per-QBUF pin + * cost (the row named userptr_minor_faults_per_frame), reset mode mmaps a fresh + * anonymous region for each re-QBUF, forcing the cache to miss and the kernel + * to re-pin from scratch. noreset mode keeps the fixed pool (cached pin path). + * Usage: vpipe-capture-userptr [video] [csv] [frames] [reset|noreset] + */ +int main(int argc, char **argv) +{ + const char *video = argc > 1 ? argv[1] : "/dev/video0"; + const char *csv = argc > 2 ? argv[2] : "bench/userptr.csv"; + unsigned long frames_ul = 600; + + /* Default to reset mode to expose the per-frame fault cost the row is named + * for; "noreset" measures the steady-state buffer-reuse case.
+ */ + bool reset_between_frames = true; + void *buffers[BUFFER_COUNT] = {0}; + size_t lengths[BUFFER_COUNT] = {0}; + uint64_t queued_at[BUFFER_COUNT] = {0}; + long qbuf_minflt[BUFFER_COUNT] = {0}; + bool streaming = false; + size_t sizeimage; + size_t aligned_len; + uint32_t userptr_length; + long page_size; + FILE *fp = NULL; + int fd = -1; + int rc = 1; + unsigned long i; + + if (argc > 3 && parse_uint_range(argv[3], 1, UINT_MAX, &frames_ul) < 0) { + fprintf(stderr, "invalid frames: %s\n", argv[3]); + return 1; + } + + if (argc > 4) { + if (strcmp(argv[4], "reset") == 0) + reset_between_frames = true; + else if (strcmp(argv[4], "noreset") == 0) + reset_between_frames = false; + else { + fprintf(stderr, "invalid mode '%s' (expected reset|noreset)\n", + argv[4]); + return 1; + } + } + + page_size = sysconf(_SC_PAGESIZE); + if (page_size <= 0) + page_size = 4096; + + fd = open(video, O_RDWR); + if (fd < 0) { + perror("open"); + goto cleanup; + } + + if (vpipe_set_format(fd, V4L2_BUF_TYPE_VIDEO_CAPTURE, 640, 480, + V4L2_PIX_FMT_GREY) < 0) { + perror("VIDIOC_S_FMT"); + goto cleanup; + } + + sizeimage = vpipe_get_sizeimage(fd, V4L2_BUF_TYPE_VIDEO_CAPTURE); + if (sizeimage == 0) { + fprintf(stderr, "VIDIOC_G_FMT: zero sizeimage\n"); + goto cleanup; + } + + /* Page-align so the kernel pin path operates on whole pages and reset + * mode's mmap/munmap pair covers exactly the range each QBUF advertises. + */ + aligned_len = + (sizeimage + (size_t) page_size - 1) & ~((size_t) page_size - 1); + + /* buf.length is __u32; reject formats whose page-aligned size would + * truncate before any QBUF advertises a shorter buffer than was mapped.
+ */ + if (aligned_len > UINT32_MAX) { + fprintf(stderr, "aligned buffer length %zu exceeds uint32_t\n", + aligned_len); + goto cleanup; + } + userptr_length = (uint32_t) aligned_len; + + for (i = 0; i < BUFFER_COUNT; i++) { + void *p = mmap(NULL, aligned_len, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (p == MAP_FAILED) { + perror("mmap anonymous"); + goto cleanup; + } + buffers[i] = p; + lengths[i] = aligned_len; + } + + { + struct v4l2_requestbuffers req; + + memset(&req, 0, sizeof(req)); + req.count = BUFFER_COUNT; + req.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; + req.memory = V4L2_MEMORY_USERPTR; + if (vpipe_xioctl(fd, VIDIOC_REQBUFS, &req) < 0) { + perror("VIDIOC_REQBUFS USERPTR"); + goto cleanup; + } + if (req.count < BUFFER_COUNT) { + fprintf(stderr, + "USERPTR REQBUFS returned %u (expected %u); driver may " + "lack USERPTR support\n", + req.count, (unsigned int) BUFFER_COUNT); + goto cleanup; + } + } + + fp = vpipe_open_csv( + csv, + "frame,buffer_index,enqueue_monotonic_ns,dequeue_monotonic_ns,v4l2_" + "timestamp_sec,v4l2_timestamp_usec,sequence,bytesused,minor_faults_" + "delta"); + if (!fp) { + perror("csv"); + goto cleanup; + } + setvbuf(fp, NULL, _IOLBF, 0); + + for (i = 0; i < BUFFER_COUNT; i++) { + struct v4l2_buffer buf; + + memset(&buf, 0, sizeof(buf)); + buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; + buf.memory = V4L2_MEMORY_USERPTR; + buf.index = i; + buf.m.userptr = (unsigned long) buffers[i]; + buf.length = userptr_length; + queued_at[i] = vpipe_now_monotonic_ns(); + /* Pre-streamon QBUFs are the warmup pin; their fault cost is not + * measured, so the first CSV row of each slot reports a delta of 0.
*/ + if (vpipe_xioctl(fd, VIDIOC_QBUF, &buf) < 0) { + perror("VIDIOC_QBUF"); + goto cleanup; + } + qbuf_minflt[i] = 0; + } + + { + enum v4l2_buf_type type = V4L2_BUF_TYPE_VIDEO_CAPTURE; + + if (vpipe_xioctl(fd, VIDIOC_STREAMON, &type) < 0) { + perror("VIDIOC_STREAMON"); + goto cleanup; + } + streaming = true; + } + + for (i = 0; i < frames_ul; i++) { + struct v4l2_buffer buf; + uint64_t dq_ns; + uint64_t enqueue_ns; + long minflt_before_qbuf; + long minflt_after_qbuf; + long delta; + unsigned int idx; + + memset(&buf, 0, sizeof(buf)); + buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; + buf.memory = V4L2_MEMORY_USERPTR; + if (vpipe_xioctl(fd, VIDIOC_DQBUF, &buf) < 0) { + perror("VIDIOC_DQBUF"); + goto cleanup; + } + if (buf.index >= BUFFER_COUNT) { + fprintf(stderr, "userptr index out of range: %u\n", buf.index); + goto cleanup; + } + idx = buf.index; + if (buf.m.userptr != (unsigned long) buffers[idx]) { + fprintf(stderr, + "DQBUF returned mismatched userptr %#lx for index %u " + "(expected %#lx)\n", + (unsigned long) buf.m.userptr, idx, + (unsigned long) buffers[idx]); + goto cleanup; + } + + dq_ns = vpipe_now_monotonic_ns(); + /* Pair this DQBUF with the QBUF that submitted it so the row's latency + * and fault delta both describe the same buffer transit. + */ + enqueue_ns = queued_at[idx]; + delta = qbuf_minflt[idx]; + + fprintf(fp, "%lu,%u,%" PRIu64 ",%" PRIu64 ",%ld,%ld,%u,%u,%ld\n", i, + idx, enqueue_ns, dq_ns, (long) buf.timestamp.tv_sec, + (long) buf.timestamp.tv_usec, buf.sequence, buf.bytesused, + delta); + + if (reset_between_frames) { + /* Replace the slot with a fresh anonymous mapping so the next QBUF + * advertises a userptr the vb2 cache has not seen, forcing GUP to + * re-fault all pages and surfacing the cost in ru_minflt. Allocate + * the new region BEFORE freeing the old so the kernel cannot reuse + * the same VA — a same-address remap would silently hit + * __qbuf_userptr's (userptr, length) cache and bypass GUP.
+ */ + void *old = buffers[idx]; + size_t old_len = lengths[idx]; + void *fresh = mmap(NULL, aligned_len, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (fresh == MAP_FAILED) { + perror("mmap anonymous (reset)"); + goto cleanup; + } + if (fresh == old) { + fprintf(stderr, "reset mmap collided with prior buffer at %p\n", + old); + munmap(fresh, aligned_len); + goto cleanup; + } + buffers[idx] = fresh; + lengths[idx] = aligned_len; + if (munmap(old, old_len) < 0) { + perror("munmap (reset)"); + goto cleanup; + } + } + + memset(&buf, 0, sizeof(buf)); + buf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; + buf.memory = V4L2_MEMORY_USERPTR; + buf.index = idx; + buf.m.userptr = (unsigned long) buffers[idx]; + buf.length = userptr_length; + + /* Wrap only the QBUF ioctl so the delta attributes faults to the kernel + * pin path, not to other buffers' work in this thread. + */ + minflt_before_qbuf = minor_faults_now(); + queued_at[idx] = vpipe_now_monotonic_ns(); + if (vpipe_xioctl(fd, VIDIOC_QBUF, &buf) < 0) { + perror("VIDIOC_QBUF"); + goto cleanup; + } + minflt_after_qbuf = minor_faults_now(); + qbuf_minflt[idx] = (minflt_before_qbuf < 0 || minflt_after_qbuf < 0) + ? -1 + : minflt_after_qbuf - minflt_before_qbuf; + } + + rc = 0; + +cleanup: + if (fd >= 0 && streaming) { + enum v4l2_buf_type type = V4L2_BUF_TYPE_VIDEO_CAPTURE; + (void) vpipe_xioctl(fd, VIDIOC_STREAMOFF, &type); + } + if (fp) + fclose(fp); + for (i = 0; i < BUFFER_COUNT; i++) { + if (buffers[i]) + munmap(buffers[i], lengths[i]); + } + if (fd >= 0) + close(fd); + return rc; +}