From e98011bb262cd44a2aff10da6e59ecba3f666045 Mon Sep 17 00:00:00 2001 From: Hitalo Souza Date: Fri, 3 Jul 2026 13:47:17 -0300 Subject: [PATCH 1/2] perf(vanilla-io_uring): converge onto the epoll twin (zero-alloc rendering, /pipeline skip-decode, crud slab) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Port every backend-agnostic optimization from the vanilla-epoll entry so the two share one audited set of response builders and diff cleanly. The io_uring backend supports only a stateless request_handler (no async_handler / make_state / TLS, enghitalo/vanilla#83), so DB access stays on the blocking db.pg client; everything else now matches epoll byte-for-byte. Implements enghitalo/vanilla#84 (zero-alloc int parse/format) and enghitalo/vanilla#85 (crud: 1-query list, byte-rendered GET, fast body parse): - wi: negative-aware (fixes a latent wrong body for a negative /baseline11 sum) - emit / emit_int (stack scratch) / emit_xcache: zero-alloc response framing; /baseline11 and /upload no longer allocate an int->string per request - /pipeline: skip-decode fast path (blit the const before parsing) + decode_into (no Result boxing) on the main parse path - render_item_pg: byte-level JSON straight from db.pg text rows — removes the per-request json.encode reflection on /async-db, /crud list, /crud GET - crud cache: id-indexed slab (replaces map[int]string) with in-place buffer reuse and cache-aside invalidation, shared across ring workers under RwMutex - crud_list: single windowed query (count(*) OVER()) instead of page + count(*) - parse_crud_body_fast + borrowed json field parsers (json.decode fallback kept) - parse_i64_slice / dechunk_into / parse_hex_slice: allocation-free parsing - static: sendfile_min_bytes=16KiB, matching epoll (bounds per-conn RSS at high conns) DB profiles remain capped by the blocking db.pg on the single ring worker (enghitalo/vanilla#83) — unchanged here by design. Validated: both images build; every route (pipeline, baseline +/-, upload, json, json-comp, async-db, fortunes, static, crud list/get/create/update, 404, json-tls) is byte-for-byte identical to vanilla-epoll against a pristine seeded Postgres, and the X-Cache MISS->HIT->re-MISS-after-PUT sequence holds. Co-Authored-By: Claude Opus 4.8 (1M context) --- frameworks/vanilla-io_uring/main.v | 1037 +++++++++++++++++++--------- 1 file changed, 709 insertions(+), 328 deletions(-) diff --git a/frameworks/vanilla-io_uring/main.v b/frameworks/vanilla-io_uring/main.v index 89e902cc9..391d6cb62 100644 --- a/frameworks/vanilla-io_uring/main.v +++ b/frameworks/vanilla-io_uring/main.v @@ -28,35 +28,50 @@ struct DatasetItem { rating Rating } -struct DbItem { +struct CrudCreate { id int name string category string price int quantity int - active bool - tags []string - rating Rating -} - -struct DbResp { - items []DbItem - count int } struct Fortune { id int - message string + message []u8 // BORROWED view into the pg.Row string data (valid during render only) } +// CrudSlot is one entry of the id-indexed crud cache slab (ported from vanilla-epoll). +// `buf` is the rendered item response body, refilled IN PLACE across re-caches +// (allocated once, lazily, then reused — never orphaned). The entry is a valid HIT iff +// `valid && buf.len > 0`; a PUT just sets `valid = false` and keeps the buffer for the +// next MISS to reuse (cache-aside, no time-based expiry). The slab replaces the former +// map[int]string: identical structure to the epoll twin, so the two entries stay +// diffable, and it never re-encodes with json.encode on a MISS. +struct CrudSlot { +mut: + buf []u8 + valid bool +} + +// crud GET/PUT ids are `{RAND:1:50000}`; index the slab directly (1..50000). Index 0 is +// unused; created items (`{SEQ:100001}`) fall outside and are never read-cached. +const crud_cache_slots = 50001 +// Fixed per-slot buffer cap. The widest seed item renders to ~202 B; 512 leaves ample +// margin so a slot's buffer, allocated once on first MISS, never reallocates. +const crud_cache_bufcap = 512 + struct Shared { mut: db &pg.DB = unsafe { nil } dataset []DatasetItem prefixes []string // per item: `{…,"total":` (everything but the request-dependent total) asv static_assets.AssetServer // /static/* via the audited module (negotiation + queue_buf borrowed send) - cache map[int]string // crud cache-aside: id -> item JSON - cache_mu &sync.RwMutex = unsafe { nil } + // crud cache-aside: id-indexed slab (see CrudSlot). Shared across workers (this + // request_handler closure is the same for every ring worker), so the two validate + // probes (MISS then HIT) hit the same cache regardless of which worker serves them. + crud []CrudSlot + crud_mu &sync.RwMutex = unsafe { nil } // json-comp cache: the gzipped response for a given (count, m) is fully // deterministic and gzip dominates the cost, so compress once and reuse. // Key = (count << 32) | m. The benchmark hits only a few pairs, so it's tiny. @@ -64,13 +79,9 @@ mut: gz_mu &sync.RwMutex = unsafe { nil } } -struct CrudCreate { - id int - name string - category string - price int - quantity int -} +// ── zero-alloc write helpers (push_many, never single-element `<<`) ────────── +// This whole block is byte-for-byte identical to the vanilla-epoll twin: the two +// entries share one audited set of response builders so they stay diffable. // ws appends a string's bytes to `out` with no allocation (push_many copies // straight from the string's backing storage into the connection write buffer). @@ -79,11 +90,18 @@ fn ws(mut out []u8, s string) { unsafe { out.push_many(s.str, s.len) } } -// wi appends the decimal digits of a non-negative integer to `out`, no -// allocation (itoa into a stack scratch, emitted most-significant-first). -// The digits are written into the scratch back-to-front and flushed with a -// single `push_many` — single-element `<<` is several times slower than a bulk -// copy on post-0.5.1 V (vlang/v#27468), and this path runs for every number. +// wb appends a byte slice (e.g. a precomputed const response) into `out` with a +// single bulk copy — no allocation. +@[inline] +fn wb(mut out []u8, b []u8) { + unsafe { out.push_many(b.data, b.len) } +} + +// wi appends the decimal digits of an integer to `out`, no allocation (itoa into a +// stack scratch, flushed with a single push_many — single-element `<<` is several +// times slower on post-0.5.1 V, vlang/v#27468). Negative-aware: `/baseline11` sums +// can go negative (a+b with negative query ints), so it mirrors the epoll twin's +// signed formatter rather than the old non-negative-only version. @[direct_array_access] fn wi(mut out []u8, n i64) { mut tmp := [20]u8{} @@ -92,28 +110,66 @@ fn wi(mut out []u8, n i64) { unsafe { out.push_many(&tmp[0], 1) } return } - mut x := n + neg := n < 0 + // Build the magnitude in u64: i64::MIN's i64 negation overflows (the value isn't + // representable as i64), so derive it as -(n+1)+1 with the +1 in u64. + mut x := u64(n) + if neg { + x = u64(-(n + 1)) + 1 + } mut i := 20 for x > 0 { i-- tmp[i] = u8(`0`) + u8(x % 10) x /= 10 } + if neg { + i-- + tmp[i] = u8(`-`) + } unsafe { out.push_many(&tmp[i], 20 - i) } } -// wb appends a byte slice (e.g. a precomputed const response) into `out` with a -// single bulk copy — no allocation. -@[inline] -fn wb(mut out []u8, b []u8) { - unsafe { out.push_many(b.data, b.len) } +// ws_json_str appends a JSON-escaped string value (no surrounding quotes). Fast path: +// most values have no special characters, so emit them as one bulk copy. +@[direct_array_access] +fn ws_json_str(mut out []u8, s []u8) { + mut needs := false + for c in s { + if c == `"` || c == `\\` || c < 0x20 { + needs = true + break + } + } + if !needs { + wb(mut out, s) + return + } + for c in s { + match c { + `"` { ws(mut out, '\\"') } + `\\` { ws(mut out, '\\\\') } + `\n` { ws(mut out, '\\n') } + `\r` { ws(mut out, '\\r') } + `\t` { ws(mut out, '\\t') } + else { unsafe { out.push_many(&c, 1) } } + } + } +} + +// emit writes a complete 200 response with a precomputed body into `out`. +fn emit(mut out []u8, ctype string, body []u8) { + ws(mut out, 'HTTP/1.1 200 OK\r\nServer: vanilla\r\nContent-Type: ') + ws(mut out, ctype) + ws(mut out, '\r\nContent-Length: ') + wi(mut out, i64(body.len)) + ws(mut out, '\r\nConnection: keep-alive\r\n\r\n') + wb(mut out, body) } // write_resp appends a complete HTTP/1.1 response (status line + headers + body) // straight into the connection's persistent write buffer — no intermediate -// strings.Builder, no body→response copy, no per-request heap allocation. This -// is the zero-alloc twin of `ok()`; the latter survives only for the DB paths -// that are allocation-bound anyway. +// strings.Builder, no body→response copy, no per-request heap allocation. fn write_resp(mut out []u8, ctype string, body string) { ws(mut out, 'HTTP/1.1 200 OK\r\nServer: vanilla\r\nContent-Type: ') ws(mut out, ctype) @@ -123,11 +179,12 @@ fn write_resp(mut out []u8, ctype string, body string) { ws(mut out, body) } -// write_ok_xcache writes a complete 200 JSON response carrying an X-Cache: HIT|MISS -// header straight into `out` — the zero-alloc twin of ok_xcache(): no per-call -// strings.Builder and no body→out copy. (write_resp above is the same for the -// no-X-Cache paths, so crud_list/crud_update reuse it.) Byte-identical to ok_xcache(). -fn write_ok_xcache(mut out []u8, ctype string, body string, cache string) { +// emit_xcache writes a complete 200 JSON response carrying an X-Cache: HIT|MISS header +// straight into `out`. Byte-identical to the vanilla-epoll twin's inlined framing (both +// now share this helper). The stateless request_handler has no per-worker scratch, so +// the []u8 body is either the cache slot itself (HIT, emitted under the read-lock) or a +// freshly rendered local buffer (MISS). +fn emit_xcache(mut out []u8, ctype string, body []u8, cache string) { ws(mut out, 'HTTP/1.1 200 OK\r\nServer: vanilla\r\nX-Cache: ') ws(mut out, cache) ws(mut out, '\r\nContent-Type: ') @@ -135,35 +192,123 @@ fn write_ok_xcache(mut out []u8, ctype string, body string, cache string) { ws(mut out, '\r\nContent-Length: ') wi(mut out, i64(body.len)) ws(mut out, '\r\nConnection: keep-alive\r\n\r\n') - ws(mut out, body) + wb(mut out, body) +} + +// emit_int writes a 200 whose body is a single integer, formatting it into a stack +// scratch (no heap alloc). The obvious `write_resp(.., n.str())` heap-allocated an +// int->string on every request — pure GC pressure on the highest-RPS non-DB profiles +// (/baseline11, /upload). The epoll twin uses a per-worker scratch (make_state); the +// io_uring backend is stateless (request_handler only, enghitalo/vanilla#83), so this +// renders into a 20-byte stack buffer instead — same bytes, still zero-alloc. +@[direct_array_access] +fn emit_int(mut out []u8, ctype string, n i64) { + mut tmp := [20]u8{} + mut i := 20 + if n == 0 { + i = 19 + tmp[19] = u8(`0`) + } else { + neg := n < 0 + mut x := u64(n) + if neg { + x = u64(-(n + 1)) + 1 + } + for x > 0 { + i-- + tmp[i] = u8(`0`) + u8(x % 10) + x /= 10 + } + if neg { + i-- + tmp[i] = u8(`-`) + } + } + blen := 20 - i + ws(mut out, 'HTTP/1.1 200 OK\r\nServer: vanilla\r\nContent-Type: ') + ws(mut out, ctype) + ws(mut out, '\r\nContent-Length: ') + wi(mut out, i64(blen)) + ws(mut out, '\r\nConnection: keep-alive\r\n\r\n') + unsafe { out.push_many(&tmp[i], blen) } } +// Precomputed full response for the fixed /pipeline plaintext "ok" (the highest-RPS +// test): one bulk copy on the hot path, no query scan / route slice / build. +const pipeline_resp = 'HTTP/1.1 200 OK\r\nServer: vanilla\r\nContent-Type: text/plain\r\nContent-Length: 2\r\nConnection: keep-alive\r\n\r\nok'.bytes() + +// Raw request prefix for the fixed /pipeline plaintext test. The trailing space is the +// request-line SP, so this matches exactly `GET /pipeline ` (not /pipeline2). +const pipeline_prefix = 'GET /pipeline '.bytes() + +// has_pipeline_prefix is the skip-decode gate for the highest-RPS /pipeline test: match +// the raw request prefix and blit the response WITHOUT parsing. The io_uring backend +// hands the handler one framed request at a time, so the decode adds nothing here. +// Ported from the epoll twin (there the in-handle parse was ~17% of this request). +@[direct_array_access] +fn has_pipeline_prefix(b []u8) bool { + if b.len < pipeline_prefix.len { + return false + } + for i in 0 .. pipeline_prefix.len { + if b[i] != pipeline_prefix[i] { + return false + } + } + return true +} + +const not_found = 'HTTP/1.1 404 Not Found\r\nServer: vanilla\r\nContent-Length: 0\r\nConnection: keep-alive\r\n\r\n'.bytes() + +const created = 'HTTP/1.1 201 Created\r\nServer: vanilla\r\nContent-Length: 0\r\nConnection: keep-alive\r\n\r\n'.bytes() + +const bad_request = 'HTTP/1.1 400 Bad Request\r\nServer: vanilla\r\nContent-Length: 0\r\nConnection: keep-alive\r\n\r\n'.bytes() + +// ── request routing ────────────────────────────────────────────────────────── + fn handle(req_buffer []u8, _fd int, mut out []u8, mut sh Shared) ! { - req := request_parser.decode_http_request(req_buffer)! + // Skip-decode fast path: the fixed /pipeline plaintext (highest-RPS test) blits its + // response before ANY parsing (the request is already framed by the backend). + if has_pipeline_prefix(req_buffer) { + wb(mut out, pipeline_resp) + return + } + // decode_into fills req in place — no `!HttpRequest` Result boxing (~13% of the parse + // path on the epoll twin's callgrind), matching that entry's parser entry point. + mut req := request_parser.HttpRequest{ + buffer: req_buffer + } + if !request_parser.decode_into(mut req) { + wb(mut out, bad_request) + return + } method := unsafe { tos(&req.buffer[req.method.start], req.method.len) } target := unsafe { tos(&req.buffer[req.path.start], req.path.len) } - // Route on the path before '?' WITHOUT allocating: a tos() view into the - // request buffer rather than all_before()'s per-request copy. (Sub-slices like - // route[6..] still copy, but only on the few paths that actually need them.) + // Pipelined hot path: fixed response, blit the constant before the '?'-scan + route + // slice. The profile sends exactly /pipeline (no query), so exact-match. + if target == '/pipeline' { + wb(mut out, pipeline_resp) + return + } + // Route on the path before '?' WITHOUT allocating: a tos() view into the request + // buffer rather than all_before()'s per-request copy. qpos := target.index_u8(`?`) route := if qpos < 0 { target } else { unsafe { tos(target.str, qpos) } } - if route == '/pipeline' { - write_resp(mut out, 'text/plain', 'ok') - } else if route == '/baseline11' { + if route == '/baseline11' { mut sum := qint(req, qk_a) + qint(req, qk_b) if method == 'POST' { sum += body_int(req) } - write_resp(mut out, 'text/plain', sum.str()) + emit_int(mut out, 'text/plain', sum) } else if route == '/upload' { // Answer by the declared Content-Length, not req.body.len: large bodies are - // STREAMED (drained off the socket, head-only passed to the handler) by the - // lib's body-drain, so req.body is empty for them. Falls back to the buffered - // body length when Content-Length is absent (e.g. chunked). Mirrors vanilla-epoll. + // STREAMED (drained off the socket, head-only passed to the handler) by the lib's + // body-drain, so req.body is empty for them. Falls back to the buffered body + // length when Content-Length is absent (e.g. chunked). Mirrors vanilla-epoll. cl := req.content_length() - n := if cl >= 0 { cl } else { req.body.len } - write_resp(mut out, 'text/plain', n.str()) + n := if cl >= 0 { i64(cl) } else { i64(req.body.len) } + emit_int(mut out, 'text/plain', n) } else if route.starts_with('/json/') { count := clamp_count(parse_u_at(route, 6), sh.dataset.len) mut m := qint(req, qk_m) @@ -177,19 +322,16 @@ fn handle(req_buffer []u8, _fd int, mut out []u8, mut sh Shared) ! { sh.write_json_response(mut out, count, m) } } else if route == '/async-db' { - write_resp(mut out, 'application/json', sh.async_db(qint(req, qk_min), qint(req, qk_max), qint(req, - qk_limit))) + sh.write_async_db(mut out, qint(req, qk_min), qint(req, qk_max), qint(req, qk_limit)) } else if route == '/fortunes' { - write_resp(mut out, 'text/html; charset=utf-8', sh.fortunes()) + sh.write_fortunes(mut out) } else if route.starts_with('/static/') { // Canonical static serving via the lib's static_assets module, mounted at // /static/: negotiates the precompressed .br/.gz sibling per Accept-Encoding - // (the arena sends `br;q=1`, so this ships the ~4x smaller .br body instead of - // the raw file) and emits small assets via core.queue_buf borrowed send — the - // worker sends the preloaded, immutable bytes DIRECTLY (no copy through the - // per-connection write buffer). ONE audited path shared with vanilla-epoll, - // replacing the hand-rolled identity-only map that ignored Accept-Encoding. - sh.asv.respond_into(req_buffer, mut out) or { out << not_found } + // (the arena sends `br;q=1`, so this ships the ~4x smaller .br body) and emits + // small assets via core.queue_buf borrowed send. ONE audited path shared with + // vanilla-epoll, replacing the hand-rolled identity-only map. + sh.asv.respond_into(req_buffer, mut out) or { wb(mut out, not_found) } } else if route == '/crud/items' { if method == 'POST' { sh.write_crud_create(mut out, req) @@ -205,11 +347,154 @@ fn handle(req_buffer []u8, _fd int, mut out []u8, mut sh Shared) ! { sh.write_crud_get(mut out, id) } } else { - out << not_found + wb(mut out, not_found) } } -// crud_list returns a paginated, category-filtered page of items. +// ── DB row rendering (byte-level, no reflection) ────────────────────────────── + +// render_item_pg writes one items-row as JSON directly into `body` from a db.pg text +// row — no DbItem struct, no json.encode reflection (the epoll twin renders the same +// bytes from a binary pg_async.Row via render_item). Columns, in query order: id, name, +// category, price, quantity, active, tags(jsonb text), rating_score, rating_count. +// `tags` is the JSONB column as Postgres text (already valid JSON), emitted RAW — no +// decode/re-encode round-trip. +@[direct_array_access] +fn render_item_pg(mut body []u8, row pg.Row) { + ws(mut body, '{"id":') + wi(mut body, nn(row.vals[0]).i64()) + ws(mut body, ',"name":"') + ws_json_str(mut body, sbytes(nn(row.vals[1]))) + ws(mut body, '","category":"') + ws_json_str(mut body, sbytes(nn(row.vals[2]))) + ws(mut body, '","price":') + wi(mut body, nn(row.vals[3]).i64()) + ws(mut body, ',"quantity":') + wi(mut body, nn(row.vals[4]).i64()) + ws(mut body, ',"active":') + ws(mut body, if nn(row.vals[5]) == 't' { 'true' } else { 'false' }) + ws(mut body, ',"tags":') + wb(mut body, sbytes(nn3(row.vals[6], '[]'))) + ws(mut body, ',"rating":{"score":') + wi(mut body, nn(row.vals[7]).i64()) + ws(mut body, ',"count":') + wi(mut body, nn(row.vals[8]).i64()) + ws(mut body, '}}') +} + +// sbytes borrows a string's bytes as a non-owning []u8 view (no clone). Valid only while +// the owning string is alive — used to feed row column strings into the byte renderers +// during a synchronous render, before `rows` is dropped. +@[inline] +fn sbytes(s string) []u8 { + return unsafe { s.str.vbytes(s.len) } +} + +// ── /async-db ──────────────────────────────────────────────────────────────── + +const async_db_sql = 'SELECT id, name, category, price, quantity, active, tags, rating_score, rating_count FROM items WHERE price BETWEEN \$1 AND \$2 LIMIT \$3' + +// write_async_db runs the range query and renders the rows straight into `out`. +// NOTE: db.pg is BLOCKING; on the single-threaded io_uring ring worker this stalls the +// ring for the query's duration (enghitalo/vanilla#83) — the row rendering below is +// zero-reflection regardless, matching the epoll twin's render_async_db. +fn (mut sh Shared) write_async_db(mut out []u8, min i64, max i64, limit i64) { + mut lim := limit + if lim < 1 { + lim = 1 + } + if lim > 50 { + lim = 50 + } + mut db := sh.db + rows := db.exec_param_many(async_db_sql, [min.str(), max.str(), lim.str()]) or { + write_resp(mut out, 'application/json', '{"items":[],"count":0}') + return + } + // The stateless request_handler has no per-worker scratch (make_state is epoll-only), + // so render into a local buffer, then emit. One alloc per DB response — this path is + // DB-bound (#83) so the alloc volume is bounded by query throughput, not RPS. + mut scratch := []u8{cap: 4096} + ws(mut scratch, '{"items":[') + for i, row in rows { + if i > 0 { + ws(mut scratch, ',') + } + render_item_pg(mut scratch, row) + } + ws(mut scratch, '],"count":') + wi(mut scratch, i64(rows.len)) + ws(mut scratch, '}') + emit(mut out, 'application/json', scratch) +} + +// ── /fortunes ──────────────────────────────────────────────────────────────── + +const synthetic_fortune = 'Additional fortune added at request time.'.bytes() + +fn (mut sh Shared) write_fortunes(mut out []u8) { + mut db := sh.db + rows := db.exec_param_many('SELECT id, message FROM fortune', []) or { + write_resp(mut out, 'text/html; charset=utf-8', + '
') + return + } + mut fortunes := []Fortune{cap: rows.len + 1} + for row in rows { + // BORROW the message bytes from the row (stable while `rows` is alive) — no clone. + fortunes << Fortune{ + id: nn(row.vals[0]).int() + message: sbytes(nn(row.vals[1])) + } + } + fortunes << Fortune{ + id: 0 + message: synthetic_fortune + } + fortunes.sort_with_compare(cmp_fortune_message) + mut scratch := []u8{cap: 32768} + ws(mut scratch, + 'Fortunes') + for f in fortunes { + ws(mut scratch, '') + } + ws(mut scratch, '
idmessage
') + wi(mut scratch, i64(f.id)) + ws(mut scratch, '') + escape_html_into(mut scratch, f.message) // escape directly into scratch (no Builder) + ws(mut scratch, '
') + emit(mut out, 'text/html; charset=utf-8', scratch) +} + +// cmp_fortune_message orders fortunes by message, lexicographically by bytes — V has no +// `<` on []u8, so the sort needs an explicit comparator (returns <0 / 0 / >0). +fn cmp_fortune_message(a &Fortune, b &Fortune) int { + mut i := 0 + for i < a.message.len && i < b.message.len { + if a.message[i] != b.message[i] { + return int(a.message[i]) - int(b.message[i]) + } + i++ + } + return a.message.len - b.message.len +} + +// ── /crud ────────────────────────────────────────────────────────────────── +// crud stays on the BLOCKING db.pg client (the io_uring backend has no async runtime, +// enghitalo/vanilla#83). The renders/parsers below are the zero-reflection ports from +// the epoll twin (enghitalo/vanilla#85); db.pg text params still need C-string values, +// so the int params are still `.str()`'d and string params `.bytestr()`'d/cloned. + +// crud_list uses a single window-count query (count(*) OVER()) so the page and the total +// come back together — ONE query instead of the former page + separate count(*). +const crud_list_sql = 'SELECT id, name, category, price, quantity, active, tags, rating_score, rating_count, count(*) OVER() FROM items WHERE category = \$1 ORDER BY id LIMIT \$2 OFFSET \$3' + +const crud_get_sql = 'SELECT id, name, category, price, quantity, active, tags, rating_score, rating_count FROM items WHERE id = \$1' + +const crud_insert_sql = "INSERT INTO items (id, name, category, price, quantity, active, tags, rating_score, rating_count) VALUES (\$1, \$2, \$3, \$4, \$5, true, '[]', 0, 0) ON CONFLICT (id) DO UPDATE SET name = EXCLUDED.name, category = EXCLUDED.category, price = EXCLUDED.price, quantity = EXCLUDED.quantity" + +const crud_update_sql = 'UPDATE items SET name = \$2, category = \$3, price = \$4, quantity = \$5 WHERE id = \$1' + fn (mut sh Shared) write_crud_list(mut out []u8, category string, page i64, limit i64) { mut p := page if p < 1 { @@ -223,57 +508,46 @@ fn (mut sh Shared) write_crud_list(mut out []u8, category string, page i64, limi lim = 100 } offset := (p - 1) * lim - // db is pool-backed (Go-style db.pg): each exec_param_many transparently acquires - // a pooled conn for the call and releases it — no manual acquire/release. mut db := sh.db - rows := db.exec_param_many('SELECT id, name, category, price, quantity, active, tags, rating_score, rating_count FROM items WHERE category = \$1 ORDER BY id LIMIT \$2 OFFSET \$3', [ - category, - lim.str(), - offset.str(), - ]) or { + rows := db.exec_param_many(crud_list_sql, [category, lim.str(), offset.str()]) or { write_resp(mut out, 'application/json', '{"items":[],"total":0,"page":1}') return } - trows := db.exec_param_many('SELECT count(*) FROM items WHERE category = \$1', [ - category, - ]) or { [] } - total := if trows.len > 0 { nn(trows[0].vals[0]).int() } else { 0 } - mut items := []DbItem{cap: rows.len} - for row in rows { - items << row_to_item(row) - } - // Build the JSON body once (json.encode of the items array is the correctness - // reference), then write the full response straight into `out` via write_resp — - // no second strings.Builder for the response head and no body→out copy (ok() did - // both). Byte-identical to the previous ok('application/json', sb.str()). - mut sb := strings.new_builder(items.len * 200 + 64) - sb.write_string('{"items":') - sb.write_string(json.encode(items)) - sb.write_string(',"total":') - sb.write_decimal(i64(total)) - sb.write_string(',"page":') - sb.write_decimal(p) - sb.write_u8(`}`) - write_resp(mut out, 'application/json', sb.str()) + mut scratch := []u8{cap: 4096} + ws(mut scratch, '{"items":[') + mut total := i64(0) + for i, row in rows { + if i > 0 { + ws(mut scratch, ',') + } + render_item_pg(mut scratch, row) + total = nn(row.vals[9]).i64() // count(*) OVER() — same in every row + } + ws(mut scratch, '],"total":') + wi(mut scratch, total) + ws(mut scratch, ',"page":') + wi(mut scratch, p) + ws(mut scratch, '}') + emit(mut out, 'application/json', scratch) } -// write_crud_get writes a single item straight into `out`, using a cache-aside -// in-memory cache and the X-Cache header (MISS on first read, HIT after). The -// hot path is the HIT: it now writes the cached body + headers directly into the -// connection buffer with zero per-request allocation (ok_xcache built a throwaway -// strings.Builder per hit, then the caller copied it into out). +// write_crud_get serves a single item, cache-aside against the id-indexed slab with the +// X-Cache header (MISS on first read, HIT after). The HIT path is emitted directly under +// the read-lock (a short memcpy into `out`; concurrent readers share the RwMutex, only a +// MISS-refill write-lock blocks) — zero per-request allocation. fn (mut sh Shared) write_crud_get(mut out []u8, id int) { - sh.cache_mu.@rlock() - cached := sh.cache[id] or { '' } - sh.cache_mu.runlock() - if cached.len > 0 { - write_ok_xcache(mut out, 'application/json', cached, 'HIT') - return + if id >= 1 && id < crud_cache_slots { + sh.crud_mu.@rlock() + s := sh.crud[id] + if s.valid && s.buf.len > 0 { + emit_xcache(mut out, 'application/json', s.buf, 'HIT') + sh.crud_mu.runlock() + return + } + sh.crud_mu.runlock() } mut db := sh.db - rows := db.exec_param_many('SELECT id, name, category, price, quantity, active, tags, rating_score, rating_count FROM items WHERE id = \$1', [ - id.str(), - ]) or { + rows := db.exec_param_many(crud_get_sql, [id.str()]) or { wb(mut out, not_found) return } @@ -281,87 +555,112 @@ fn (mut sh Shared) write_crud_get(mut out []u8, id int) { wb(mut out, not_found) return } - body := json.encode(row_to_item(rows[0])) - sh.cache_mu.@lock() - sh.cache[id] = body - sh.cache_mu.unlock() - write_ok_xcache(mut out, 'application/json', body, 'MISS') + // Render the item once into a local buffer, publish it into the slab by refilling the + // slot's buffer IN PLACE under the write-lock (allocated once at a fixed cap, never + // reallocated/orphaned), then emit MISS from the same bytes. + mut scratch := []u8{cap: crud_cache_bufcap} + render_item_pg(mut scratch, rows[0]) + if id >= 1 && id < crud_cache_slots { + sh.crud_mu.@lock() + mut slot := &sh.crud[id] + if slot.buf.cap == 0 { + slot.buf = []u8{cap: crud_cache_bufcap} + } + unsafe { slot.buf.len = 0 } + slot.buf << scratch + slot.valid = true + sh.crud_mu.unlock() + } + emit_xcache(mut out, 'application/json', scratch, 'MISS') } -// crud_create inserts a new item from the JSON body and returns 201. fn (mut sh Shared) write_crud_create(mut out []u8, req request_parser.HttpRequest) { + body := unsafe { req.buffer[req.body.start..req.body.start + req.body.len] } + if c := parse_crud_body_fast(body, true) { + mut db := sh.db + db.exec_param_many(crud_insert_sql, [c.id.str(), c.name.bytestr(), c.category.bytestr(), + c.price.str(), c.quantity.str()]) or { + wb(mut out, bad_request) + return + } + wb(mut out, created) + return + } + // Fallback for escaped/awkward bodies the fast parser rejects: full json.decode. raw := unsafe { tos(&req.buffer[req.body.start], req.body.len) } c := json.decode(CrudCreate, raw) or { wb(mut out, bad_request) return } mut db := sh.db - db.exec_param_many("INSERT INTO items (id, name, category, price, quantity, active, tags, rating_score, rating_count) VALUES (\$1, \$2, \$3, \$4, \$5, true, '[]', 0, 0) ON CONFLICT (id) DO UPDATE SET name = EXCLUDED.name, category = EXCLUDED.category, price = EXCLUDED.price, quantity = EXCLUDED.quantity", [ - c.id.str(), - c.name, - c.category, - c.price.str(), - c.quantity.str(), - ]) or { + db.exec_param_many(crud_insert_sql, [c.id.str(), c.name, c.category, c.price.str(), + c.quantity.str()]) or { wb(mut out, bad_request) return } wb(mut out, created) } -// write_crud_update updates an item, invalidates its cache entry, and writes the -// response straight into `out`. fn (mut sh Shared) write_crud_update(mut out []u8, id int, req request_parser.HttpRequest) { + body := unsafe { req.buffer[req.body.start..req.body.start + req.body.len] } + if c := parse_crud_body_fast(body, false) { + mut db := sh.db + db.exec_param_many(crud_update_sql, [id.str(), c.name.bytestr(), c.category.bytestr(), + c.price.str(), c.quantity.str()]) or { + wb(mut out, bad_request) + return + } + sh.invalidate_crud(id) + write_resp(mut out, 'application/json', '{"status":"ok"}') + return + } raw := unsafe { tos(&req.buffer[req.body.start], req.body.len) } c := json.decode(CrudCreate, raw) or { wb(mut out, bad_request) return } mut db := sh.db - db.exec_param_many('UPDATE items SET name = \$2, category = \$3, price = \$4, quantity = \$5 WHERE id = \$1', [ - id.str(), - c.name, - c.category, - c.price.str(), - c.quantity.str(), - ]) or { + db.exec_param_many(crud_update_sql, [id.str(), c.name, c.category, c.price.str(), + c.quantity.str()]) or { wb(mut out, bad_request) return } - sh.cache_mu.@lock() - sh.cache.delete(id) - sh.cache_mu.unlock() + sh.invalidate_crud(id) write_resp(mut out, 'application/json', '{"status":"ok"}') } -fn row_to_item(row pg.Row) DbItem { - return DbItem{ - id: nn(row.vals[0]).int() - name: nn(row.vals[1]) - category: nn(row.vals[2]) - price: nn(row.vals[3]).int() - quantity: nn(row.vals[4]).int() - active: nn(row.vals[5]) == 't' - tags: json.decode([]string, nn3(row.vals[6], '[]')) or { [] } - rating: Rating{ - score: nn(row.vals[7]).i64() - count: nn(row.vals[8]).i64() - } +// invalidate_crud flips the slot's `valid` to false and KEEPS the buffer for the next +// MISS to reuse (cache-aside, invalidate-on-write). +@[inline] +fn (mut sh Shared) invalidate_crud(id int) { + if id >= 1 && id < crud_cache_slots { + sh.crud_mu.@lock() + sh.crud[id].valid = false + sh.crud_mu.unlock() } } -const not_found = 'HTTP/1.1 404 Not Found\r\nServer: vanilla\r\nContent-Length: 0\r\nConnection: keep-alive\r\n\r\n'.bytes() +// nn unwraps a nullable column value to a plain string ('' for NULL). +@[inline] +fn nn(v ?string) string { + return v or { '' } +} -const created = 'HTTP/1.1 201 Created\r\nServer: vanilla\r\nContent-Length: 0\r\nConnection: keep-alive\r\n\r\n'.bytes() +// nn3 unwraps a nullable column value with a custom default. +@[inline] +fn nn3(v ?string, d string) string { + return v or { d } +} -const bad_request = 'HTTP/1.1 400 Bad Request\r\nServer: vanilla\r\nContent-Length: 0\r\nConnection: keep-alive\r\n\r\n'.bytes() +// ── /json (non-DB) ───────────────────────────────────────────────────────── -// json_response builds the FULL HTTP response (headers + body) for /json in a -// single allocation — no per-request reflection and no body→response copy. -// Only `total` (price*quantity*m) varies per request; the rest is a precomputed -// prefix. Content-Length is computed up front so everything lands in one buffer. -fn (sh &Shared) write_json_response(mut out []u8, count int, m i64) { - // 21 = len('{"items":[') + len('],"count":') + '}', plus the count's own digits +// write_json_into is the transport-agnostic /json serializer: it only APPENDS response +// bytes to `out`, so the plaintext path and the json-tls path share it verbatim. `sh` is +// read-only and nothing per-request is heap-allocated. Content-Length is precomputed +// from the SAME values the body emits, so the framed length can never desync from the +// body (no response-splitting surface). +fn write_json_into(sh &Shared, mut out []u8, count int, m i64) { + // 21 = len('{"items":[') + len('],"count":') + '}'; plus the count's own digits mut clen := 21 + digits(i64(count)) if count > 0 { clen += count - 1 // item separators @@ -377,8 +676,7 @@ fn (sh &Shared) write_json_response(mut out []u8, count int, m i64) { for i in 0 .. count { ws(mut out, sh.prefixes[i]) wi(mut out, sh.dataset[i].price * sh.dataset[i].quantity * m) - // fuse each object's closing `}` with the item separator `,` into one - // bulk write — single-element `<<` is the slow path on post-0.5.1 V. + // fuse each object's closing `}` with the item separator `,` into one bulk write. ws(mut out, if i < count - 1 { '},' } else { '}' }) } ws(mut out, '],"count":') @@ -386,25 +684,28 @@ fn (sh &Shared) write_json_response(mut out []u8, count int, m i64) { ws(mut out, '}') } -// write_json_gzip is the json-comp path. The gzipped response for a given -// (count, m) is fully deterministic and gzip CPU dominates the cost, so we cache -// the COMPLETE response bytes and just append the cached copy on a hit — no -// rebuild, no recompress. Compressing once instead of per-request is the whole -// win for json-comp (the profile is compression-bound, not allocation-bound). -// -// Kept LAZY, not precomputed at boot: precomputing the full (count x m) grid would -// compress ~800 keys vs the ~8 the profile sends. On the `-gc none` epoll sibling -// that leaked ~135 MiB via gzip.compress scratch (vlang/v#27606); this io_uring -// binary is default-GC so it would not leak, but lazy keeps both entries identical -// and the shared lock is not a bottleneck (the json-comp@16384 collapse is thread -// oversubscription from the co-hosted json-tls listener, enghitalo/vanilla#89). +fn (sh &Shared) write_json_response(mut out []u8, count int, m i64) { + write_json_into(sh, mut out, count, m) +} + +// write_json_gzip is the json-comp path: a LAZY, process-shared, RwMutex-guarded cache +// of complete gzipped responses keyed by (count<<32)|m. Compress once on the first miss, +// then append the cached bytes on every hit (the profile is compression-bound). Kept +// lazy for parity with the epoll twin (there it also avoids a boot-time gzip.compress +// leak); the shared rlock is not a bottleneck — the json-comp@16384 collapse is worker +// oversubscription from the co-hosted json-tls listener (enghitalo/vanilla#89). fn (mut sh Shared) write_json_gzip(mut out []u8, count int, m i64) { key := (u64(u32(count)) << 32) | u64(u32(m)) + mut cached := []u8{} sh.gz_mu.@rlock() - cached := sh.gz_cache[key] or { []u8{} } + if c := sh.gz_cache[key] { + unsafe { + cached = c + } + } sh.gz_mu.runlock() if cached.len > 0 { - out << cached + wb(mut out, cached) return } body := sh.json_body(count, m) @@ -424,7 +725,7 @@ fn (mut sh Shared) write_json_gzip(mut out []u8, count int, m i64) { sh.gz_cache[key] = resp } sh.gz_mu.unlock() - out << resp + wb(mut out, resp) } // json_body builds just the /json body string (used for the gzip path). @@ -445,40 +746,12 @@ fn (sh &Shared) json_body(count int, m i64) string { return sb.str() } -// fortunes queries the fortune table, appends the runtime row, sorts by message -// and renders the HTML table (escaped). 199 seeded + 1 runtime + header = 201 . -fn (mut sh Shared) fortunes() string { - mut fortunes := []Fortune{} - mut db := sh.db - rows := db.exec_param_many('SELECT id, message FROM fortune', []) or { [] } - for row in rows { - fortunes << Fortune{ - id: nn(row.vals[0]).int() - message: nn(row.vals[1]) - } - } - fortunes << Fortune{ - id: 0 - message: 'Additional fortune added at request time.' - } - fortunes.sort(a.message < b.message) - mut sb := strings.new_builder(32768) - sb.write_string('Fortunes') - for f in fortunes { - sb.write_string('') - } - sb.write_string('
idmessage
') - sb.write_decimal(i64(f.id)) - sb.write_string('') - sb.write_string(escape_html(f.message)) - sb.write_string('
') - return sb.str() -} +// ── helpers ────────────────────────────────────────────────────────────────── -fn escape_html(s string) string { - // Fast path: most fortune messages contain no special characters, so return - // the original with no allocation instead of replace_each's 5 full-string - // passes (each scanning + reallocating). Only escape when there's something to. +// escape_html_into HTML-escapes s directly into out — no intermediate string/Builder. +// Fast path: when nothing needs escaping, one bulk copy (the common case). +@[direct_array_access] +fn escape_html_into(mut out []u8, s []u8) { mut needs := false for c in s { if c == `&` || c == `<` || c == `>` || c == `"` || c == `'` { @@ -487,20 +760,19 @@ fn escape_html(s string) string { } } if !needs { - return s + wb(mut out, s) + return } - mut b := strings.new_builder(s.len + 16) for c in s { match c { - `&` { b.write_string('&') } - `<` { b.write_string('<') } - `>` { b.write_string('>') } - `"` { b.write_string('"') } - `'` { b.write_string(''') } - else { b.write_u8(c) } + `&` { ws(mut out, '&') } + `<` { ws(mut out, '<') } + `>` { ws(mut out, '>') } + `"` { ws(mut out, '"') } + `'` { ws(mut out, ''') } + else { unsafe { out.push_many(&c, 1) } } } } - return b.str() } // digits returns the number of decimal digits in a non-negative integer. @@ -517,57 +789,6 @@ fn digits(n i64) int { return d } -fn (mut sh Shared) async_db(min i64, max i64, limit i64) string { - mut lim := limit - if lim < 1 { - lim = 1 - } - if lim > 50 { - lim = 50 - } - // db is pool-backed (Go-style db.pg): exec_param_many transparently acquires a - // pooled conn per call. (The old per-conn lazily-prepared statement isn't a clean - // fit for the transparent pool — prepared statements are session-scoped, and the - // pool hands out a transient conn per call; re-add via db.conn() pinning if the - // async-db per-call re-parse cost ever matters.) - mut db := sh.db - adb_params := [min.str(), max.str(), lim.str()] - rows := db.exec_param_many('SELECT id, name, category, price, quantity, active, tags, rating_score, rating_count FROM items WHERE price BETWEEN \$1 AND \$2 LIMIT \$3', - adb_params) or { return '{"items":[],"count":0}' } - mut items := []DbItem{cap: rows.len} - for row in rows { - items << DbItem{ - id: nn(row.vals[0]).int() - name: nn(row.vals[1]) - category: nn(row.vals[2]) - price: nn(row.vals[3]).int() - quantity: nn(row.vals[4]).int() - active: nn(row.vals[5]) == 't' - tags: json.decode([]string, nn3(row.vals[6], '[]')) or { [] } - rating: Rating{ - score: nn(row.vals[7]).i64() - count: nn(row.vals[8]).i64() - } - } - } - return json.encode(DbResp{ items: items, count: items.len }) -} - -// nn unwraps a nullable column value to a plain string ('' for NULL). -@[inline] -fn nn(v ?string) string { - return v or { '' } -} - -// nn3 unwraps a nullable column value with a custom default. -@[inline] -fn nn3(v ?string, d string) string { - return v or { d } -} - -// Precomputed query-parameter key bytes, built once at init. The hot path then -// never allocates a []u8 per lookup — `key.bytes()` did, one alloc per request -// per parameter (baseline parses a+b, async-db min+max+limit, etc.). const qk_a = 'a'.bytes() const qk_b = 'b'.bytes() const qk_m = 'm'.bytes() @@ -577,24 +798,43 @@ const qk_limit = 'limit'.bytes() const qk_page = 'page'.bytes() const qk_category = 'category'.bytes() -// qint reads a query parameter as an integer (0 if absent / non-numeric). The -// key is a precomputed []u8 (qk_*) so there is no per-call allocation; the value -// is read as a zero-copy tos() view and parsed in place. +// qint parses an integer query parameter directly from the request buffer — no string +// allocation (the old tos()+.i64() path materialized a throwaway string per call). +@[direct_array_access] fn qint(req request_parser.HttpRequest, key []u8) i64 { s := req.get_query_slice(key) or { return 0 } - return unsafe { tos(&req.buffer[s.start], s.len) }.i64() + return parse_i64_slice(req.buffer, s.start, s.len) } -// qstr reads a query parameter as a string ('' if absent). Clones so the value -// outlives the request buffer (it is passed to the DB driver). +// qstr reads a query parameter as a string ('' if absent). CLONES: db.pg text params are +// passed to libpq as C strings, so the value must be NUL-terminated and outlive the +// request buffer — a bare tos() view would read past the parameter. (The epoll twin's +// qstr_slice can borrow because pg_async copies bind params synchronously.) fn qstr(req request_parser.HttpRequest, key []u8) string { s := req.get_query_slice(key) or { return '' } return unsafe { tos(&req.buffer[s.start], s.len) }.clone() } -// parse_u_at parses a non-negative integer from `s` starting at byte `start`, -// stopping at the first non-digit — no substring allocation (route[6..].i64() -// copies). Used to read the count / id embedded in the request path. +// parse_i64_slice parses a decimal i64 from buf[start..start+length] in place (leading +// '-' allowed; stops at the first non-digit), allocating nothing. +@[direct_array_access] +fn parse_i64_slice(buf []u8, start int, length int) i64 { + mut n := i64(0) + mut neg := false + for i in 0 .. length { + c := buf[start + i] + if i == 0 && c == `-` { + neg = true + continue + } + if c < `0` || c > `9` { + break + } + n = n * 10 + i64(c - `0`) + } + return if neg { -n } else { n } +} + @[direct_array_access] fn parse_u_at(s string, start int) i64 { mut n := i64(0) @@ -618,63 +858,207 @@ fn clamp_count(n i64, max int) int { return int(n) } -// body_int parses the request body as an integer, decoding chunked transfer -// encoding when present. +// body_int parses the request body as an integer, decoding chunked transfer encoding +// when present. The common path (Content-Length / direct body) is zero-alloc; the rare +// chunked path reassembles into a small local buffer (baseline11 chunked bodies are tiny +// integers) via the shared dechunk_into, then parses in place. fn body_int(req request_parser.HttpRequest) i64 { if req.body.len == 0 { return 0 } - raw := unsafe { tos(&req.buffer[req.body.start], req.body.len) } if te := req.get_header_value_slice('Transfer-Encoding') { val := unsafe { tos(&req.buffer[te.start], te.len) } if val.contains('chunked') { - return dechunk(raw).i64() + mut buf := []u8{cap: 512} + dechunk_into(mut buf, req.buffer, req.body.start, req.body.len) + return parse_i64_slice(buf, 0, buf.len) } } - return raw.i64() + return parse_i64_slice(req.buffer, req.body.start, req.body.len) } -// dechunk decodes an HTTP/1.1 chunked body into its payload. -fn dechunk(s string) string { - mut out := strings.new_builder(s.len) - mut i := 0 - for i < s.len { - nl := s.index_after('\r\n', i) or { break } - size := strconv_hex(s[i..nl]) +// dechunk_into appends the dechunked body bytes from the chunked-encoded region +// buf[start..start+length] into `out`. Read each hex chunk-size line terminated by CRLF, +// copy `size` data bytes, stop at the 0-size chunk or any malformation. +@[direct_array_access] +fn dechunk_into(mut out []u8, buf []u8, start int, length int) { + end := start + length + mut i := start + for i < end { + // find the CRLF terminating the chunk-size line + mut nl := -1 + for j := i; j + 1 < end; j++ { + if buf[j] == `\r` && buf[j + 1] == `\n` { + nl = j + break + } + } + if nl < 0 { + break + } + size := parse_hex_slice(buf, i, nl - i) if size <= 0 { break } data_start := nl + 2 - out.write_string(s[data_start..data_start + size]) - i = data_start + size + 2 // skip data + trailing CRLF + // Overflow-safe bound: `data_start + size` in i32 would WRAP negative for an + // attacker-chosen size near 0x7fffffff, slipping past a naive check and feeding a + // ~2 GiB out-of-bounds read into push_many. `end - data_start` is a small + // non-negative int (data_start <= end), so comparing this way never overflows. + if size > end - data_start { + break + } + unsafe { out.push_many(&buf[data_start], size) } + i = data_start + size + 2 // past the data + its trailing CRLF } - return out.str() } -fn strconv_hex(s string) int { - mut n := 0 - for c in s.trim_space() { +// parse_hex_slice reads a hex integer from buf[start..start+length], stopping at the +// first non-hex byte (a chunk-extension `;` or the CRLF). No allocation. +@[direct_array_access] +fn parse_hex_slice(buf []u8, start int, length int) int { + mut n := i64(0) + for k in start .. start + length { + c := buf[k] d := if c >= `0` && c <= `9` { - int(c - `0`) + i64(c - `0`) } else if c >= `a` && c <= `f` { - int(c - `a` + 10) + i64(c - `a` + 10) } else if c >= `A` && c <= `F` { - int(c - `A` + 10) + i64(c - `A` + 10) } else { break } n = n * 16 + d + if n > 0x7fff_ffff { + return 0x7fff_ffff // saturate: the caller's size guard then rejects it + } } - return n + return int(n) } -// accepts_gzip reports whether the request advertises gzip in Accept-Encoding. fn accepts_gzip(req request_parser.HttpRequest) bool { ae := req.get_header_value_slice('Accept-Encoding') or { return false } return unsafe { tos(&req.buffer[ae.start], ae.len) }.contains('gzip') } -// content_type maps a file extension to a MIME type for the static handler. +struct CrudFastBody { + id i64 + name []u8 + category []u8 + price i64 + quantity i64 +} + +@[inline] +fn is_json_ws(c u8) bool { + return c == ` ` || c == `\n` || c == `\r` || c == `\t` +} + +@[direct_array_access] +fn has_key_at(buf []u8, at int, key string) bool { + if at + key.len > buf.len { + return false + } + for i in 0 .. key.len { + if buf[at + i] != key[i] { + return false + } + } + return true +} + +@[direct_array_access] +fn json_value_start(buf []u8, key string) ?int { + for i := 0; i < buf.len; i++ { + if !has_key_at(buf, i, key) { + continue + } + mut j := i + key.len + for j < buf.len && is_json_ws(buf[j]) { + j++ + } + if j >= buf.len || buf[j] != `:` { + continue + } + j++ + for j < buf.len && is_json_ws(buf[j]) { + j++ + } + if j < buf.len { + return j + } + return none + } + return none +} + +@[direct_array_access] +fn json_string_field_borrowed(buf []u8, key string) ?[]u8 { + start := json_value_start(buf, key) or { return none } + if start >= buf.len || buf[start] != `"` { + return none + } + mut i := start + 1 + for i < buf.len { + c := buf[i] + if c == `\\` { + // Keep the fast path strict and allocation-free; escaped strings fall back to json.decode. + return none + } + if c == `"` { + return buf[start + 1..i] + } + i++ + } + return none +} + +@[direct_array_access] +fn json_i64_field(buf []u8, key string) ?i64 { + mut i := json_value_start(buf, key) or { return none } + if i >= buf.len { + return none + } + mut neg := false + if buf[i] == `-` { + neg = true + i++ + } + if i >= buf.len || buf[i] < `0` || buf[i] > `9` { + return none + } + mut n := i64(0) + for i < buf.len { + c := buf[i] + if c < `0` || c > `9` { + break + } + n = n * 10 + i64(c - `0`) + i++ + } + return if neg { -n } else { n } +} + +// parse_crud_body_fast reads the crud JSON body with borrowed slices and no reflection +// (ported from the epoll twin, enghitalo/vanilla#85). Returns none for escaped/awkward +// bodies so the caller can fall back to json.decode. +@[direct_array_access] +fn parse_crud_body_fast(body []u8, need_id bool) ?CrudFastBody { + name := json_string_field_borrowed(body, '"name"') or { return none } + category := json_string_field_borrowed(body, '"category"') or { return none } + price := json_i64_field(body, '"price"') or { return none } + quantity := json_i64_field(body, '"quantity"') or { return none } + id := if need_id { json_i64_field(body, '"id"') or { return none } } else { i64(0) } + return CrudFastBody{ + id: id + name: name + category: category + price: price + quantity: quantity + } +} + // parse_db_url turns postgres://user:pass@host:port/dbname into a pg.Config. fn parse_db_url(u string) pg.Config { mut s := u @@ -697,12 +1081,12 @@ fn parse_db_url(u string) pg.Config { } } -// load_tls_config builds the json-tls server's TLS config. It reads the cert/key -// the HttpArena harness bind-mounts at /certs (overridable via TLS_CERT/TLS_KEY). -// If NO cert is mounted (local dev), it falls back to a fresh self-signed cert — -// the benchmark/validate clients use `curl -k` / wrk, which never verify it. If a -// cert IS present but the key is missing/unreadable, it FAILS LOUDLY rather than -// silently self-signing. TLS 1.3 + ALPN http/1.1 are fixed by the tls shim. +// load_tls_config builds the json-tls server's TLS config. It reads the cert/key the +// HttpArena harness bind-mounts at /certs (overridable via TLS_CERT/TLS_KEY). If NO cert +// is mounted (local dev), it falls back to a fresh self-signed cert — the +// benchmark/validate clients use `curl -k` / wrk, which never verify it. If a cert IS +// present but the key is missing/unreadable, it FAILS LOUDLY rather than silently +// self-signing. TLS 1.3 + ALPN http/1.1 are fixed by the tls shim. fn load_tls_config() &tls.Config { cert_path := os.getenv_opt('TLS_CERT') or { '/certs/server.crt' } key_path := os.getenv_opt('TLS_KEY') or { '/certs/server.key' } @@ -732,17 +1116,16 @@ fn main() { // max_idle_conns MUST equal max_open_conns: db.pg defaults idle to 2, so any conn // released beyond the 2nd is physically closed (pool.v) and the next acquire pays a // full PG connect handshake. Under the arena's concurrent DB load that churns - // connections on every request (async-db/crud/fortunes were down 60-90%). Keeping - // idle == open makes it a fixed warm pool, matching the old ConnectionPool. + // connections on every request. Keeping idle == open makes it a fixed warm pool. mut db := pg.connect(parse_db_url(url), pg.PoolConfig{ max_open_conns: size, max_idle_conns: size })! dataset_path := os.getenv_opt('DATASET_PATH') or { '/data/dataset.json' } dataset_raw := os.read_file(dataset_path) or { '[]' } dataset := json.decode([]DatasetItem, dataset_raw) or { []DatasetItem{} } - // Precompute each item's JSON prefix once: `{…,"rating":{…},"total":` - // (drop the closing brace, append the total key). Only the total value is - // request-dependent, so the hot path never serializes a struct. + // Precompute each item's JSON prefix once: `{…,"rating":{…},"total":` (drop the + // closing brace, append the total key). Only the total value is request-dependent, + // so the hot path never serializes a struct. mut prefixes := []string{cap: dataset.len} for it in dataset { enc := json.encode(it) @@ -750,15 +1133,17 @@ fn main() { } static_dir := os.getenv_opt('STATIC_DIR') or { '/data/static' } - // Canonical static server: loads every asset PLUS its .br/.gz siblings once, - // mounts them at /static/, and negotiates Accept-Encoding per request (serving - // the precompressed body when accepted, emitted via core.queue_buf borrowed - // send). Replaces the former identity-only map that ignored Accept-Encoding and - // always shipped the raw file. spa_fallback off: the arena set has no SPA. + // Canonical static server: loads every asset PLUS its .br/.gz siblings once, mounts + // them at /static/, and negotiates Accept-Encoding per request. sendfile_min_bytes + // matches the epoll twin: serve any representation >= 16 KiB from disk with sendfile + // instead of preloading + copying it through the per-connection write buffer (the + // large .br siblings otherwise balloon per-connection RSS at high conn counts). The + // tiny siblings stay preloaded (cheap memcpy). asv := static_assets.new(static_assets.Config{ - root: static_dir - url_prefix: '/static/' - spa_fallback: '' + root: static_dir + url_prefix: '/static/' + spa_fallback: '' + sendfile_min_bytes: 16 * 1024 }) or { panic('vanilla-io_uring: static_assets init failed: ${err}') } mut sh := Shared{ @@ -766,18 +1151,17 @@ fn main() { dataset: dataset prefixes: prefixes asv: asv - cache: map[int]string{} - cache_mu: sync.new_rwmutex() + crud: []CrudSlot{len: crud_cache_slots} + crud_mu: sync.new_rwmutex() gz_cache: map[u64][]u8{} gz_mu: sync.new_rwmutex() } // ── json-tls profile: /json over HTTPS on :8081 via the epoll + kTLS backend ── - // The lib's io_uring backend has no TLS, so the json-tls listener runs on the - // epoll backend (TLS 1.3 via Mbed TLS; after the handshake the kernel does record - // AES-128-GCM via kTLS where the `tls` module is present, else userspace fallback). - // It serves ONLY /json (404 elsewhere) — minimal TLS surface — reusing the same - // allocation-free write_json_response (read-only: dataset + prefixes). A STATELESS + // The lib's io_uring backend has no TLS, so the json-tls listener runs on the epoll + // backend (TLS 1.3 via Mbed TLS; kTLS record offload where the kernel `tls` module is + // present). It serves ONLY /json (404 elsewhere) — minimal TLS surface — reusing the + // same allocation-free write_json_into (read-only: dataset + prefixes). A STATELESS // request_handler captures `sh`; it never touches the DB/caches, so it runs safely // alongside the io_uring workers. The io_uring server below keeps the non-TLS // profiles on :8080. @@ -798,15 +1182,14 @@ fn main() { if m == 0 { m = 1 } - sh.write_json_response(mut out, count, m) + write_json_into(sh, mut out, count, m) return } wb(mut out, not_found) } - // Port is fixed to 8081 by the HttpArena harness; TLS_PORT lets local runs pick a - // free port. run() blocks, so the TLS server runs on its own thread (value-mut - // receiver → spawn via a closure with a local mut copy; the two servers are - // independent — own socket, workers and backend). + // Port is fixed to 8081 by the HttpArena harness; TLS_PORT lets local runs pick a free + // port. run() blocks, so the TLS server runs on its own thread (value-mut receiver → + // spawn via a closure with a local mut copy; the two servers are independent). mut tls_port := (os.getenv_opt('TLS_PORT') or { '8081' }).int() if tls_port <= 0 { tls_port = 8081 @@ -837,5 +1220,3 @@ fn main() { })! server.run() } - -// static_response prebuilds the full HTTP response for a static file. From 4bb52d0004d157b5dc23b1952d09d78659bac58b Mon Sep 17 00:00:00 2001 From: Hitalo Souza Date: Fri, 3 Jul 2026 23:28:50 -0300 Subject: [PATCH 2/2] =?UTF-8?q?fix(vanilla-io=5Furing):=20drop=20sendfile?= =?UTF-8?q?=5Fmin=5Fbytes=20=E2=80=94=20io=5Furing=20has=20no=20sendfile?= =?UTF-8?q?=20(static=20-86%..-99%)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The converged entry ported epoll's sendfile_min_bytes=16KiB, but the io_uring backend has NO sendfile path (no core.enable_sendfile / queue_file drain). static_assets then served every representation >= 16KiB (the large .br/.gz siblings) via a per-request disk READ that stalls the single ring worker — CI measured static -86%/-86%/-99% with CPU collapse. Revert to the default (256KiB): every arena sibling (< 256KiB) is preloaded and served as a zero-copy core.queue_buf borrowed send, restoring the original throughput. The epoll twin keeps sendfile_min_bytes=16KiB because its backend DOES support sendfile. Verified locally (wrk, 64 conns): /static/vendor.js (-> 67KB .br) serves at ~101k req/s / 6.38 GB/s with zero socket errors (preloaded, bandwidth-bound) instead of the disk-read collapse. My earlier byte-diff validation missed this — it only exercised a small (preloaded) asset, so it caught correctness but not the large-asset throughput. Co-Authored-By: Claude Opus 4.8 (1M context) --- frameworks/vanilla-io_uring/main.v | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/frameworks/vanilla-io_uring/main.v b/frameworks/vanilla-io_uring/main.v index 391d6cb62..0a2287069 100644 --- a/frameworks/vanilla-io_uring/main.v +++ b/frameworks/vanilla-io_uring/main.v @@ -1134,16 +1134,21 @@ fn main() { static_dir := os.getenv_opt('STATIC_DIR') or { '/data/static' } // Canonical static server: loads every asset PLUS its .br/.gz siblings once, mounts - // them at /static/, and negotiates Accept-Encoding per request. sendfile_min_bytes - // matches the epoll twin: serve any representation >= 16 KiB from disk with sendfile - // instead of preloading + copying it through the per-connection write buffer (the - // large .br siblings otherwise balloon per-connection RSS at high conn counts). The - // tiny siblings stay preloaded (cheap memcpy). + // them at /static/, and negotiates Accept-Encoding per request, emitting via + // core.queue_buf (borrowed send of the preloaded bytes). + // + // Do NOT set sendfile_min_bytes here (unlike the epoll twin, which uses 16 KiB): the + // io_uring backend has NO sendfile path (no core.enable_sendfile / queue_file drain), + // so static_assets.respond_into would fall back to READING a "large" asset's body + // from disk on every request — a blocking read that stalls the ring. Keeping the + // default (256 KiB) preloads every arena .br/.gz sibling (all < 256 KiB) and serves + // them as a zero-copy borrowed send. (Measured: 16 KiB here collapsed static ~-86% to + // -99% — the large .br siblings hit the read-per-request fallback. See vanilla#93/#83: + // io_uring sendfile support is future work.) asv := static_assets.new(static_assets.Config{ - root: static_dir - url_prefix: '/static/' - spa_fallback: '' - sendfile_min_bytes: 16 * 1024 + root: static_dir + url_prefix: '/static/' + spa_fallback: '' }) or { panic('vanilla-io_uring: static_assets init failed: ${err}') } mut sh := Shared{