pytorch · mergennachin · Jun 9, 2026
diff --git a/examples/models/qwen3_5_moe/README.md b/examples/models/qwen3_5_moe/README.md
@@ -197,6 +197,7 @@ is safe under asyncio.
 | `--max-context` | (none) | Reject prompts that exceed it with 400 |
 | `--no-think` | off | Default reasoning off (`enable_thinking=False`) |
 | `--max-sessions` | `1` | Isolated sessions on one weight load (see Sessions) |
+| `--warm-resume` / `--no-warm-resume` | on | Reuse a session's KV across turns (see Sessions) |
 
 ### Sessions
 
@@ -211,24 +212,41 @@ aliases, the `X-ExecuTorch-Session-ID` / `session_id` / `x-session-affinity`
 headers (body wins, then that header order). The header aliases let a client that
 already emits a stable per-conversation affinity id (e.g. pi's
 `sendSessionAffinityHeaders`) route with no extra config. Requests without any
-share a transient scratch session. Free a session with `DELETE /v1/sessions/{id}`.
+share a transient scratch session.
 
 ```bash
 curl http://127.0.0.1:8000/v1/chat/completions \
   -H 'Content-Type: application/json' \
   -d '{"model":"qwen3.5-moe","session_id":"alice",
        "messages":[{"role":"user","content":"hi"}]}'
+
+curl -X POST   http://127.0.0.1:8000/v1/sessions/alice/reset  # clear context, keep the slot
+curl -X DELETE http://127.0.0.1:8000/v1/sessions/alice        # free context + slot (VRAM)
 ```
 
 Admission is up front: an explicit `session_id` on a single-session server
 returns **400** (`unsupported_session`); past capacity it returns **429**
 (`capacity_exhausted`) before any response bytes.
 
-This is **isolation, not concurrency or warm resume**: execution is still
+**Warm append-only resume** (on by default): when a named session's next request
+is an exact-token extension of its resident context (e.g. the same conversation
+plus a new turn), the worker prefills **only the new suffix** instead of
+re-prefilling the whole prompt — continuing the KV/recurrent state in place. The
+check is exact-token (never re-tokenized text), so it is always correct: anything
+that can't be proven an exact extension (token mismatch, a stop-string trim, a
+prior error) falls back to a full reset + prefill. This is **per-session** warm
+append-only resume, **not** global prefix caching: there is no cross-session
+prefix sharing, so a system prompt common to two different `session_id`s is
+prefilled independently for each (unlike vLLM/llama.cpp global prefix reuse).
+Each `done` event reports
+`reused_prompt_tokens`, `prefilled_prompt_tokens`, and `session_reset_reason`
+(`new`/`exact_prefix`/`dirty`/`mismatch`/`equal`) for measuring the hit rate.
+`--no-warm-resume` forces a full prefill every request (for A/B comparison).
+
+This is **isolation + warm resume, not concurrency**: execution is still
 synchronous (one in-flight request; `--num-runners > 1` is rejected since more
-workers would duplicate the weights), and each request resets its session — the
-recurrent/conv state cannot be rewound by position (`seek()` is NotSupported), so
-turn-to-turn KV reuse (append-only warm resume) is a follow-up.
+workers would duplicate the weights). Fair interleaving across in-flight requests
+is a follow-up.
 
 ### Other limitations
 

diff --git a/examples/models/qwen3_5_moe/qwen35_moe_worker.cpp b/examples/models/qwen3_5_moe/qwen35_moe_worker.cpp
@@ -18,11 +18,11 @@
 // process segfaults in the int4 matmul (validated). Here the model runs in a
 // plain synchronous loop in its own process, which is reliable.
 //
-// Multi-session (isolation): the engine loads weights once and hosts multiple
-// isolated sessions on that one ~18GB allocation; the shared worker loop
-// (worker_loop.h) routes requests to per-session_id state, up to
-// --max_sessions. Execution is still synchronous (one in-flight request); warm
-// context reuse across requests is a follow-up.
+// Multi-session: the engine loads weights once and hosts multiple isolated
+// sessions on that one ~18GB allocation; the shared worker loop (worker_loop.h)
+// routes requests to per-session_id state (up to --max_sessions) and warm-
+// resumes each session's context across requests (append-only suffix prefill).
+// Execution is synchronous (one in-flight request).
 
 #include <gflags/gflags.h>
 
@@ -41,6 +41,12 @@ DEFINE_int32(
     "Max physical sessions to host on the one weight allocation (CUDA "
     "per-session mutable rebinding). Clamped to 1 if the backend cannot "
     "rebind.");
+DEFINE_bool(
+    warm_resume,
+    true,
+    "Warm append-only resume for named sessions: prefill only the suffix when a "
+    "request's tokens extend the session's resident context. Off resets every "
+    "request (useful for A/B measurement).");
 
 namespace {
 namespace llm = ::executorch::extension::llm;
@@ -73,5 +79,6 @@ int main(int argc, char** argv) {
   // ids back to text internally. The shared loop owns per-session_id state.
   ::tokenizers::Tokenizer* tokenizer = engine->tokenizer();
 
-  return llm::run_worker_stdio_loop(*engine, *tokenizer, engine->metadata());
+  return llm::run_worker_stdio_loop(
+      *engine, *tokenizer, engine->metadata(), FLAGS_warm_resume);
 }
diff --git a/examples/models/qwen3_5_moe/serve.py b/examples/models/qwen3_5_moe/serve.py
@@ -23,8 +23,10 @@
     requests share a scratch session). See --max-sessions.
   * Execution is synchronous: one in-flight request at a time, concurrent HTTP
     requests queue. Sessions provide isolation, not concurrent throughput.
-  * No warm context reuse yet: each request resets its session (Qwen seek() is
-    NotSupported; append-only reuse is a follow-up).
+  * Warm append-only resume is on by default (--warm-resume): a named session
+    reuses its resident context across turns when the prompt is an exact-token
+    extension, including tool-call turns via token-ID prompt segments. Anonymous
+    (scratch) requests always reset.
   * The control plane only does blocking pipe I/O on its executor thread (no
     CUDA), which is safe under asyncio.
 
@@ -83,6 +85,7 @@ def _spawn(args):
     if args.data_path:
         cmd += ["--data_path", args.data_path]
     cmd += ["--max_sessions", str(args.max_sessions)]
+    cmd += [f"--warm_resume={'true' if args.warm_resume else 'false'}"]
     logger.info("Starting Qwen worker subprocess (loads the model once)...")
     return spawn_worker(cmd, env=env)
 
@@ -162,6 +165,14 @@ def main() -> None:
         "cannot rebind. One slot is reserved for anonymous requests, so the "
         "number of addressable session_ids is max-sessions - 1.",
     )
+    p.add_argument(
+        "--warm-resume",
+        action=argparse.BooleanOptionalAction,
+        default=True,
+        help="Warm append-only resume for named sessions: a request whose tokens "
+        "extend the session's resident context prefills only the suffix. "
+        "--no-warm-resume resets every request (for A/B measurement).",
+    )
     p.add_argument(
         "--worker-bin",
         default=None,

diff --git a/examples/models/qwen3_5_moe/test_serve.py b/examples/models/qwen3_5_moe/test_serve.py
@@ -76,6 +76,7 @@ def fake_spawn(cmd, env=None):
             tokenizer_path="t.json",
             data_path="d.ptd",
             max_sessions=4,
+            warm_resume=True,
         )
     )
     assert captured["cmd"] == [
@@ -88,6 +89,7 @@ def fake_spawn(cmd, env=None):
         "d.ptd",
         "--max_sessions",
         "4",
+        "--warm_resume=true",
     ]
 
 
@@ -103,6 +105,7 @@ def test_spawn_defaults_worker_bin_and_omits_empty_data_path(monkeypatch):
             tokenizer_path="t.json",
             data_path=None,
             max_sessions=4,
+            warm_resume=True,
         )
     )
     cmd = captured["cmd"]

@@ -86,3 +86,12 @@ if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
   target_link_options_gc_sections(text_llm_worker)
   target_link_options(text_llm_worker PRIVATE "LINKER:-s")
 endif()
+
+# Pure unit test for the warm-resume prefill planner (worker_prefill_plan.h). No
+# ET/model/tokenizer dependency, so it builds and runs standalone via ctest.
+enable_testing()
+add_executable(test_worker_prefill_plan test_worker_prefill_plan.cpp)
+target_include_directories(
+  test_worker_prefill_plan PUBLIC ${_common_include_directories}
+)
+add_test(NAME worker_prefill_plan COMMAND test_worker_prefill_plan)
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Unit tests for plan_prefill() (warm-resume decision). No model/session/ET
+// runtime dependency -- the header is pure, so this compiles and runs
+// standalone. Self-contained assertions (no gtest) so it has no build deps.
+
+#include <executorch/extension/llm/server/cpp/worker_prefill_plan.h>
+
+#include <cstdio>
+#include <cstring>
+#include <string>
+#include <vector>
+
+using executorch::extension::llm::plan_prefill;
+using executorch::extension::llm::PrefillPlan;
+
+namespace {
+int g_failures = 0;
+
+void expect(
+    const char* name,
+    const PrefillPlan& p,
+    PrefillPlan::Action action,
+    size_t suffix_start,
+    const char* reason) {
+  bool ok = p.action == action && p.suffix_start == suffix_start &&
+      std::strcmp(p.reason, reason) == 0;
+  if (!ok) {
+    ++g_failures;
+    printf(
+        "  [FAIL] %s: got action=%d suffix_start=%zu reason=%s\n",
+        name,
+        (int)p.action,
+        p.suffix_start,
+        p.reason);
+  } else {
+    printf("  [PASS] %s\n", name);
+  }
+}
+} // namespace
+
+int main() {
+  using V = std::vector<uint64_t>;
+
+  // First request: nothing resident -> full prefill, "new".
+  expect(
+      "new (resident empty)",
+      plan_prefill(V{}, V{1, 2, 3}, false),
+      PrefillPlan::kFull,
+      0,
+      "new");
+
+  // Exact token extension -> prefill only the suffix.
+  expect(
+      "exact_prefix (suffix reuse)",
+      plan_prefill(V{1, 2, 3}, V{1, 2, 3, 4, 5}, false),
+      PrefillPlan::kSuffix,
+      3,
+      "exact_prefix");
+
+  // Single-token extension still reuses.
+  expect(
+      "exact_prefix (one-token suffix)",
+      plan_prefill(V{1, 2, 3}, V{1, 2, 3, 4}, false),
+      PrefillPlan::kSuffix,
+      3,
+      "exact_prefix");
+
+  // Divergent token -> mismatch, full reset.
+  expect(
+      "mismatch (divergent token)",
+      plan_prefill(V{1, 2, 3}, V{1, 2, 9, 4}, false),
+      PrefillPlan::kFull,
+      0,
+      "mismatch");
+
+  // Prompt shorter than resident (rewind) -> mismatch, full reset.
+  expect(
+      "mismatch (prompt shorter)",
+      plan_prefill(V{1, 2, 3}, V{1, 2}, false),
+      PrefillPlan::kFull,
+      0,
+      "mismatch");
+
+  // Dirty wins even over an otherwise-exact extension.
+  expect(
+      "dirty (overrides exact prefix)",
+      plan_prefill(V{1, 2, 3}, V{1, 2, 3, 4}, true),
+      PrefillPlan::kFull,
+      0,
+      "dirty");
+
+  // Prompt identical to resident -> reset + full (no empty-suffix prefill).
+  expect(
+      "equal (prompt == resident)",
+      plan_prefill(V{1, 2, 3}, V{1, 2, 3}, false),
+      PrefillPlan::kFull,
+      0,
+      "equal");
+
+  // Dirty + empty resident still resets as dirty (dirty checked first).
+  expect(
+      "dirty (empty resident)",
+      plan_prefill(V{}, V{1, 2}, true),
+      PrefillPlan::kFull,
+      0,
+      "dirty");
+
+  printf(
+      "\n%s (%d failure(s))\n",
+      g_failures == 0 ? "ALL PASS" : "FAILED",
+      g_failures);
+  return g_failures == 0 ? 0 : 1;
+}