From 79496aa2bf624d4a10bd9c2d532c583d10a2710a Mon Sep 17 00:00:00 2001
From: andreinknv
Date: Thu, 14 May 2026 18:39:27 -0400
Subject: [PATCH] feat(LlamaContext): default itemPrioritizationStrategy to firstInFirstOut
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Concurrent prompt throughput on a single LlamaContext with N sequences
improves substantially when batched items are prioritized in arrival order
rather than spread evenly across slots.

The existing maximumParallelism strategy divides each batch evenly across
all in-flight prompts so they all finish around the same time. That is
fair, but it produces a flat wall-clock latency floor for every concurrent
caller equal to roughly the slowest prompt's full duration.

The firstInFirstOut strategy (already implemented in this repo at
src/evaluator/LlamaContext/_strategies/firstInFirstOutStrategy.ts) matches
llama-server's slot-iteration pattern: finish earlier prompts first, free
their slots, accept new work.

Measured on Qwen2.5-Coder-3B-Instruct Q4_K_M on Apple Silicon (MacBook
M-series, Metal backend, contextSize=8192, sequences=8,
flashAttention=true, identical prompts of 1500 prompt tokens + 256 decode
tokens):

  c=8, p=1500
    maximumParallelism (current default)   ttft_min  31955 ms
    firstInFirstOut                        ttft_min   5790 ms  (5.5x faster)
    maximumParallelism                     ttft_avg  33214 ms
    firstInFirstOut                        ttft_avg  19204 ms

  c=12 stress
    firstInFirstOut                        stable; no regression vs c=8

Total prompt throughput is unchanged (same total work in the same
wall-clock window). What changes is the latency distribution: with FIFO,
earlier callers see their tokens within seconds; with maximumParallelism
every caller waits for the whole batch to finish.

For most concurrent use cases the FIFO behavior is what callers expect
from a context with multiple sequences. The option itself is unchanged and
remains documented, so users who want maximumParallelism can still opt in
explicitly via `batching.itemPrioritizationStrategy`.

No new tests — this is a one-line default flip. The firstInFirstOutStrategy was already exercised by existing tests that pass it explicitly. --- src/evaluator/LlamaContext/LlamaContext.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts index 248c763e..41640bb8 100644 --- a/src/evaluator/LlamaContext/LlamaContext.ts +++ b/src/evaluator/LlamaContext/LlamaContext.ts @@ -109,7 +109,7 @@ export class LlamaContext { threads, batching: { dispatchSchedule: batchingDispatchSchedule = "nextCycle", - itemPrioritizationStrategy: batchingItemsPrioritizationStrategy = "maximumParallelism" + itemPrioritizationStrategy: batchingItemsPrioritizationStrategy = "firstInFirstOut" } = {}, swaFullCache = _model.defaultContextSwaFullCache, performanceTracking = false,