From 3c7369404a4d822f2f560484c84bc9ec1104364a Mon Sep 17 00:00:00 2001
From: andreinknv
Date: Thu, 14 May 2026 18:48:09 -0400
Subject: [PATCH] feat(LlamaContext): expose ubatchSize separately from batchSize
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently the C++ binding always sets `n_ubatch = n_batch`, with the
comment that the batch queue is managed JS-side. That's true for the
default case, but it prevents callers from ever asking for a smaller
physical micro-batch than the logical batch — the equivalent of
llama.cpp's `--ubatch-size` flag.

This PR adds a `ubatchSize?: number` option on `LlamaContextOptions`.
When set, it forwards to `llama_context_params.n_ubatch`, overriding the
`n_ubatch = n_batch` default in the binding. When unset, behavior is
unchanged.

Two real use cases:

1. Hardware where the model is sensitive to per-ubatch VRAM peaks — a
   smaller ubatch lets a larger total batch fit.
2. Throughput tuning probes — sweeping `n_ubatch` independently of
   `n_batch` is useful when characterizing a model+hardware combo for
   sustained-load deployments (matches what
   `llama-server --batch-size N --ubatch-size M` already permits).

Plumbing:

- LlamaContextOptions.ubatchSize (types.ts) — public option with
  docstring.
- LlamaContext constructor (LlamaContext.ts) — destructured and
  forwarded into the AddonContext options bag.
- AddonContext.cpp — when `options.Has("ubatchSize")`, overrides
  `context_params.n_ubatch` (must come AFTER the `batchSize` handler so
  the explicit `ubatchSize` wins over the `n_ubatch = n_batch` default).

No default change. Existing callers see no behavior shift.
--- llama/addon/AddonContext.cpp | 4 ++++ src/evaluator/LlamaContext/LlamaContext.ts | 2 ++ src/evaluator/LlamaContext/types.ts | 14 ++++++++++++++ 3 files changed, 20 insertions(+) diff --git a/llama/addon/AddonContext.cpp b/llama/addon/AddonContext.cpp index 017c6967..10232fbb 100644 --- a/llama/addon/AddonContext.cpp +++ b/llama/addon/AddonContext.cpp @@ -414,6 +414,10 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap().Uint32Value(); + } + if (options.Has("sequences")) { context_params.n_seq_max = options.Get("sequences").As().Uint32Value(); } diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts index 248c763e..7a6c01c6 100644 --- a/src/evaluator/LlamaContext/LlamaContext.ts +++ b/src/evaluator/LlamaContext/LlamaContext.ts @@ -105,6 +105,7 @@ export class LlamaContext { sequences, contextSize, batchSize, + ubatchSize, flashAttention = _model.defaultContextFlashAttention, threads, batching: { @@ -162,6 +163,7 @@ export class LlamaContext { ? 1 // +1 to handle edge cases with SWA KV cache : 0 ), + ubatchSize, sequences: this._totalSequences, flashAttention: this._flashAttention, threads: this._idealThreads, diff --git a/src/evaluator/LlamaContext/types.ts b/src/evaluator/LlamaContext/types.ts index a0a64f02..34396078 100644 --- a/src/evaluator/LlamaContext/types.ts +++ b/src/evaluator/LlamaContext/types.ts @@ -48,6 +48,20 @@ export type LlamaContextOptions = { */ batchSize?: number, + /** + * The physical micro-batch size used inside the model's forward pass. + * + * Defaults to `batchSize` — the batch queue is managed JS-side and a + * single ubatch processes the whole batch. Set this lower than + * `batchSize` to chunk a large logical batch into smaller GPU + * submissions (matches llama.cpp's `--ubatch-size` flag, useful when + * the model is sensitive to per-ubatch VRAM peaks or when probing + * different `n_ubatch` values for throughput on a given hardware). 
+ * + * Must be ≤ `batchSize`. + */ + ubatchSize?: number, + /** * Flash attention is an optimization in the attention mechanism that makes inference faster, more efficient and uses less memory. *