withcatai · andreinknv · May 14, 2026
diff --git a/llama/addon/AddonContext.cpp b/llama/addon/AddonContext.cpp
@@ -414,6 +414,10 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap<Ad
             context_params.n_ubatch = context_params.n_batch; // the batch queue is managed in the JS side, so there's no need for managing it on the C++ side
         }
 
+        if (options.Has("ubatchSize")) {
+            context_params.n_ubatch = options.Get("ubatchSize").As<Napi::Number>().Uint32Value();
+        }
+
         if (options.Has("sequences")) {
             context_params.n_seq_max = options.Get("sequences").As<Napi::Number>().Uint32Value();
         }

diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts
@@ -105,6 +105,7 @@ export class LlamaContext {
         sequences,
         contextSize,
         batchSize,
+        ubatchSize,
         flashAttention = _model.defaultContextFlashAttention,
         threads,
         batching: {
@@ -162,6 +163,7 @@ export class LlamaContext {
                     ? 1 // +1 to handle edge cases with SWA KV cache
                     : 0
             ),
+            ubatchSize,
             sequences: this._totalSequences,
             flashAttention: this._flashAttention,
             threads: this._idealThreads,

diff --git a/src/evaluator/LlamaContext/types.ts b/src/evaluator/LlamaContext/types.ts
@@ -48,6 +48,20 @@ export type LlamaContextOptions = {
      */
     batchSize?: number,
 
+    /**
+     * The physical micro-batch size used inside the model's forward pass.
+     *
+     * Defaults to `batchSize`: the batch queue is managed on the JS side,
+     * so by default a single ubatch processes the whole batch. Set this
+     * lower than `batchSize` to chunk a large logical batch into smaller GPU
+     * submissions (matches llama.cpp's `--ubatch-size` flag; useful when
+     * per-ubatch VRAM peaks are a concern, or when probing different
+     * `n_ubatch` values for throughput on the given hardware).
+     *
+     * Must be ≤ `batchSize`.
+     */
+    ubatchSize?: number,
+
     /**
      * Flash attention is an optimization in the attention mechanism that makes inference faster, more efficient and uses less memory.
      *