From 3c7369404a4d822f2f560484c84bc9ec1104364a Mon Sep 17 00:00:00 2001
From: andreinknv <andrei.nknv@outlook.com>
Date: Thu, 14 May 2026 18:48:09 -0400
Subject: [PATCH] feat(LlamaContext): expose ubatchSize separately from
 batchSize
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently the C++ binding always sets `n_ubatch = n_batch`, with the
comment that the batch queue is managed JS-side. That's true for the
default case, but it prevents callers from ever asking for a physical
micro-batch smaller than the logical batch, which is what llama.cpp's
`--ubatch-size` flag allows.

This PR adds a `ubatchSize?: number` option on `LlamaContextOptions`.
When set, it forwards to `llama_context_params.n_ubatch`, overriding
the `n_ubatch = n_batch` default in the binding. When unset, behavior
is unchanged.

Two real use cases:
  1. Memory-constrained hardware, where per-ubatch VRAM peaks are the
     limiting factor — a smaller ubatch lets a larger total batch fit.
  2. Throughput tuning probes — sweeping `n_ubatch` independently of
     `n_batch` is useful when characterizing a model+hardware combo
     for sustained-load deployments (matches what
     `llama-server --batch-size N --ubatch-size M` already permits).

Plumbing:
  - LlamaContextOptions.ubatchSize (types.ts) — public option with docstring.
  - LlamaContext constructor (LlamaContext.ts) — destructured and
    forwarded into the AddonContext options bag.
  - AddonContext.cpp — when `options.Has("ubatchSize")`, overrides
    `context_params.n_ubatch` (must come AFTER the `batchSize` handler
    so the explicit `ubatchSize` wins over the `n_ubatch = n_batch`
    default).

No default change. Existing callers see no behavior shift.
---
 llama/addon/AddonContext.cpp               |  4 ++++
 src/evaluator/LlamaContext/LlamaContext.ts |  2 ++
 src/evaluator/LlamaContext/types.ts        | 14 ++++++++++++++
 3 files changed, 20 insertions(+)

diff --git a/llama/addon/AddonContext.cpp b/llama/addon/AddonContext.cpp
index 017c6967..10232fbb 100644
--- a/llama/addon/AddonContext.cpp
+++ b/llama/addon/AddonContext.cpp
@@ -414,6 +414,10 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap<Ad
             context_params.n_ubatch = context_params.n_batch; // the batch queue is managed in the JS side, so there's no need for managing it on the C++ side
         }
 
+        if (options.Has("ubatchSize")) {
+            context_params.n_ubatch = options.Get("ubatchSize").As<Napi::Number>().Uint32Value();
+        }
+
         if (options.Has("sequences")) {
             context_params.n_seq_max = options.Get("sequences").As<Napi::Number>().Uint32Value();
         }
diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts
index 248c763e..7a6c01c6 100644
--- a/src/evaluator/LlamaContext/LlamaContext.ts
+++ b/src/evaluator/LlamaContext/LlamaContext.ts
@@ -105,6 +105,7 @@ export class LlamaContext {
         sequences,
         contextSize,
         batchSize,
+        ubatchSize,
         flashAttention = _model.defaultContextFlashAttention,
         threads,
         batching: {
@@ -162,6 +163,7 @@ export class LlamaContext {
                     ? 1 // +1 to handle edge cases with SWA KV cache
                     : 0
             ),
+            ubatchSize,
             sequences: this._totalSequences,
             flashAttention: this._flashAttention,
             threads: this._idealThreads,
diff --git a/src/evaluator/LlamaContext/types.ts b/src/evaluator/LlamaContext/types.ts
index a0a64f02..34396078 100644
--- a/src/evaluator/LlamaContext/types.ts
+++ b/src/evaluator/LlamaContext/types.ts
@@ -48,6 +48,20 @@ export type LlamaContextOptions = {
      */
     batchSize?: number,
 
+    /**
+     * The physical micro-batch size used inside the model's forward pass.
+     *
+     * Defaults to `batchSize` — the batch queue is managed JS-side and a
+     * single ubatch processes the whole batch. Set this lower than
+     * `batchSize` to chunk a large logical batch into smaller GPU
+     * submissions (matches llama.cpp's `--ubatch-size` flag; useful when
+     * per-ubatch VRAM peaks are the limiting factor, or when probing
+     * different `n_ubatch` values for throughput on given hardware).
+     *
+     * Must be ≤ `batchSize`; the value is forwarded to the native binding as-is, without validation.
+     */
+    ubatchSize?: number,
+
     /**
      * Flash attention is an optimization in the attention mechanism that makes inference faster, more efficient and uses less memory.
      *