withcatai · andreinknv · May 14, 2026
diff --git a/llama/addon/AddonContext.cpp b/llama/addon/AddonContext.cpp
@@ -460,6 +460,10 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap<Ad
         if (options.Has("swaFullCache")) {
             context_params.swa_full = options.Get("swaFullCache").As<Napi::Boolean>().Value();
         }
+
+        if (options.Has("kvUnified")) {
+            context_params.kv_unified = options.Get("kvUnified").As<Napi::Boolean>().Value();
+        }
     }
 }
 AddonContext::~AddonContext() {

diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts
@@ -81,6 +81,7 @@ export class LlamaContext {
     /** @internal */ private readonly _unusedSequenceIds: number[] = [];
     /** @internal */ private readonly _batchingOptions: Required<BatchingOptions>;
     /** @internal */ public readonly _swaFullCache: boolean = false;
+    /** @internal */ private readonly _kvUnified: boolean | undefined = undefined;
     /** @internal */ private readonly _queuedDecodeSequenceIds = new Set<number>();
     /** @internal */ private readonly _queuedDecodes: InternalQueuedDecode[] = [];
     /** @internal */ private readonly _disposeAggregator = new AsyncDisposeAggregator();
@@ -112,6 +113,7 @@ export class LlamaContext {
             itemPrioritizationStrategy: batchingItemsPrioritizationStrategy = "maximumParallelism"
         } = {},
         swaFullCache = _model.defaultContextSwaFullCache,
+        kvUnified,
         performanceTracking = false,
         experimentalKvCacheKeyType,
         experimentalKvCacheValueType,
@@ -155,6 +157,7 @@ export class LlamaContext {
         this._kvCacheKeyType = experimentalKvCacheKeyType;
         this._kvCacheValueType = experimentalKvCacheValueType;
         this._swaFullCache = !!swaFullCache;
+        this._kvUnified = kvUnified;
         this._ctx = new this._llama._bindings.AddonContext(this._model._model, removeNullFields({
             contextSize: padSafeContextSize(this._contextSize * this._totalSequences, "up"), // each sequence needs its own <contextSize> of cells
             batchSize: this._batchSize + (
@@ -170,7 +173,8 @@ export class LlamaContext {
             performanceTracking: this._performanceTracking,
             kvCacheKeyType: this._kvCacheKeyType,
             kvCacheValueType: this._kvCacheValueType,
-            swaFullCache: this._swaFullCache
+            swaFullCache: this._swaFullCache,
+            kvUnified: this._kvUnified
         }));
         this._batchingOptions = {
             dispatchSchedule: batchingDispatchSchedule,

diff --git a/src/evaluator/LlamaContext/types.ts b/src/evaluator/LlamaContext/types.ts
@@ -153,6 +153,21 @@ export type LlamaContextOptions = {
      */
     swaFullCache?: boolean,
 
+    /**
+     * Use a unified KV buffer shared across all sequences when computing attention.
+     *
+     * When enabled, llama.cpp uses a single contiguous KV buffer indexed by sequence id,
+     * which can significantly improve multi-sequence prefill/decode throughput on GPU
+     * backends by reducing per-sequence buffer juggling.
+     *
+     * The llama.cpp default for `kv_unified` depends on whether the number of sequences
+     * is auto-detected; explicitly setting `kvUnified: true` matches the behavior of
+     * `llama-server` running with `--kv-unified` (which is the default in many configurations).
+     *
+     * When omitted, `kv_unified` is not overridden and llama.cpp's own default for the context parameters is used.
+     */
+    kvUnified?: boolean,
+
     /**
      * Load the provided LoRA adapters onto the context.
      * LoRA adapters are used to modify the weights of a pretrained model to adapt to new tasks or domains