diff --git a/llama/addon/AddonContext.cpp b/llama/addon/AddonContext.cpp
index 017c6967..c2fce409 100644
--- a/llama/addon/AddonContext.cpp
+++ b/llama/addon/AddonContext.cpp
@@ -460,6 +460,10 @@ AddonContext::AddonContext(const Napi::CallbackInfo& info) : Napi::ObjectWrap<AddonContext>(info) {
             context_params.swa_full = options.Get("swaFullCache").As<Napi::Boolean>().Value();
         }
+
+        if (options.Has("kvUnified")) {
+            context_params.kv_unified = options.Get("kvUnified").As<Napi::Boolean>().Value();
+        }
     }
 }
 
 AddonContext::~AddonContext() {
diff --git a/src/evaluator/LlamaContext/LlamaContext.ts b/src/evaluator/LlamaContext/LlamaContext.ts
index 248c763e..11a39f12 100644
--- a/src/evaluator/LlamaContext/LlamaContext.ts
+++ b/src/evaluator/LlamaContext/LlamaContext.ts
@@ -81,6 +81,7 @@ export class LlamaContext {
     /** @internal */ private readonly _unusedSequenceIds: number[] = [];
     /** @internal */ private readonly _batchingOptions: Required<BatchingOptions>;
     /** @internal */ public readonly _swaFullCache: boolean = false;
+    /** @internal */ private readonly _kvUnified: boolean | undefined = undefined;
     /** @internal */ private readonly _queuedDecodeSequenceIds = new Set<number>();
     /** @internal */ private readonly _queuedDecodes: InternalQueuedDecode[] = [];
     /** @internal */ private readonly _disposeAggregator = new AsyncDisposeAggregator();
@@ -112,6 +113,7 @@ export class LlamaContext {
             itemPrioritizationStrategy: batchingItemsPrioritizationStrategy = "maximumParallelism"
         } = {},
         swaFullCache = _model.defaultContextSwaFullCache,
+        kvUnified,
         performanceTracking = false,
         experimentalKvCacheKeyType,
         experimentalKvCacheValueType,
@@ -155,6 +157,7 @@ export class LlamaContext {
         this._kvCacheKeyType = experimentalKvCacheKeyType;
         this._kvCacheValueType = experimentalKvCacheValueType;
         this._swaFullCache = !!swaFullCache;
+        this._kvUnified = kvUnified;
         this._ctx = new this._llama._bindings.AddonContext(this._model._model, removeNullFields({
             contextSize: padSafeContextSize(this._contextSize * this._totalSequences, "up"), // each sequence needs its own <contextSize> of cells
             batchSize: this._batchSize + (
@@ -170,7 +173,8 @@ export class LlamaContext {
             performanceTracking: this._performanceTracking,
             kvCacheKeyType: this._kvCacheKeyType,
             kvCacheValueType: this._kvCacheValueType,
-            swaFullCache: this._swaFullCache
+            swaFullCache: this._swaFullCache,
+            kvUnified: this._kvUnified
         }));
         this._batchingOptions = {
             dispatchSchedule: batchingDispatchSchedule,
diff --git a/src/evaluator/LlamaContext/types.ts b/src/evaluator/LlamaContext/types.ts
index a0a64f02..a78bf32b 100644
--- a/src/evaluator/LlamaContext/types.ts
+++ b/src/evaluator/LlamaContext/types.ts
@@ -153,6 +153,21 @@ export type LlamaContextOptions = {
      */
     swaFullCache?: boolean,
 
+    /**
+     * Use a unified KV buffer shared across all sequences when computing attention.
+     *
+     * When enabled, llama.cpp uses a single contiguous KV buffer indexed by sequence id,
+     * which can significantly improve multi-sequence prefill and decode throughput on GPU
+     * backends by reducing per-sequence buffer management overhead.
+     *
+     * Explicitly setting `kvUnified: true` matches the behavior of `llama-server`
+     * running with the `--kv-unified` flag.
+     *
+     * When not set, the llama.cpp default for `kv_unified` is used for the
+     * context configuration.
+     */
+    kvUnified?: boolean,
+
     /**
      * Load the provided LoRA adapters onto the context.
      * LoRA adapters are used to modify the weights of a pretrained model to adapt to new tasks or domains
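For reference, a minimal sketch of how the new option would be used from the public API once this diff lands. The model path is a placeholder; `getLlama`, `llama.loadModel`, `model.createContext`, and `context.getSequence` are the existing node-llama-cpp entry points, and only `kvUnified` is new here:

```typescript
import {fileURLToPath} from "url";
import path from "path";
import {getLlama} from "node-llama-cpp";

const __dirname = path.dirname(fileURLToPath(import.meta.url));

const llama = await getLlama();

// placeholder model path - substitute any local GGUF file
const model = await llama.loadModel({
    modelPath: path.join(__dirname, "models", "model.gguf")
});

// opt in to a single KV buffer shared by all 4 sequences;
// omitting `kvUnified` keeps the llama.cpp default behavior
const context = await model.createContext({
    sequences: 4,
    kvUnified: true
});

const sequence = context.getSequence();
```

Note that because the option is passed through `removeNullFields` on its way to the binding, leaving `kvUnified` as `undefined` omits the field entirely, so the addon falls back to llama.cpp's own default rather than forcing `kv_unified` to `false`.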