From c042c47e918d06f9d14de8436e13cc560e4f7c43 Mon Sep 17 00:00:00 2001
From: andreinknv
Date: Thu, 14 May 2026 18:58:46 -0400
Subject: [PATCH] fix(GgufInsights): correct KV cache size estimate for quantized types
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The KV cache size estimator multiplies element count by the result of
`getTypeSizeForGgmlType()`. That binding wraps llama.cpp's
`ggml_type_size()`, which returns bytes per BLOCK — not bytes per
element. For block-quantized types (Q4_0, Q5_0, Q8_0, etc.) one block
holds 32 elements, so the per-element cost is 32× smaller than the
block size.

Before this fix:
- Q8_0 KV cache: estimate is 32× too large (block size 34, true
  bytes/element ≈ 1.0625)
- Q4_0 KV cache: estimate is 32× too large (block size 18, true
  bytes/element ≈ 0.5625)
- F16 / F32: correct (block size = 1, no scaling)

The overestimate trips the VRAM rejection path
(`GgufInsightsConfigurationResolver.resolveConfigForUsage`), so valid
configurations with quantized KV (e.g. Q8_0 at 8k context on a model
that easily fits with FP16 + 8k) get refused with a "not enough VRAM"
result.

Fix: also fetch `getBlockSizeForGgmlType()` for each KV type and
compute `keyBytesPerElement = blockBytes / blockSize`. The other
existing consumer of these bindings (`calculateTensorSize` for general
tensor size estimation, lines 827+) already uses both functions
together — the KV-cache estimator was the only path missing the
block-size division. For F16 / F32 (blockSize=1) the division is a
no-op so no behavior changes there.
---
 src/gguf/insights/GgufInsights.ts | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/src/gguf/insights/GgufInsights.ts b/src/gguf/insights/GgufInsights.ts
index ed364c35..701b1562 100644
--- a/src/gguf/insights/GgufInsights.ts
+++ b/src/gguf/insights/GgufInsights.ts
@@ -603,8 +603,19 @@ export class GgufInsights {
         const nEmbdHeadK = this._ggufFileInfo.architectureMetadata.attention?.key_length ?? ((nHead == 0) ? 0 : (nEmbd / nHead));
         const nHeadKv: number | number[] = this._ggufFileInfo.architectureMetadata.attention?.head_count_kv ?? nHead;
         const nEmbdHeadV = this._ggufFileInfo.architectureMetadata.attention?.value_length ?? ((nHead == 0) ? 0 : nEmbd / nHead);
+        // `getTypeSizeForGgmlType` returns bytes per BLOCK (matches
+        // `ggml_type_size` in llama.cpp). For block-quantized types
+        // (Q4_0, Q8_0, ...) one block holds N elements (block size > 1),
+        // so per-element bytes = block-bytes / block-elements. F16 / F32
+        // are scalar (blockSize=1) so the division is a no-op there.
+        // Without this division, quantized KV-cache estimates overshoot
+        // by ~32× and the configuration resolver rejects valid configs.
         const keyTypeSize = this._llama._bindings.getTypeSizeForGgmlType(kvCacheKeyType) ?? this._llama._consts.ggmlTypeF16Size;
         const valueTypeSize = this._llama._bindings.getTypeSizeForGgmlType(kvCacheValueType) ?? this._llama._consts.ggmlTypeF16Size;
+        const keyBlockSize = this._llama._bindings.getBlockSizeForGgmlType(kvCacheKeyType) ?? 1;
+        const valueBlockSize = this._llama._bindings.getBlockSizeForGgmlType(kvCacheValueType) ?? 1;
+        const keyBytesPerElement = keyTypeSize / Math.max(1, keyBlockSize);
+        const valueBytesPerElement = valueTypeSize / Math.max(1, valueBlockSize);
 
         // source: `llama_model::load_tensors` in `llama-model.cpp`
         // repeating layers are assigned to GPU from `i_gpu_start = n_layer + 1 - n_gpu_layers`
@@ -644,9 +655,9 @@ export class GgufInsights {
         }
 
         const gpuKVCacheSize = usingGpu
-            ? ((gpuKvElementsK * keyTypeSize) + (gpuKvElementsV * valueTypeSize))
+            ? ((gpuKvElementsK * keyBytesPerElement) + (gpuKvElementsV * valueBytesPerElement))
             : 0;
-        const cpuKVCacheSize = (cpuKvElementsK * keyTypeSize) + (cpuKvElementsV * valueTypeSize);
+        const cpuKVCacheSize = (cpuKvElementsK * keyBytesPerElement) + (cpuKvElementsV * valueBytesPerElement);
 
         const recurrentCellSize = Math.max(1, sequences);
         const gpuRecurrentStateSize = usingGpu
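Note (not part of the patch): the following standalone TypeScript sketch reproduces the per-element arithmetic described above. The ggml block parameters are hardcoded here purely for illustration (34 bytes per 32-element block for Q8_0, 18/32 for Q4_0, 2/1 for F16) and the KV shape is made up; the actual code reads both values from the native bindings.

// Sketch of the bytes-per-element fix, with block parameters hardcoded
// for illustration. In the library these come from the native bindings
// (getTypeSizeForGgmlType / getBlockSizeForGgmlType).
type GgmlTypeInfo = {blockBytes: number, blockElements: number};

const typeInfo: Record<string, GgmlTypeInfo> = {
    f16: {blockBytes: 2, blockElements: 1},
    q8_0: {blockBytes: 34, blockElements: 32},
    q4_0: {blockBytes: 18, blockElements: 32}
};

function kvCacheBytes(elements: number, type: GgmlTypeInfo): number {
    // per-element bytes = block bytes / elements per block
    const bytesPerElement = type.blockBytes / Math.max(1, type.blockElements);
    return elements * bytesPerElement;
}

// Hypothetical shape: 8k context, 32 layers, 1024 K elements per token per layer
const kvElements = 8192 * 32 * 1024;

console.log(kvCacheBytes(kvElements, typeInfo.f16));   // 536870912 bytes (512 MiB)
console.log(kvCacheBytes(kvElements, typeInfo.q8_0));  // 285212672 bytes (272 MiB)
console.log(kvElements * typeInfo.q8_0.blockBytes);    // old estimate: 9126805504 bytes (~8.5 GiB, 32× too large)

Run with any recent ts-node/tsx; the last line shows the pre-fix behavior of multiplying element count by the block size directly.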