From c042c47e918d06f9d14de8436e13cc560e4f7c43 Mon Sep 17 00:00:00 2001
From: andreinknv
Date: Thu, 14 May 2026 18:58:46 -0400
Subject: [PATCH] fix(GgufInsights): correct KV cache size estimate for quantized types
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The KV cache size estimator multiplies element count by the result of
`getTypeSizeForGgmlType()`. That binding wraps llama.cpp's
`ggml_type_size()`, which returns bytes per BLOCK — not bytes per
element. For block-quantized types (Q4_0, Q5_0, Q8_0, etc.) one block
holds 32 elements, so the per-element cost is 32× smaller than the
block size.

Before this fix:
- Q8_0 KV cache: estimate is 32× too large (block size 34, true
  bytes/element ≈ 1.0625)
- Q4_0 KV cache: estimate is 32× too large (block size 18, true
  bytes/element ≈ 0.5625)
- F16 / F32: correct (block size = 1, no scaling)

The overestimate trips the VRAM rejection path
(`GgufInsightsConfigurationResolver.resolveConfigForUsage`), so valid
configurations with quantized KV (e.g. Q8_0 at 8k context on a model
that easily fits with FP16 + 8k) get refused with a "not enough VRAM"
result.

Fix: also fetch `getBlockSizeForGgmlType()` for each KV type and
compute `keyBytesPerElement = blockBytes / blockSize`. The other
existing consumer of these bindings (`calculateTensorSize` for general
tensor size estimation, lines 827+) already uses both functions
together — the KV-cache estimator was the only path missing the
block-size division. For F16 / F32 (blockSize=1) the division is a
no-op so no behavior changes there.
---
 src/gguf/insights/GgufInsights.ts | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/src/gguf/insights/GgufInsights.ts b/src/gguf/insights/GgufInsights.ts
index ed364c35..701b1562 100644
--- a/src/gguf/insights/GgufInsights.ts
+++ b/src/gguf/insights/GgufInsights.ts
@@ -603,8 +603,19 @@ export class GgufInsights {
         const nEmbdHeadK = this._ggufFileInfo.architectureMetadata.attention?.key_length ?? ((nHead == 0) ? 0 : (nEmbd / nHead));
         const nHeadKv: number | number[] = this._ggufFileInfo.architectureMetadata.attention?.head_count_kv ?? nHead;
         const nEmbdHeadV = this._ggufFileInfo.architectureMetadata.attention?.value_length ?? ((nHead == 0) ? 0 : nEmbd / nHead);
+        // `getTypeSizeForGgmlType` returns bytes per BLOCK (matches
+        // `ggml_type_size` in llama.cpp). For block-quantized types
+        // (Q4_0, Q8_0, ...) one block holds N elements (block size > 1),
+        // so per-element bytes = block-bytes / block-elements. F16 / F32
+        // are scalar (blockSize=1) so the division is a no-op there.
+        // Without this division, quantized KV-cache estimates overshoot
+        // by ~32× and the configuration resolver rejects valid configs.
         const keyTypeSize = this._llama._bindings.getTypeSizeForGgmlType(kvCacheKeyType) ?? this._llama._consts.ggmlTypeF16Size;
         const valueTypeSize = this._llama._bindings.getTypeSizeForGgmlType(kvCacheValueType) ?? this._llama._consts.ggmlTypeF16Size;
+        const keyBlockSize = this._llama._bindings.getBlockSizeForGgmlType(kvCacheKeyType) ?? 1;
+        const valueBlockSize = this._llama._bindings.getBlockSizeForGgmlType(kvCacheValueType) ?? 1;
+        const keyBytesPerElement = keyTypeSize / Math.max(1, keyBlockSize);
+        const valueBytesPerElement = valueTypeSize / Math.max(1, valueBlockSize);
 
         // source: `llama_model::load_tensors` in `llama-model.cpp`
         // repeating layers are assigned to GPU from `i_gpu_start = n_layer + 1 - n_gpu_layers`
@@ -644,9 +655,9 @@ export class GgufInsights {
         }
 
         const gpuKVCacheSize = usingGpu
-            ? ((gpuKvElementsK * keyTypeSize) + (gpuKvElementsV * valueTypeSize))
+            ? ((gpuKvElementsK * keyBytesPerElement) + (gpuKvElementsV * valueBytesPerElement))
             : 0;
-        const cpuKVCacheSize = (cpuKvElementsK * keyTypeSize) + (cpuKvElementsV * valueTypeSize);
+        const cpuKVCacheSize = (cpuKvElementsK * keyBytesPerElement) + (cpuKvElementsV * valueBytesPerElement);
 
         const recurrentCellSize = Math.max(1, sequences);
         const gpuRecurrentStateSize = usingGpu
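Note (not part of the patch): the following standalone TypeScript sketch reproduces the per-element arithmetic described above. The ggml block parameters are hardcoded here purely for illustration (34 bytes per 32-element block for Q8_0, 18/32 for Q4_0, 2/1 for F16) and the KV shape is made up; the actual code reads both values from the native bindings.

// Sketch of the bytes-per-element fix, with block parameters hardcoded
// for illustration. In the library these come from the native bindings
// (getTypeSizeForGgmlType / getBlockSizeForGgmlType).
type GgmlTypeInfo = {blockBytes: number, blockElements: number};

const typeInfo: Record<string, GgmlTypeInfo> = {
    f16: {blockBytes: 2, blockElements: 1},
    q8_0: {blockBytes: 34, blockElements: 32},
    q4_0: {blockBytes: 18, blockElements: 32}
};

function kvCacheBytes(elements: number, type: GgmlTypeInfo): number {
    // per-element bytes = block bytes / elements per block
    const bytesPerElement = type.blockBytes / Math.max(1, type.blockElements);
    return elements * bytesPerElement;
}

// Hypothetical shape: 8k context, 32 layers, 1024 K elements per token per layer
const kvElements = 8192 * 32 * 1024;

console.log(kvCacheBytes(kvElements, typeInfo.f16));   // 536870912 bytes (512 MiB)
console.log(kvCacheBytes(kvElements, typeInfo.q8_0));  // 285212672 bytes (272 MiB)
console.log(kvElements * typeInfo.q8_0.blockBytes);    // old estimate: 9126805504 bytes (~8.5 GiB, 32× too large)

Run with any recent ts-node/tsx; the last line shows the pre-fix behavior of multiplying element count by the block size directly.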