(revert) kv-cache : do not quantize SWA KV cache (#21332)

ggerganov · web-flow · commit 39b27f0da027 · 2026-04-03T09:07:01.000+03:00
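This reverts commit 17193cc: the SWA KV cache is once again created with the configured `type_k`/`type_v` instead of being forced to `GGML_TYPE_F16`.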
diff --git a/src/llama-kv-cache-iswa.cpp b/src/llama-kv-cache-iswa.cpp
@@ -66,9 +66,8 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(
 
     LLAMA_LOG_INFO("%s: creating     SWA KV cache, size = %u cells\n", __func__, size_swa);
 
-    // note: the SWA cache is never quantized because it is relatively small
     kv_swa = std::make_unique<llama_kv_cache>(
-            model, GGML_TYPE_F16, GGML_TYPE_F16,
+            model, type_k, type_v,
             v_trans, offload, unified, size_swa, n_seq_max, n_pad,
             hparams.n_swa, hparams.swa_type, filter_swa, reuse);
 }

Original file line number	Diff line number	Diff line change
`@@ -66,9 +66,8 @@ llama_kv_cache_iswa::llama_kv_cache_iswa(`
`66`	`66`
`67`	`67`	`LLAMA_LOG_INFO("%s: creating SWA KV cache, size = %u cells\n", __func__, size_swa);`
`68`	`68`
`69`		`- // note: the SWA cache is never quantized because it is relatively small`
`70`	`69`	`kv_swa = std::make_unique<llama_kv_cache>(`
`71`		`- model, GGML_TYPE_F16, GGML_TYPE_F16,`
	`70`	`+ model, type_k, type_v,`
`72`	`71`	`v_trans, offload, unified, size_swa, n_seq_max, n_pad,`
`73`	`72`	`hparams.n_swa, hparams.swa_type, filter_swa, reuse);`
`74`	`73`	`}`