@@ -10,7 +10,7 @@ use super::with_tracing::QMatMul;
 use crate::{quantized_nn::RmsNorm, utils::repeat_kv};
 use candle::quantized::{gguf_file, QTensor};
 use candle::{DType, Device, Result, Tensor};
-use candle_nn::{kv_cache::KvCache, Activation, Embedding, Module};
+use candle_nn::{kv_cache::ConcatKvCache, Activation, Embedding, Module};
 use std::io::{Read, Seek};
 use std::sync::Arc;
 
@@ -136,7 +136,7 @@ struct AttentionWeights {
     num_kv_groups: usize,
     head_dim: usize,
     rotary_emb: Arc<RotaryEmbedding>,
-    kv_cache: KvCache,
+    kv_cache: ConcatKvCache,
     span_attn: tracing::Span,
 }
 
@@ -160,9 +160,7 @@ impl AttentionWeights {
         let q_norm = gg.rms_norm(&format!("{prefix}.attn_q_norm.weight"), rms_norm_eps)?;
         let k_norm = gg.rms_norm(&format!("{prefix}.attn_k_norm.weight"), rms_norm_eps)?;
 
-        // Initialize KV cache with 512 tokens capacity to reduce initial memory allocation.
-        // The cache will grow in chunks of 512 tokens when needed.
-        let kv_cache = KvCache::new(2, 512);
+        let kv_cache = ConcatKvCache::new(2);
 
         let span_attn = tracing::span!(tracing::Level::TRACE, "attn");
 
@@ -211,10 +209,6 @@ impl AttentionWeights {
 
         let (q, k) = self.rotary_emb.apply(&q, &k, offset)?;
 
-        // Reset KV cache if we're at the first position
-        if offset == 0 {
-            self.kv_cache.reset();
-        }
         let (k, v) = self.kv_cache.append(&k.contiguous()?, &v.contiguous()?)?;
 
         // Make tensor contiguous to avoid some strided copies
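For context: the new `ConcatKvCache` grows the cached keys/values by concatenating each new chunk along the given dimension (here dim 2, the sequence dimension) on every `append`, instead of writing into a pre-allocated buffer that grows in 512-token chunks like the old `KvCache::new(2, 512)`. Below is a minimal standalone sketch of that append-by-concatenation pattern, using a hypothetical `SimpleConcatCache` type built only on `Tensor::cat`; it is not the actual `candle_nn::kv_cache::ConcatKvCache` implementation, just an illustration of the idea.

```rust
use candle::{Result, Tensor};

/// Illustrative append-by-concatenation KV cache (hypothetical type,
/// not the real candle_nn::kv_cache::ConcatKvCache).
struct SimpleConcatCache {
    dim: usize,        // dimension to concatenate along (2 = seq dim for [b, h, s, d])
    k: Option<Tensor>, // cached keys, grown on every append
    v: Option<Tensor>, // cached values, grown on every append
}

impl SimpleConcatCache {
    fn new(dim: usize) -> Self {
        Self { dim, k: None, v: None }
    }

    /// Append new keys/values and return the full cached tensors.
    fn append(&mut self, k: &Tensor, v: &Tensor) -> Result<(Tensor, Tensor)> {
        let k = match &self.k {
            None => k.clone(),
            Some(prev) => Tensor::cat(&[prev, k], self.dim)?,
        };
        let v = match &self.v {
            None => v.clone(),
            Some(prev) => Tensor::cat(&[prev, v], self.dim)?,
        };
        self.k = Some(k.clone());
        self.v = Some(v.clone());
        Ok((k, v))
    }

    /// Drop the cached tensors before starting a new sequence.
    fn reset(&mut self) {
        self.k = None;
        self.v = None;
    }
}
```

In this sketch `reset` simply drops the cached tensors; where the real cache is cleared after this change (now that the `offset == 0` reset in the attention forward pass is gone) is outside the hunks shown in the diff.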