Commit a8faf8d (1 parent: 38200b3)
Author: evian

[KV Cache] fix per channel shape

Signed-off-by: evian <[email protected]>

File tree: 2 files changed, +6 −2 lines

src/llmcompressor/modifiers/quantization/cache.py
Lines changed: 3 additions & 2 deletions

@@ -94,7 +94,8 @@ def update(
         _pad_and_append_at_idx_(self.k_observers, layer_idx, k_observer)
         _pad_and_append_at_idx_(self.v_observers, layer_idx, v_observer)
 
-        if key_states.dim() == 4:
+        kv_states_dim = key_states.dim()
+        if kv_states_dim == 4:
             # reshape for per channel scenario
             num_heads = key_states.shape[1]
             head_dim = key_states.shape[-1]
@@ -115,7 +116,7 @@ def update(
             q_value_states, KVCacheScaleType.VALUE, layer_idx
         )
 
-        if key_states.dim() == 4:
+        if kv_states_dim == 4:
             # reshape for per channel scenario
             # from [batch_size, seq_len - residual_length, num_heads * head_dim]
             # to [batch_size, num_heads, seq_len - residual_length, head_dim]

src/llmcompressor/modifiers/quantization/calibration.py
Lines changed: 3 additions & 0 deletions

@@ -256,6 +256,9 @@ def calibrate_kv_cache_output_hook(module: Module, _args: Any, _output: torch.Tensor):
     kv_cache = getattr(module, "kv_cache")
     k_scale = kv_cache.k_scales[module.layer_idx]
     v_scale = kv_cache.v_scales[module.layer_idx]
+    if kv_cache.quantization_args.strategy == QuantizationStrategy.CHANNEL:
+        k_scale = k_scale.unsqueeze(-1)
+        v_scale = v_scale.unsqueeze(-1)
     update_parameter_data(module, k_scale, KVCacheScaleType.KEY.value)
     update_parameter_data(module, v_scale, KVCacheScaleType.VALUE.value)
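The calibration-hook change is the scale-shape half of the fix: under the per-channel strategy the observer produces a flat 1-D tensor of scales, one per channel, while the stored k_scale/v_scale parameters appear to expect a column vector with a trailing singleton axis (the usual per-channel layout, so the scale broadcasts over the remaining dims). A hedged sketch, with num_channels and the consumer shape assumed purely for illustration:

import torch

num_channels = 1024                 # e.g. num_heads * head_dim (assumed)
k_scale = torch.rand(num_channels)  # observer output: [num_channels]

# The added unsqueeze turns the flat scale into a column vector,
# [num_channels] -> [num_channels, 1], so it broadcasts along the
# per-channel axis of whatever tensor it divides or multiplies.
k_scale = k_scale.unsqueeze(-1)
assert k_scale.shape == (num_channels, 1)

# Broadcasting check: a [num_channels, N] tensor scales cleanly.
states = torch.randn(num_channels, 7)
quantized = (states / k_scale).round().clamp(-128, 127)
dequantized = quantized * k_scale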
261264
