Skip to content

Commit 23e4c61

Browse files
committed
[quantization] Output kv-tuples
This PR outputs kv-tuples when `use_cache` is set. TICO-DCO-1.0-Signed-off-by: s.malakhov <s.malakhov@partner.samsung.com>
1 parent fcacf65 commit 23e4c61

File tree

2 files changed

+32
-5
lines changed

2 files changed

+32
-5
lines changed

tico/quantization/wrapq/examples/quantize_full_qmodel_with_gptq.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,12 @@ def main():
334334
type=str,
335335
default=None,
336336
)
337+
parser.add_argument(
338+
"--use_cache",
339+
action="store_true",
340+
default=False,
341+
help="Whether to use cache",
342+
)
337343
args = parser.parse_args()
338344
print(args)
339345

@@ -370,7 +376,7 @@ def main():
370376
.eval()
371377
)
372378

373-
model.config.use_cache = False # TODO use args for it
379+
model.config.use_cache = args.use_cache
374380
if args.calibrate_seq_len is not None:
375381
model.config.max_position_embeddings = min(
376382
model.config.max_position_embeddings, args.calibrate_seq_len
@@ -420,6 +426,8 @@ def main():
420426
if not args.no_GPTQ:
421427
print("Applying GPTQ …")
422428

429+
old_use_cache = model.config.use_cache
430+
model.config.use_cache = False # to save memory
423431
sens = None
424432
if args.gptq_mse is not None and args.gptq_mse == "smse":
425433
if args.sensitivity_path is not None:
@@ -440,6 +448,7 @@ def main():
440448
q_m(inp.to(args.device))
441449

442450
q_m = convert(q_m, inplace=True) # materialize INT-weight tensors
451+
model.config.use_cache = old_use_cache
443452
else:
444453
q_m = model
445454

tico/quantization/wrapq/wrappers/llama/quant_model.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -185,12 +185,15 @@ def forward(
185185
inputs_embeds = self.embed_tokens(input_ids)
186186

187187
if use_cache and past_key_values is None:
188-
past_key_values = DynamicCache()
188+
past_key_values = []
189189

190190
if cache_position is None:
191191
past_seen_tokens = (
192-
past_key_values.get_seq_length() if past_key_values is not None else 0
192+
0
193+
if (past_key_values is None or len(past_key_values) == 0)
194+
else past_key_values[0][0].shape[-2]
193195
)
196+
194197
cache_position = torch.arange(
195198
past_seen_tokens,
196199
past_seen_tokens + inputs_embeds.shape[1],
@@ -217,15 +220,21 @@ def forward(
217220
all_hidden_states = () if output_hidden_states else None
218221
all_self_attns = () if output_attentions else None
219222

220-
for decoder_layer in self.layers[: self.config.num_hidden_layers]:
223+
for idx, decoder_layer in enumerate(
224+
self.layers[: self.config.num_hidden_layers]
225+
):
221226
if output_hidden_states:
222227
all_hidden_states += (hidden_states,) # type: ignore[operator]
223228

224229
layer_outputs = decoder_layer(
225230
hidden_states,
226231
attention_mask=causal_mask,
227232
position_ids=position_ids,
228-
past_key_value=past_key_values,
233+
past_key_value=(
234+
past_key_values[idx]
235+
if past_key_values is not None and len(past_key_values) > idx
236+
else None
237+
),
229238
output_attentions=output_attentions,
230239
use_cache=use_cache,
231240
cache_position=cache_position,
@@ -235,6 +244,15 @@ def forward(
235244

236245
if decoder_layer.wrapped.return_type == "tuple":
237246
hidden_states = layer_outputs[0]
247+
elif use_cache:
248+
hidden_states = layer_outputs[0]
249+
assert isinstance(layer_outputs[1], tuple)
250+
if len(past_key_values) >= idx: # type: ignore[arg-type]
251+
# prefill mode
252+
past_key_values += (layer_outputs[1],) # type: ignore[operator]
253+
else:
254+
# decode mode
255+
past_key_values[idx] = (layer_outputs[1],) # type: ignore[index]
238256
else:
239257
hidden_states = layer_outputs
240258

0 commit comments

Comments
 (0)