Skip to content

Commit 192349a

Browse files
committed
[quantization] Output kv-tuples
This PR outputs kv-tuples in case `use_cache` was set. TICO-DCO-1.0-Signed-off-by: s.malakhov <s.malakhov@partner.samsung.com>
1 parent fcacf65 commit 192349a

File tree

3 files changed

+97
-5
lines changed

3 files changed

+97
-5
lines changed

test/quantization/wrapq/wrappers/llama/test_quant_model.py

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,3 +109,68 @@ def test_forward_diff(self):
109109
self.assertGreater(diff, 0.0)
110110
self.assertLess(diff, 0.4)
111111
self.assertEqual(fp_out.shape, q_out.shape)
112+
113+
114+
@unittest.skipUnless(has_transformers_for("llama"), skip_msg)
class TestQuantLlamaModelWithCache(unittest.TestCase):
    """Verify that QuantLlamaModel emits per-layer KV tuples when use_cache=True.

    Builds a tiny float Llama model, wraps it in QuantLlamaModel (prefill
    variant), runs one calibration forward pass, and checks that the output
    is (last_hidden_state, past_key_values) with one (key, value) tuple per
    hidden layer whose sequence dimension matches the input length.
    """

    # Class-level fixtures shared by all tests (populated in setUpClass).
    seq_len: int
    vocab_size: int
    hid_layers: int
    fp_model: torch.nn.Module

    @classmethod
    def setUpClass(cls):
        # Fixed seed so the random token input (and any random init) is
        # reproducible across runs.
        torch.manual_seed(0)

        from transformers.models.llama.configuration_llama import LlamaConfig
        from transformers.models.llama.modeling_llama import LlamaModel

        cls.seq_len = 16
        cls.vocab_size = 10000
        cls.hid_layers = 3
        cfg = LlamaConfig(
            # Fix: forward vocab_size to the config; the original left the
            # default (32000), so the embedding table disagreed with the
            # advertised cls.vocab_size and was 3x larger than needed.
            vocab_size=cls.vocab_size,
            hidden_size=8,
            num_attention_heads=2,
            num_key_value_heads=1,
            head_dim=4,
            attention_bias=False,
            attention_dropout=0.0,
            attn_implementation="eager",
            num_hidden_layers=cls.hid_layers,
            max_position_embeddings=cls.seq_len,
            use_cache=True,  # the behavior under test: KV tuples must be returned
            return_dict=False,  # tuple outputs, matching the wrapper's contract
        )
        cls.fp_model = LlamaModel(cfg)

    def test_model_output(self):
        qmodel = QuantLlamaModel(
            self.fp_model, qcfg=PTQConfig(wrapper_variant="prefill")
        )
        self.assertIs(qmodel._mode, Mode.NO_QUANT)

        qmodel.enable_calibration()
        self.assertIs(qmodel._mode, Mode.CALIB)

        # One batch of random token ids within the configured vocabulary.
        x = torch.randint(0, self.vocab_size, (1, self.seq_len))
        output = qmodel(x)

        # Consistency fix: use unittest assertions (as the rest of this file
        # does) instead of bare asserts, so failures report actual values.
        self.assertEqual(len(output), 2)  # last_hidden_state + past_key_values
        past_key_values = output[1]
        self.assertEqual(len(past_key_values), self.hid_layers)
        for index in range(self.hid_layers):
            past_key_value = past_key_values[index]
            self.assertIsInstance(past_key_value, tuple)

            # Prefill mode: cached keys/values cover the full input sequence.
            past_key = past_key_value[0]
            self.assertEqual(past_key.shape[-2], self.seq_len)

            past_value = past_key_value[1]
            self.assertEqual(past_value.shape[-2], self.seq_len)

tico/quantization/wrapq/examples/quantize_full_qmodel_with_gptq.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -334,6 +334,12 @@ def main():
334334
type=str,
335335
default=None,
336336
)
337+
parser.add_argument(
338+
"--use_cache",
339+
action="store_true",
340+
default=False,
341+
help="Whether to use cache",
342+
)
337343
args = parser.parse_args()
338344
print(args)
339345

@@ -370,7 +376,7 @@ def main():
370376
.eval()
371377
)
372378

373-
model.config.use_cache = False # TODO use args for it
379+
model.config.use_cache = args.use_cache
374380
if args.calibrate_seq_len is not None:
375381
model.config.max_position_embeddings = min(
376382
model.config.max_position_embeddings, args.calibrate_seq_len
@@ -420,6 +426,8 @@ def main():
420426
if not args.no_GPTQ:
421427
print("Applying GPTQ …")
422428

429+
old_use_cache = model.config.use_cache
430+
model.config.use_cache = False # to save memory
423431
sens = None
424432
if args.gptq_mse is not None and args.gptq_mse == "smse":
425433
if args.sensitivity_path is not None:
@@ -440,6 +448,7 @@ def main():
440448
q_m(inp.to(args.device))
441449

442450
q_m = convert(q_m, inplace=True) # materialize INT-weight tensors
451+
model.config.use_cache = old_use_cache
443452
else:
444453
q_m = model
445454

tico/quantization/wrapq/wrappers/llama/quant_model.py

Lines changed: 22 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -185,12 +185,15 @@ def forward(
185185
inputs_embeds = self.embed_tokens(input_ids)
186186

187187
if use_cache and past_key_values is None:
188-
past_key_values = DynamicCache()
188+
past_key_values = []
189189

190190
if cache_position is None:
191191
past_seen_tokens = (
192-
past_key_values.get_seq_length() if past_key_values is not None else 0
192+
0
193+
if (past_key_values is None or len(past_key_values) == 0)
194+
else past_key_values[0][0].shape[-2]
193195
)
196+
194197
cache_position = torch.arange(
195198
past_seen_tokens,
196199
past_seen_tokens + inputs_embeds.shape[1],
@@ -217,15 +220,21 @@ def forward(
217220
all_hidden_states = () if output_hidden_states else None
218221
all_self_attns = () if output_attentions else None
219222

220-
for decoder_layer in self.layers[: self.config.num_hidden_layers]:
223+
for idx, decoder_layer in enumerate(
224+
self.layers[: self.config.num_hidden_layers]
225+
):
221226
if output_hidden_states:
222227
all_hidden_states += (hidden_states,) # type: ignore[operator]
223228

224229
layer_outputs = decoder_layer(
225230
hidden_states,
226231
attention_mask=causal_mask,
227232
position_ids=position_ids,
228-
past_key_value=past_key_values,
233+
past_key_value=(
234+
past_key_values[idx]
235+
if past_key_values is not None and len(past_key_values) > idx
236+
else None
237+
),
229238
output_attentions=output_attentions,
230239
use_cache=use_cache,
231240
cache_position=cache_position,
@@ -235,6 +244,15 @@ def forward(
235244

236245
if decoder_layer.wrapped.return_type == "tuple":
237246
hidden_states = layer_outputs[0]
247+
elif use_cache:
248+
hidden_states = layer_outputs[0]
249+
assert isinstance(layer_outputs[1], tuple)
250+
if len(past_key_values) >= idx: # type: ignore[arg-type]
251+
# prefill mode
252+
past_key_values += (layer_outputs[1],) # type: ignore[operator]
253+
else:
254+
# decode mode
255+
past_key_values[idx] = (layer_outputs[1],) # type: ignore[index]
238256
else:
239257
hidden_states = layer_outputs
240258

0 commit comments

Comments
 (0)