|
10 | 10 | from types import MethodType |
11 | 11 | from typing import Callable, Optional, Tuple, Union |
12 | 12 |
|
| 13 | +import torch |
13 | 14 | from torch import nn |
14 | 15 | from transformers.models.codegen.modeling_codegen import ( |
15 | 16 | CodeGenAttention, |
|
456 | 457 | from QEfficient.transformers.post_processing import build_and_attach_mlp, model_type_registry |
457 | 458 | from QEfficient.transformers.sampler.sampler import sampler_forward |
458 | 459 | from QEfficient.transformers.spd.spd_transform_forward import tlm_forward |
| 460 | +from QEfficient.utils.logging_utils import logger |
459 | 461 |
|
460 | 462 | SPD_TARGET = "target" |
461 | 463 |
|
@@ -694,6 +696,82 @@ class RevertPrefillOnlyTransform(ModuleMappingTransform): |
694 | 696 | } |
695 | 697 |
|
696 | 698 |
|
class ReplicateKVHeadTransform:
    """
    Replicates KV heads in attention modules to match the number of KV heads in the target model.
    This transform is used when the source model has fewer KV heads than required in target model.
    """

    @staticmethod
    def _duplicate_weights_for_linear_layer(
        layer: nn.Module, orig_kv_heads: int, repeat: int, head_dim: int, hidden_size: int
    ) -> None:
        """
        Repeat the KV-head slices of ``layer``'s weight (and bias, if present) in place.

        The weight is interpreted as ``(orig_kv_heads, head_dim, hidden_size)`` and each
        head slice is repeated ``repeat`` times along the head axis, producing a weight of
        shape ``(orig_kv_heads * repeat * head_dim, hidden_size)``.

        Args:
            layer: Linear layer whose weight/bias tensors are modified in place.
            orig_kv_heads: Number of KV heads currently encoded in the weight.
            repeat: Replication factor applied to each KV head.
            head_dim: Per-head projection dimension.
            hidden_size: Input feature dimension of the layer.
        """
        new_kv_heads = orig_kv_heads * repeat

        layer.weight.data = torch.repeat_interleave(
            layer.weight.data.view(orig_kv_heads, head_dim, hidden_size), repeat, 0
        ).view(new_kv_heads * head_dim, hidden_size)
        # The bias must be expanded exactly once: a second pass would try to
        # re-view the already-expanded bias as (orig_kv_heads, head_dim) and fail.
        if layer.bias is not None:
            layer.bias.data = torch.repeat_interleave(
                layer.bias.data.view(orig_kv_heads, head_dim), repeat, 0
            ).view(new_kv_heads * head_dim)

    @staticmethod
    def _get_text_model(model):
        """
        Determine and return the appropriate text_model from a given model object.

        Raises:
            AttributeError: If no text model can be located on ``model``.
        """
        # Check for VLMs
        if hasattr(model, "language_model"):
            if hasattr(model.language_model, "model"):
                return model.language_model.model
            return model.language_model
        # Check for CausalLMs
        if hasattr(model, "model"):
            return model.model

        raise AttributeError("No suitable text model found in the provided model.")

    @classmethod
    def apply(cls, model: nn.Module, **kwargs) -> Tuple[nn.Module, bool]:
        """
        Replicates KV heads in attention modules based on provided multiplier.

        Args:
            model: The model to apply the transform to.
            kwargs: Additional arguments for the transformation. Includes:
                - num_kv_heads_repeat: The number of times to repeat the KV heads.

        Returns:
            Tuple of the (possibly modified) model and a bool indicating whether
            any replication was performed.
        """
        n_repeat = kwargs.pop("num_kv_heads_repeat", 1)
        transformed = False
        if n_repeat is not None and n_repeat > 1:
            text_model = cls._get_text_model(model)

            # NOTE(review): hard-coded to 1 for MLA-style attention; for GQA models
            # this should presumably come from text_model.config.num_key_value_heads —
            # confirm before using this transform outside MLA.
            orig_kv_heads = 1
            new_kv_heads = n_repeat * orig_kv_heads
            text_model.config.orig_kv_heads = orig_kv_heads
            text_model.config.num_key_value_heads = new_kv_heads

            hidden_size = text_model.config.hidden_size

            logger.warning(f"Original KV heads: {orig_kv_heads}")
            logger.warning(f"Modified KV heads: {new_kv_heads}")
            transformed = True
            for block in text_model.layers:
                attn = getattr(block, "cross_attn", getattr(block, "self_attn", None))
                if attn is None:
                    raise AttributeError(
                        "Decoder block has neither 'cross_attn' nor 'self_attn'; cannot replicate KV heads."
                    )
                attn.num_key_value_heads = new_kv_heads
                # MLA layout: compressed KV (kv_lora_rank) plus the rotary key part.
                head_dim = attn.kv_lora_rank + attn.qk_rope_head_dim

                cls._duplicate_weights_for_linear_layer(
                    attn.kv_a_proj_with_mqa, orig_kv_heads, n_repeat, head_dim, hidden_size
                )
        return model, transformed
| 773 | + |
| 774 | + |
697 | 775 | class SpDTransform: |
698 | 776 | """ |
699 | 777 | Apply generic QEffForCausalLM forward pass to extract `num_speculative_tokens+1` hidden states before computing logits during decode phase and extract last predicted token during prefill. |
|
0 commit comments