Skip to content

Commit 4bd2239

Browse files
authored
Add support for dense models distilled from MoE models with the same architecture (#728)
In this PR, we add support for meta-llama/Llama-Guard-4-12B, a dense model distilled from the Llama4 Scout MoE model. The changes in pytorch_transforms.py can be applied to any dense model distilled from an MoE model with a supported architecture in QEfficient. Signed-off-by: Vahid Janfaza <vjanfaza@qti.qualcomm.com>
1 parent facae5f commit 4bd2239

File tree

2 files changed

+25
-14
lines changed

2 files changed

+25
-14
lines changed

QEfficient/base/pytorch_transforms.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -152,10 +152,16 @@ def apply(cls, model: nn.Module) -> Tuple[nn.Module, bool]:
152152
# ---- build the textual prefix once per layer ----------
153153
if is_gpt_oss:
154154
prefix = f"model.layers.{layer_idx}.mlp.experts."
155-
experts = model_tmp.model.layers[layer_idx].mlp.experts
155+
# experts = model_tmp.model.layers[layer_idx].mlp.experts
156+
ff = model_tmp.model.layers[layer_idx].mlp
156157
else:
157158
prefix = f"model.layers.{layer_idx}.feed_forward.experts."
158-
experts = model_tmp.model.layers[layer_idx].feed_forward.experts
159+
# experts = model_tmp.model.layers[layer_idx].feed_forward.experts
160+
ff = model_tmp.model.layers[layer_idx].feed_forward
161+
162+
if not hasattr(ff, "experts"):
163+
continue
164+
experts = ff.experts
159165

160166
fused_key = prefix + "gate_up_proj"
161167
gate_key = prefix + "gate_proj"

QEfficient/transformers/models/llama4/modeling_llama4.py

Lines changed: 17 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -504,7 +504,7 @@ def forward(
504504

505505
if past_key_value is not None:
506506
chunk_position_ids = position_ids
507-
if self.use_rope:
507+
if self.use_rope and self.config.attention_chunk_size:
508508
chunk_position_ids = torch.where(
509509
chunk_position_ids != -1, chunk_position_ids % self.config.attention_chunk_size, chunk_position_ids
510510
)
@@ -663,10 +663,16 @@ def forward(
663663
causal_mask = _create_causal_mask(
664664
position_ids=position_ids, target_length=past_key_values.layers[3].keys.shape[-2]
665665
)
666-
chunk_position_ids = torch.where(
667-
position_ids != -1, position_ids % self.config.attention_chunk_size, position_ids
668-
)
669-
target_length = min(past_key_values.layers[0].keys.shape[-2], torch.tensor(self.config.attention_chunk_size))
666+
if self.config.attention_chunk_size:
667+
chunk_position_ids = torch.where(
668+
position_ids != -1, position_ids % self.config.attention_chunk_size, position_ids
669+
)
670+
target_length = min(
671+
past_key_values.layers[0].keys.shape[-2], torch.tensor(self.config.attention_chunk_size)
672+
)
673+
else:
674+
chunk_position_ids = position_ids
675+
target_length = past_key_values.layers[0].keys.shape[-2]
670676
chunk_causal_mask = _create_causal_mask(position_ids=chunk_position_ids, target_length=target_length)
671677
causal_mask_mapping = {
672678
"full_attention": causal_mask,
@@ -798,7 +804,7 @@ def get_dummy_pkv_cache(self, config, batch_size, seq_len):
798804
is_chunked_attention = torch.tensor(
799805
[bool((i + 1) % 4) for i in range(config.num_hidden_layers)], dtype=torch.bool
800806
)
801-
attention_chunk_size = getattr(config, "attention_chunk_size", seq_len)
807+
attention_chunk_size = getattr(config, "attention_chunk_size", None) or seq_len
802808
global_cache_shape = [batch_size, n_heads, seq_len, d_head]
803809
chunked_cache_shape = [
804810
batch_size,
@@ -967,13 +973,12 @@ def get_specializations(
967973

968974
prefill_seq_len = prefill_seq_len if prefill_seq_len else 32
969975
ctx_len = ctx_len if ctx_len else constants.INTERN_CTX_LEN
976+
attention_chunk_size = getattr(
977+
getattr(getattr(self, "config", None), "text_config", None), "attention_chunk_size", None
978+
)
970979
chunk_ctx_len = min(
971980
ctx_len,
972-
(
973-
self.config.text_config.attention_chunk_size
974-
if hasattr(self, "config")
975-
else constants.LLAMA4_ATTENTION_CHUNK_SIZE
976-
),
981+
(attention_chunk_size if attention_chunk_size is not None else constants.LLAMA4_ATTENTION_CHUNK_SIZE),
977982
)
978983
if (
979984
prefill_seq_len > constants.LLAMA4_MAX_POSITION_EMBEDDINGS
@@ -1158,7 +1163,7 @@ def get_dummy_pkv_cache(self, config, batch_size, seq_len):
11581163
is_chunked_attention = torch.tensor(
11591164
[bool((i + 1) % 4) for i in range(config.num_hidden_layers)], dtype=torch.bool
11601165
)
1161-
attention_chunk_size = getattr(config, "attention_chunk_size", seq_len)
1166+
attention_chunk_size = getattr(config, "attention_chunk_size", None) or seq_len
11621167
global_cache_shape = [batch_size, n_heads, seq_len, d_head]
11631168
chunked_cache_shape = [
11641169
batch_size,

0 commit comments

Comments
 (0)