Skip to content

Commit dc95ccf

Browse files
committed
Code Cleaning Done 1
Signed-off-by: Dipankar Sarkar <dipankar@qti.qualcomm.com>
1 parent e205446 commit dc95ccf

File tree

5 files changed

+121
-58
lines changed

5 files changed

+121
-58
lines changed

QEfficient/transformers/cache_utils.py

Lines changed: 3 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -157,7 +157,6 @@ def write_only(self, key_states, value_states, cache_kwargs):
157157
self.keys = key_states
158158
self.values = value_states
159159
else:
160-
# breakpoint()
161160
position_ids = cache_kwargs.get("position_ids")
162161
batch_index = cache_kwargs.get("batch_index", None)  # Check and fetch batch index value from the kwargs
163162

@@ -192,9 +191,8 @@ def update(
192191
Return:
193192
A tuple containing the updated key and value states.
194193
"""
195-
# breakpoint()
194+
196195
# Update the cache
197-
# if not self.is_initialized:
198196

199197
if self.keys is None:
200198
self.keys = key_states
@@ -327,11 +325,10 @@ def __init__(
327325
**kwargs,
328326
):
329327
# Remove layer_classes if present to avoid duplicate argument
330-
# breakpoint()
328+
331329
kwargs.pop("layers", None)
332330
from transformers.cache_utils import Cache # Import here to avoid circular import
333331

334-
# breakpoint()
335332
layers = []
336333
# If a config is passed, use it to infer the layer types and initialize accordingly
337334
if len(layers) == 0:
@@ -373,7 +370,7 @@ def read_only(self, layer_idx, cache_kwargs):
373370
Return:
374371
A tuple containing the updated key and value states.
375372
"""
376-
# breakpoint()
373+
377374
return self.layers[layer_idx].read_only(cache_kwargs)
378375

379376
def read_only_blockedKV(self, start_index, end_index, layer_idx, cache_kwargs):
@@ -439,18 +436,6 @@ def update3D(
439436
self.append_new_layers(layer_idx)
440437
return self.layers[layer_idx].update3D(key_states, value_states, cache_kwargs)
441438

442-
# def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
443-
# """Returns the sequence length of the cached states. A layer index can be optionally passed."""
444-
# # TODO: deprecate this function in favor of `cache_position`
445-
# breakpoint()
446-
# is_empty_layer = (
447-
# len(self.key_cache) == 0 # no cache in any layer
448-
# or len(self.key_cache) <= layer_idx # skipped `layer_idx` and hasn't run a layer with cache after it
449-
# or len(self.key_cache[layer_idx]) == 0 # the layer has no cache
450-
# )
451-
# layer_seq_length = self.key_cache[layer_idx].shape[-2] if not is_empty_layer else 0
452-
# return layer_seq_length
453-
454439

455440
class QEffEncoderDecoderCache(EncoderDecoderCache):
456441
"""

QEfficient/transformers/models/pytorch_transforms.py

Lines changed: 104 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,11 @@
113113
MistralModel,
114114
MistralRMSNorm,
115115
)
116+
from transformers.models.mistral3.modeling_mistral3 import (
117+
Mistral3ForConditionalGeneration,
118+
Mistral3Model,
119+
Mistral3RMSNorm,
120+
)
116121
from transformers.models.mixtral.modeling_mixtral import (
117122
MixtralAttention,
118123
MixtralDecoderLayer,
@@ -135,6 +140,13 @@
135140
MllamaVisionModel,
136141
)
137142
from transformers.models.mpt.modeling_mpt import MptAttention, MptBlock, MptForCausalLM, MptModel
143+
from transformers.models.olmo2.modeling_olmo2 import (
144+
Olmo2Attention,
145+
Olmo2DecoderLayer,
146+
Olmo2ForCausalLM,
147+
Olmo2Model,
148+
Olmo2RMSNorm,
149+
)
138150
from transformers.models.phi.modeling_phi import PhiAttention, PhiDecoderLayer, PhiForCausalLM, PhiModel
139151
from transformers.models.phi3.modeling_phi3 import (
140152
Phi3Attention,
@@ -143,6 +155,7 @@
143155
Phi3Model,
144156
Phi3RMSNorm,
145157
)
158+
from transformers.models.pixtral.modeling_pixtral import PixtralRMSNorm, PixtralVisionModel
146159
from transformers.models.qwen2.modeling_qwen2 import (
147160
Qwen2Attention,
148161
Qwen2DecoderLayer,
@@ -155,6 +168,7 @@
155168
Qwen2_5_VLAttention,
156169
Qwen2_5_VLDecoderLayer,
157170
Qwen2_5_VLForConditionalGeneration,
171+
Qwen2_5_VLModel,
158172
Qwen2_5_VLTextModel,
159173
Qwen2_5_VLVisionAttention,
160174
)
@@ -168,6 +182,15 @@
168182
Qwen3Model,
169183
Qwen3RMSNorm,
170184
)
185+
from transformers.models.qwen3_moe.modeling_qwen3_moe import (
186+
Qwen3MoeAttention,
187+
Qwen3MoeDecoderLayer,
188+
Qwen3MoeForCausalLM,
189+
Qwen3MoeModel,
190+
Qwen3MoeRMSNorm,
191+
Qwen3MoeRotaryEmbedding,
192+
Qwen3MoeSparseMoeBlock,
193+
)
171194
from transformers.models.qwen3_vl.modeling_qwen3_vl import (
172195
Qwen3VLForConditionalGeneration,
173196
Qwen3VLModel,
@@ -327,6 +350,11 @@
327350
QEffMistralForCausalLM,
328351
QEffMistralModel,
329352
)
353+
from QEfficient.transformers.models.mistral3.modeling_mistral3 import (
354+
QEffMistral3ForConditionalGeneration,
355+
QEffMistral3Model,
356+
QEffPixtralVisionModel,
357+
)
330358
from QEfficient.transformers.models.mixtral_moe.modeling_mixtral import (
331359
QEffMixtralAttention,
332360
QeffMixtralDecoderLayer,
@@ -347,12 +375,25 @@
347375
QEffMllamaTextSelfAttention,
348376
QEffMllamaVisionModel,
349377
)
378+
from QEfficient.transformers.models.molmo.modeling_molmo import (
379+
QEffMolmo,
380+
QEffMolmoBlock,
381+
QEffMolmoModel,
382+
QEffMolmoSequentialBlock,
383+
QEffMultiHeadDotProductAttention,
384+
)
350385
from QEfficient.transformers.models.mpt.modeling_mpt import (
351386
QEffMptAttention,
352387
QEffMptBlock,
353388
QEffMptForCausalLM,
354389
QEFfMptModel,
355390
)
391+
from QEfficient.transformers.models.olmo2.modeling_olmo2 import (
392+
QEffOlmo2Attention,
393+
QEffOlmo2DecoderLayer,
394+
QEffOlmo2ForCausalLM,
395+
QEffOlmo2Model,
396+
)
356397
from QEfficient.transformers.models.phi.modeling_phi import (
357398
QEffPhiAttention,
358399
QEffPhiDecoderLayer,
@@ -375,9 +416,10 @@
375416
QEffQwen2_5_VisionTransformerPretrainedModel,
376417
QEffQwen2_5_VLAttention,
377418
QEffQwen2_5_VLDecoderLayer,
419+
QEffQwen2_5_VLModel,
378420
QEffQwen2_5_VLTextModel,
379-
# QEffQwen2_5_VLModel,
380421
QEffQwen2_5_VLVisionAttention,
422+
QEffQwen_2_5_vl_DecoderWrapper,
381423
QEffQwen_2_5_vl_ForConditionalGeneration,
382424
)
383425
from QEfficient.transformers.models.qwen3.modeling_qwen3 import (
@@ -386,13 +428,20 @@
386428
QEffQwen3ForCausalLM,
387429
QEffQwen3Model,
388430
)
431+
from QEfficient.transformers.models.qwen3_moe.modeling_qwen3_moe import (
432+
QEffQwen3MoeAttention,
433+
QEffQwen3MoeDecoderLayer,
434+
QEffQwen3MoeForCausalLM,
435+
QEffQwen3MoeModel,
436+
QEffQwen3MoeRotaryEmbedding,
437+
QEffQwen3MoeSparseMoeBlock,
438+
)
389439
from QEfficient.transformers.models.qwen3_vl.modeling_qwen3_vl import (
390440
QEffQwen3VLForConditionalGeneration,
391441
QEffQwen3VLModel,
392442
QEffQwen3VLTextAttention,
393443
QEffQwen3VLTextDecoderLayer,
394444
QEffQwen3VLTextModel,
395-
# QEffQwen3VLTextRotaryEmbedding,
396445
QEffQwen3VLVisionAttention,
397446
QEffQwen3VLVisionModel,
398447
)
@@ -430,16 +479,20 @@ class CustomOpsTransform(ModuleMappingTransform):
430479
LlamaRMSNorm: CustomRMSNormAIC,
431480
Llama4TextRMSNorm: CustomRMSNormAIC,
432481
MistralRMSNorm: CustomRMSNormAIC,
482+
Mistral3RMSNorm: CustomRMSNormAIC,
433483
MixtralRMSNorm: CustomRMSNormAIC,
434484
Phi3RMSNorm: CustomRMSNormAIC,
435485
Qwen2RMSNorm: CustomRMSNormAIC,
436486
Qwen3RMSNorm: CustomRMSNormAIC,
437487
Qwen2_5RMSNorm: CustomRMSNormAIC,
438488
MllamaTextRMSNorm: CustomRMSNormAIC,
439489
GraniteRMSNorm: CustomRMSNormAIC,
490+
PixtralRMSNorm: CustomRMSNormAIC,
440491
GraniteMoeRMSNorm: CustomRMSNormAIC,
441-
Qwen3VLTextRMSNorm: CustomRMSNormAIC,
492+
Qwen3MoeRMSNorm: CustomRMSNormAIC,
442493
Gemma3RMSNorm: QEffGemma3CustomRMSNormAIC,
494+
Qwen3VLTextRMSNorm: CustomRMSNormAIC,
495+
Olmo2RMSNorm: CustomRMSNormAIC,
443496
}
444497

445498

@@ -492,12 +545,12 @@ class KVCacheTransform(ModuleMappingTransform):
492545
GemmaModel: QEffGemmaModel,
493546
GemmaForCausalLM: QEffGemmaForCausalLM,
494547
# Qwen3Moe
495-
# Qwen3MoeForCausalLM: QEffQwen3MoeForCausalLM,
496-
# Qwen3MoeModel: QEffQwen3MoeModel,
497-
# Qwen3MoeDecoderLayer: QEffQwen3MoeDecoderLayer,
498-
# Qwen3MoeAttention: QEffQwen3MoeAttention,
499-
# Qwen3MoeRotaryEmbedding: QEffQwen3MoeRotaryEmbedding,
500-
# Qwen3MoeSparseMoeBlock: QEffQwen3MoeSparseMoeBlock,
548+
Qwen3MoeForCausalLM: QEffQwen3MoeForCausalLM,
549+
Qwen3MoeModel: QEffQwen3MoeModel,
550+
Qwen3MoeDecoderLayer: QEffQwen3MoeDecoderLayer,
551+
Qwen3MoeAttention: QEffQwen3MoeAttention,
552+
Qwen3MoeRotaryEmbedding: QEffQwen3MoeRotaryEmbedding,
553+
Qwen3MoeSparseMoeBlock: QEffQwen3MoeSparseMoeBlock,
501554
# Gemma2
502555
Gemma2Attention: QEffGemma2Attention,
503556
Gemma2DecoderLayer: QEffGemma2DecoderLayer,
@@ -545,6 +598,9 @@ class KVCacheTransform(ModuleMappingTransform):
545598
MistralDecoderLayer: QEffMistralDecoderLayer,
546599
MistralModel: QEffMistralModel,
547600
MistralForCausalLM: QEffMistralForCausalLM,
601+
# Mistral3
602+
Mistral3ForConditionalGeneration: QEffMistral3ForConditionalGeneration,
603+
Mistral3Model: QEffMistral3Model,
548604
# Mixtral
549605
MixtralAttention: QEffMixtralAttention,
550606
MixtralSparseMoeBlock: QEffMixtralSparseMoeBlock,
@@ -566,36 +622,34 @@ class KVCacheTransform(ModuleMappingTransform):
566622
PhiDecoderLayer: QEffPhiDecoderLayer,
567623
PhiModel: QEffPhiModel,
568624
PhiForCausalLM: QEffPhiForCausalLM,
625+
# Pixtral
626+
PixtralVisionModel: QEffPixtralVisionModel,
569627
# Qwen2
570628
Qwen2Attention: QEffQwen2Attention,
571629
Qwen2DecoderLayer: QEffQwen2DecoderLayer,
572630
Qwen2Model: QEffQwen2Model,
573631
Qwen2ForCausalLM: QEffQwen2ForCausalLM,
574-
# Qwen2.5 VL
575-
Qwen2_5_VLForConditionalGeneration: QEffQwen_2_5_vl_ForConditionalGeneration,
576-
# Qwen2_5_VLModel: QEffQwen2_5_VLModel,
577-
Qwen2_5_VLTextModel: QEffQwen2_5_VLTextModel,
578632
# Qwen3
579633
Qwen3Attention: QEffQwen3Attention,
580634
Qwen3DecoderLayer: QEffQwen3DecoderLayer,
581635
Qwen3Model: QEffQwen3Model,
582636
Qwen3ForCausalLM: QEffQwen3ForCausalLM,
583637
# Qwen2.5 VL
584-
# Qwen2_5_VLForConditionalGeneration: QEffQwen_2_5_vl_ForConditionalGeneration,
585-
# Qwen2_5_VLModel: QEffQwen2_5_VLModel,
638+
Qwen2_5_VLForConditionalGeneration: QEffQwen_2_5_vl_ForConditionalGeneration,
639+
Qwen2_5_VLModel: QEffQwen2_5_VLModel,
586640
Qwen2_5_VLAttention: QEffQwen2_5_VLAttention,
587641
Qwen2_5_VLDecoderLayer: QEffQwen2_5_VLDecoderLayer,
588642
Qwen2_5_VisionTransformerPretrainedModel: QEffQwen2_5_VisionTransformerPretrainedModel,
589643
Qwen2_5_VLVisionAttention: QEffQwen2_5_VLVisionAttention,
590-
# Qwen3vl
644+
Qwen2_5_VLTextModel: QEffQwen2_5_VLTextModel,
645+
# Qwen3 VL
591646
Qwen3VLForConditionalGeneration: QEffQwen3VLForConditionalGeneration,
592647
Qwen3VLModel: QEffQwen3VLModel,
593648
Qwen3VLTextAttention: QEffQwen3VLTextAttention,
594649
Qwen3VLTextDecoderLayer: QEffQwen3VLTextDecoderLayer,
595650
Qwen3VLVisionAttention: QEffQwen3VLVisionAttention,
596651
Qwen3VLVisionModel: QEffQwen3VLVisionModel,
597652
Qwen3VLTextModel: QEffQwen3VLTextModel,
598-
# Qwen3VLTextRotaryEmbedding: QEffQwen3VLTextRotaryEmbedding, # reusing decoder layer for rotary embedding as they are tightly coupled in forward pass
599653
# Starcoder2
600654
Starcoder2Attention: QEffStarcoder2Attention,
601655
Starcoder2DecoderLayer: QEFFStarcoder2DecoderLayer,
@@ -606,6 +660,11 @@ class KVCacheTransform(ModuleMappingTransform):
606660
GPTBigCodeBlock: QEffGPTBigCodeBlock,
607661
GPTBigCodeModel: QEffGPTBigCodeModel,
608662
GPTBigCodeForCausalLM: QEffGPTBigCodeForCausalLM,
663+
# Olmo2
664+
Olmo2Attention: QEffOlmo2Attention,
665+
Olmo2DecoderLayer: QEffOlmo2DecoderLayer,
666+
Olmo2Model: QEffOlmo2Model,
667+
Olmo2ForCausalLM: QEffOlmo2ForCausalLM,
609668
# Whisper encoder and decoder layers
610669
WhisperPositionalEmbedding: QEffWhisperPositionalEmbedding,
611670
WhisperAttention: QEffWhisperAttention,
@@ -675,7 +734,7 @@ class SpDTransform:
675734
# Llama
676735
QEffLlamaForCausalLM,
677736
QEffQwen2ForCausalLM,
678-
# QEffQwen3ForCausalLM,
737+
QEffQwen3ForCausalLM,
679738
}
680739

681740
@classmethod
@@ -741,7 +800,7 @@ class SamplerTransform:
741800
QEffMptForCausalLM,
742801
QEffPhi3ForCausalLM,
743802
QEffQwen2ForCausalLM,
744-
# QEffQwen_2_5_vl_DecoderWrapper,
803+
QEffQwen_2_5_vl_DecoderWrapper,
745804
}
746805

747806
@classmethod
@@ -787,6 +846,32 @@ class KVCacheExternalModuleMapperTransform(ExternalModuleMapperTransform):
787846
"get_qeff_language_decoder": QEffInternVLModel.get_qeff_language_decoder,
788847
},
789848
"InternVisionEmbeddings": {"forward": QEffInternVisionEmbeddings.forward},
849+
# Mapping for Molmo
850+
"MolmoForCausalLM": {
851+
"forward": QEffMolmoModel.forward,
852+
"get_qeff_vision_encoder": QEffMolmoModel.get_qeff_vision_encoder,
853+
"get_qeff_language_decoder": QEffMolmoModel.get_qeff_language_decoder,
854+
"get_specializations": QEffMolmoModel.get_specializations,
855+
"get_onnx_dynamic_axes": QEffMolmoModel.get_onnx_dynamic_axes,
856+
"get_output_names": QEffMolmoModel.get_output_names,
857+
"get_dummy_inputs": QEffMolmoModel.get_dummy_inputs,
858+
"get_inputs_info": QEffMolmoModel.get_inputs_info,
859+
},
860+
"RMSLayerNorm": {"forward": CustomRMSNormAIC.forward},
861+
# "MolmoForCausalLM": {"forward": QEffMolmoForCausalLM.forward},
862+
"Molmo": {"forward": QEffMolmo.forward},
863+
"MolmoSequentialBlock": {
864+
"forward": QEffMolmoSequentialBlock.forward,
865+
"attention": QEffMolmoBlock.attention,
866+
"__qeff_init__": QEffMolmoBlock.__qeff_init__,
867+
},
868+
"MolmoBlock": {
869+
"attention": QEffMolmoBlock.attention,
870+
"__qeff_init__": QEffMolmoBlock.__qeff_init__,
871+
},
872+
"MultiHeadDotProductAttention": {
873+
"forward": QEffMultiHeadDotProductAttention.forward,
874+
},
790875
# Mapping for grok1 model
791876
"Grok1ModelForCausalLM": {"forward": QEffGrok1ModelForCausalLM.forward},
792877
"Grok1Model": {

QEfficient/transformers/models/qwen3_vl/modeling_qwen3_vl.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -309,7 +309,6 @@ def eager_attention_forward(
309309

310310

311311
class QEffQwen3VLTextAttention(Qwen3VLTextAttention):
312-
313312
def forward(
314313
self,
315314
hidden_states: torch.Tensor,
@@ -1024,7 +1023,6 @@ def prepare_inputs_for_generation(self, inputs, prefill_seq_len=128, batch_size=
10241023
inputs["position_ids"], pad=(0, padded_len - input_ids_length), mode="constant", value=-1
10251024
)
10261025

1027-
10281026
return inputs
10291027

10301028
def get_inputs_info(self):

0 commit comments

Comments
 (0)