Skip to content

Commit c658e0f

Browse files
asmigosw and quic-dhirajku
authored and committed
Comments Addressed
Signed-off-by: asmigosw <asmigosw@qti.qualcomm.com>
Signed-off-by: Dhiraj Kumar Sah <dhirajku@qti.qualcomm.com>
1 parent f7c88be commit c658e0f

File tree

5 files changed

+76
-43
lines changed

5 files changed

+76
-43
lines changed

QEfficient/transformers/models/gemma3/modeling_gemma3.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -536,7 +536,7 @@ def forward(
536536
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
537537

538538
if self.config.torch_dtype == torch.float16:
539-
logger.warning("Accucary might drop with float16 as torch_dtype")
539+
logger.warning("Accuracy might drop with float16 as torch_dtype")
540540

541541
outputs = self.model(
542542
input_ids=input_ids,

QEfficient/transformers/models/modeling_auto.py

Lines changed: 19 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@
7676
from QEfficient.utils.logging_utils import logger
7777
from QEfficient.utils.sampler_utils import get_sampling_inputs_and_outputs
7878

79-
DTYPE_TO_STRING_MAP = {
79+
CUSTOM_IO_DTYPE_MAP = {
8080
torch.float16: "float16",
8181
torch.bfloat16: "bfloat16",
8282
torch.float32: "float16", # Since compiler doesn't support fp32
@@ -463,7 +463,7 @@ def compile(
463463
compile_dir=compile_dir,
464464
compile_only=True,
465465
specializations=specializations,
466-
convert_to_fp16=(DTYPE_TO_STRING_MAP[needed_dtype] == "float16"),
466+
convert_to_fp16=(CUSTOM_IO_DTYPE_MAP[needed_dtype] == "float16"),
467467
mxfp6_matmul=mxfp6_matmul,
468468
mdp_ts_num_devices=num_devices,
469469
aic_num_cores=num_cores,
@@ -804,7 +804,7 @@ def compile(
804804
compile_dir=compile_dir,
805805
compile_only=True,
806806
specializations=specializations,
807-
convert_to_fp16=(DTYPE_TO_STRING_MAP[needed_dtype] == "float16"),
807+
convert_to_fp16=(CUSTOM_IO_DTYPE_MAP[needed_dtype] == "float16"),
808808
mxfp6_matmul=mxfp6_matmul,
809809
mdp_ts_num_devices=num_devices,
810810
aic_num_cores=num_cores,
@@ -1478,17 +1478,17 @@ def compile(
14781478

14791479
custom_io_vision = {}
14801480
needed_dtype = self.model.config.torch_dtype
1481-
kv_cache_dtype = "mxint8" if mxint8_kv_cache else DTYPE_TO_STRING_MAP[needed_dtype]
1481+
kv_cache_dtype = "mxint8" if mxint8_kv_cache else CUSTOM_IO_DTYPE_MAP[needed_dtype]
14821482
molmo = hasattr(self.model.config, "model_type") and self.model.config.model_type == "molmo"
14831483
if molmo:
1484-
custom_io_vision["image_masks"] = DTYPE_TO_STRING_MAP[needed_dtype]
1485-
custom_io_vision["pixel_values"] = DTYPE_TO_STRING_MAP[needed_dtype]
1484+
custom_io_vision["image_masks"] = CUSTOM_IO_DTYPE_MAP[needed_dtype]
1485+
custom_io_vision["pixel_values"] = CUSTOM_IO_DTYPE_MAP[needed_dtype]
14861486

14871487
for output_name in output_names["vision"]:
14881488
if output_name.startswith("past_"):
14891489
custom_io_vision[output_name] = kv_cache_dtype
14901490
else:
1491-
custom_io_vision[output_name] = DTYPE_TO_STRING_MAP[needed_dtype]
1491+
custom_io_vision[output_name] = CUSTOM_IO_DTYPE_MAP[needed_dtype]
14921492

14931493
if vision_onnx_path:
14941494
self.vision_model.onnx_path = vision_onnx_path
@@ -1531,21 +1531,21 @@ def compile(
15311531
for output_name in output_names["lang"]:
15321532
if output_name.endswith("_RetainedState"):
15331533
custom_io_lang[output_name[: -len("_RetainedState")]] = (
1534-
DTYPE_TO_STRING_MAP[needed_dtype] if "vision_embeds" in output_name else kv_cache_dtype
1534+
CUSTOM_IO_DTYPE_MAP[needed_dtype] if "vision_embeds" in output_name else kv_cache_dtype
15351535
)
15361536

15371537
# outputs
15381538
for output_name in output_names["lang"]:
15391539
if output_name.endswith("_RetainedState"):
15401540
custom_io_lang[output_name] = (
1541-
DTYPE_TO_STRING_MAP[needed_dtype] if "vision_embeds" in output_name else kv_cache_dtype
1541+
CUSTOM_IO_DTYPE_MAP[needed_dtype] if "vision_embeds" in output_name else kv_cache_dtype
15421542
)
15431543
self.lang_model._compile(
15441544
compile_dir=compile_dir,
15451545
compile_only=True,
15461546
retained_state=True,
15471547
specializations=specializations["lang"],
1548-
convert_to_fp16=(DTYPE_TO_STRING_MAP[needed_dtype] == "float16"),
1548+
convert_to_fp16=(CUSTOM_IO_DTYPE_MAP[needed_dtype] == "float16"),
15491549
mxfp6_matmul=mxfp6_matmul,
15501550
mdp_ts_num_devices=num_devices,
15511551
aic_num_cores=num_cores,
@@ -2160,19 +2160,19 @@ def compile(
21602160

21612161
custom_io = {}
21622162
needed_dtype = self.model.config.torch_dtype
2163-
kv_cache_dtype = "mxint8" if mxint8_kv_cache else DTYPE_TO_STRING_MAP[needed_dtype]
2163+
kv_cache_dtype = "mxint8" if mxint8_kv_cache else CUSTOM_IO_DTYPE_MAP[needed_dtype]
21642164
# inputs
21652165
for input_name in output_names:
21662166
if input_name.endswith("_RetainedState"):
21672167
custom_io[input_name[: -len("_RetainedState")]] = (
2168-
DTYPE_TO_STRING_MAP[needed_dtype] if "pixel_values" in input_name else kv_cache_dtype
2168+
CUSTOM_IO_DTYPE_MAP[needed_dtype] if "pixel_values" in input_name else kv_cache_dtype
21692169
)
21702170

21712171
# outputs
21722172
for output_name in output_names:
21732173
if output_name.endswith("_RetainedState"):
21742174
custom_io[output_name] = (
2175-
DTYPE_TO_STRING_MAP[needed_dtype] if "pixel_values" in output_name else kv_cache_dtype
2175+
CUSTOM_IO_DTYPE_MAP[needed_dtype] if "pixel_values" in output_name else kv_cache_dtype
21762176
)
21772177

21782178
# TODO this hould be removed once the continous batching is supported for all the models.
@@ -2185,7 +2185,7 @@ def compile(
21852185
compile_only=True,
21862186
retained_state=True,
21872187
specializations=specializations,
2188-
convert_to_fp16=(DTYPE_TO_STRING_MAP[needed_dtype] == "float16"),
2188+
convert_to_fp16=(CUSTOM_IO_DTYPE_MAP[needed_dtype] == "float16"),
21892189
mxfp6_matmul=mxfp6_matmul,
21902190
custom_io=custom_io,
21912191
mdp_ts_num_devices=num_devices,
@@ -3437,7 +3437,7 @@ def compile(
34373437
kv_cache_dtype = "mxint8" if mxint8_kv_cache else "float16"
34383438
custom_io = {}
34393439
needed_dtype = self.model.config.torch_dtype
3440-
kv_cache_dtype = "mxint8" if mxint8_kv_cache else DTYPE_TO_STRING_MAP[needed_dtype]
3440+
kv_cache_dtype = "mxint8" if mxint8_kv_cache else CUSTOM_IO_DTYPE_MAP[needed_dtype]
34413441

34423442
for suffix in ["", "_RetainedState"]:
34433443
for i in range(self.num_layers):
@@ -3449,7 +3449,7 @@ def compile(
34493449
compile_only=True,
34503450
retained_state=True,
34513451
specializations=specializations,
3452-
convert_to_fp16=(DTYPE_TO_STRING_MAP[needed_dtype] == "float16"),
3452+
convert_to_fp16=(CUSTOM_IO_DTYPE_MAP[needed_dtype] == "float16"),
34533453
mxfp6_matmul=mxfp6_matmul,
34543454
custom_io=custom_io,
34553455
mdp_ts_num_devices=num_devices,
@@ -3795,7 +3795,7 @@ def compile(
37953795
output_names = self.model.get_output_names()
37963796

37973797
needed_dtype = self.model.config.torch_dtype
3798-
kv_cache_dtype = DTYPE_TO_STRING_MAP[needed_dtype]
3798+
kv_cache_dtype = CUSTOM_IO_DTYPE_MAP[needed_dtype]
37993799
custom_io = {}
38003800

38013801
custom_io["input_features"] = kv_cache_dtype
@@ -3816,7 +3816,7 @@ def compile(
38163816
compile_only=True,
38173817
retained_state=True,
38183818
specializations=specializations,
3819-
convert_to_fp16=(DTYPE_TO_STRING_MAP[needed_dtype] == "float16"),
3819+
convert_to_fp16=(CUSTOM_IO_DTYPE_MAP[needed_dtype] == "float16"),
38203820
mxfp6_matmul=mxfp6_matmul,
38213821
mdp_ts_num_devices=num_devices,
38223822
aic_num_cores=num_cores,
@@ -4224,7 +4224,7 @@ def cloud_ai_100_feature_generate(
42244224
torch.nn.functional.pad(inputs["input_values"], (0, self.seq_len - input_ids_len), "constant", 0)
42254225
)
42264226
needed_dtype = self.model.config.torch_dtype
4227-
input_values = input_values.astype(DTYPE_TO_STRING_MAP[needed_dtype])
4227+
input_values = input_values.astype(CUSTOM_IO_DTYPE_MAP[needed_dtype])
42284228
inputs = dict(input_values=input_values)
42294229
outputs = self.qpc_session.run(inputs)
42304230

QEfficient/transformers/models/molmo/modeling_molmo.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
from QEfficient.transformers.modeling_attn_mask_utils import _create_causal_mask
1919
from QEfficient.utils import constants
2020
from QEfficient.utils._utils import IOInfo, get_padding_shape_from_config
21+
from QEfficient.utils.constants import MIN_MASKED_ATTENTION_VALUE
2122

2223

2324
def _non_meta_init_device(config) -> torch.device:
@@ -54,7 +55,9 @@ def eager_attention_forward(
5455

5556
attn_weights = torch.matmul(q, k.transpose(2, 3)) * scale_factor
5657
if attention_mask is not None:
57-
attn_weights = torch.where(attention_mask, torch.tensor(-10000.0, dtype=k.dtype), attn_weights)
58+
attn_weights = torch.where(
59+
attention_mask, torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=k.dtype), attn_weights
60+
)
5861

5962
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(q.dtype)
6063
attn_output = torch.matmul(attn_weights, v)

tests/configs/image_text_model_configs.json

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,5 +204,43 @@
204204
"num_layers": 1,
205205
"additional_params": {}
206206
}
207+
],
208+
"image_text_custom_dtype_models":[
209+
{
210+
"model_name": "OpenGVLab/InternVL2_5-1B",
211+
"model_type": "internvl_chat",
212+
"batch_size": 1,
213+
"prompt_len": 384,
214+
"ctx_len": 512,
215+
"img_size": null,
216+
"img_url": "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg",
217+
"text_prompt": "Please describe the image in detail.",
218+
"num_layers": 2,
219+
"additional_params": {}
220+
},
221+
{
222+
"model_name": "google/gemma-3-4b-it",
223+
"model_type": "gemma3",
224+
"batch_size": 1,
225+
"prompt_len": 128,
226+
"ctx_len": 3072,
227+
"img_size": 896,
228+
"img_url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png",
229+
"text_prompt": "Can you describe the image in detail.",
230+
"num_layers": 6,
231+
"additional_params": {}
232+
},
233+
{
234+
"model_name": "llava-hf/llava-1.5-7b-hf",
235+
"model_type": "llava",
236+
"batch_size": 1,
237+
"prompt_len": 784,
238+
"ctx_len": 1024,
239+
"img_size": 336,
240+
"img_url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg",
241+
"text_prompt": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud",
242+
"num_layers": 1,
243+
"additional_params": {}
244+
}
207245
]
208246
}

tests/transformers/models/image_text_to_text/test_image_text_to_text_models.py

Lines changed: 14 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,11 @@
3838
with open(CONFIG_PATH, "r") as f:
3939
config_data = json.load(f)
4040
multimodal_models = config_data["image_text_models"]
41+
custom_dtype_support_models = config_data["image_text_custom_dtype_models"]
4142
test_mm_models = [model_config["model_name"] for model_config in multimodal_models]
4243
model_config_dict = {model["model_name"]: model for model in multimodal_models}
44+
test_custom_dtype_support_models = [model_config["model_name"] for model_config in custom_dtype_support_models]
45+
custom_dtype_support_models_config_dict = {model["model_name"]: model for model in custom_dtype_support_models}
4346

4447

4548
def load_image_text_to_text_model(model_config):
@@ -122,7 +125,7 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(
122125
qnn_config: Optional[str] = None,
123126
config: Optional[AutoConfig] = None,
124127
img_size: Optional[int] = None,
125-
torch_dtype: Optional[int] = torch.float32,
128+
torch_dtype: Optional[torch.dtype] = torch.float32,
126129
):
127130
"""
128131
Unified function to test PyTorch model, PyTorch KV model, ONNX model, and Cloud AI 100 model.
@@ -381,40 +384,29 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(model_name, kv_offload
381384

382385
@pytest.mark.on_qaic
383386
@pytest.mark.multimodal
384-
@pytest.mark.parametrize("model_name", test_mm_models)
385-
@pytest.mark.parametrize("kv_offload", [True, False])
387+
@pytest.mark.parametrize("model_name", test_custom_dtype_support_models)
388+
@pytest.mark.parametrize("kv_offload", [True])
386389
@pytest.mark.parametrize("torch_dtype", [torch.float16])
387390
def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100_custom_dtype(model_name, kv_offload, torch_dtype):
388391
"""
389392
Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, without continuous batching.
390393
``Mandatory`` Args:
391394
:model_name (str): Hugging Face Model Card name, Example: ``gpt2``
392395
"""
393-
if model_name in [
394-
"meta-llama/Llama-4-Scout-17B-16E-Instruct",
395-
"allenai/Molmo-7B-D-0924",
396-
"meta-llama/Llama-3.2-11B-Vision-Instruct",
397-
]:
398-
pytest.skip("Test skipped for this model due to some issues.")
399-
if (
400-
model_name in ["OpenGVLab/InternVL2_5-1B", "OpenGVLab/InternVL3_5-1B", "Qwen/Qwen2.5-VL-3B-Instruct"]
401-
and not kv_offload
402-
):
403-
pytest.skip("These models require kv_offload=True for testing.")
404-
# Get img_size for standard models, None for InternVL and Molmo
405-
img_size = model_config_dict[model_name].get("img_size")
396+
# Get img_size for standard models, None for InternVL
397+
img_size = custom_dtype_support_models_config_dict[model_name].get("img_size")
406398

407399
# TODO: Add custom dtype support in ORT and Pytorch_KV APIs
408400
check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(
409401
model_name=model_name,
410-
prompt_len=model_config_dict[model_name]["prompt_len"],
411-
ctx_len=model_config_dict[model_name]["ctx_len"],
402+
prompt_len=custom_dtype_support_models_config_dict[model_name]["prompt_len"],
403+
ctx_len=custom_dtype_support_models_config_dict[model_name]["ctx_len"],
412404
max_gen_len=NEW_GENERATION_TOKENS,
413405
img_size=img_size,
414-
img_url=model_config_dict[model_name]["img_url"],
415-
query=model_config_dict[model_name]["text_prompt"],
416-
n_layer=model_config_dict[model_name]["num_layers"],
417-
batch_size=model_config_dict[model_name]["batch_size"],
406+
img_url=custom_dtype_support_models_config_dict[model_name]["img_url"],
407+
query=custom_dtype_support_models_config_dict[model_name]["text_prompt"],
408+
n_layer=custom_dtype_support_models_config_dict[model_name]["num_layers"],
409+
batch_size=custom_dtype_support_models_config_dict[model_name]["batch_size"],
418410
kv_offload=kv_offload,
419411
torch_dtype=torch_dtype,
420412
)

0 commit comments

Comments
 (0)