Skip to content

Commit f7c88be

Browse files
committed
Updated SoftMax calculation precision for all modeling files.
Enabled CI tests for fp16-based LMs, embedding, and sequence classification models. Modified the CI-based config for LLM tests. Embedding models have a high MAD for fp16 exported models (~0.015). Certain CausalLMs cause a token mismatch after a few tokens in the fp16 setup. The Whisper model has a Clip operator issue for fp16 exported models, so it is not enabled yet. Signed-off-by: Dhiraj Kumar Sah <dhirajku@qti.qualcomm.com>
1 parent df78631 commit f7c88be

30 files changed

+665
-369
lines changed

QEfficient/transformers/models/falcon/modeling_falcon.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,7 +156,9 @@ def forward(
156156
attention_scores = torch.where(
157157
attention_mask, torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=self.config.torch_dtype), attention_scores
158158
)
159-
attention_scores = F.softmax(attention_scores + attention_mask, dim=-1, dtype=hidden_states.dtype)
159+
attention_scores = F.softmax(attention_scores + attention_mask, dim=-1, dtype=torch.float32).to(
160+
query_layer.dtype
161+
)
160162
# It is unclear why neither dropout nor head_mask is applied here (while it is with alibi).
161163
attn_output = attention_scores @ value_layer
162164

QEfficient/transformers/models/gemma/modeling_gemma.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ def eager_attention_forward(
114114
attention_mask, torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=module.config.torch_dtype), attn_weights
115115
)
116116

117-
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=module.config.torch_dtype).to(query.dtype)
117+
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
118118
attn_output = torch.matmul(attn_weights, value_states)
119119
attn_output = attn_output.transpose(1, 2).contiguous()
120120

QEfficient/transformers/models/gemma2/modeling_gemma2.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ def eager_attention_forward(
121121
attention_mask, torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=module.config.torch_dtype), attn_weights
122122
)
123123

124-
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=module.config.torch_dtype).to(query.dtype)
124+
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
125125
attn_output = torch.matmul(attn_weights, value_states)
126126
attn_output = attn_output.transpose(1, 2).contiguous()
127127

QEfficient/transformers/models/gemma3/modeling_gemma3.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,7 @@ def eager_attention_forward(
166166
attention_mask, torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=module.config.torch_dtype), attn_weights
167167
)
168168

169-
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=module.config.torch_dtype).to(query.dtype)
169+
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
170170
attn_output = torch.matmul(attn_weights, value_states)
171171
attn_output = attn_output.transpose(1, 2).contiguous()
172172

@@ -277,7 +277,7 @@ def forward(
277277
)
278278

279279
# upcast attention to fp32
280-
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=self.config.torch_dtype).to(query_states.dtype)
280+
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
281281
attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
282282
attn_output = torch.matmul(attn_weights, value_states)
283283

QEfficient/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ def eager_attention_forward(
8686
attn_weights = torch.where(
8787
attention_mask, torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=module.config.torch_dtype), attn_weights
8888
)
89-
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=module.config.torch_dtype).to(query.dtype)
89+
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
9090
attn_output = torch.matmul(attn_weights, value_states)
9191
attn_output = attn_output.transpose(1, 2).contiguous()
9292

QEfficient/transformers/models/gpt_oss/modeling_gpt_oss.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -661,7 +661,7 @@ def eager_attention_forward_blocked(
661661
)
662662
combined_logits = torch.cat([curr_attn_weights, sinks], dim=-1)
663663
combined_logits = combined_logits - combined_logits.max(dim=-1, keepdim=True).values
664-
curr_attn_weights = nn.functional.softmax(combined_logits, dim=-1, dtype=module.config.torch_dtype)
664+
curr_attn_weights = nn.functional.softmax(combined_logits, dim=-1, dtype=torch.float32)
665665
curr_attn_weights = curr_attn_weights[..., :-1]
666666
out_block = torch.matmul(curr_attn_weights, value_states)
667667
outs.append(out_block)
@@ -724,7 +724,7 @@ def opt_eager_attention_forward_blocked(
724724
)
725725
combined_logits = torch.cat([curr_attn_weights, sinks], dim=-1)
726726
combined_logits = combined_logits - combined_logits.max(dim=-1, keepdim=True).values
727-
curr_attn_weights = nn.functional.softmax(combined_logits, dim=-1, dtype=module.config.torch_dtype)
727+
curr_attn_weights = nn.functional.softmax(combined_logits, dim=-1, dtype=torch.float32)
728728
curr_attn_weights = curr_attn_weights[..., :-1]
729729
out_block = torch.matmul(curr_attn_weights, v_block)
730730
outs.append(out_block)

QEfficient/transformers/models/granite/modeling_granite.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ def eager_attention_forward(
113113
attention_mask, torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=module.config.torch_dtype), attn_weights
114114
)
115115

116-
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=module.config.torch_dtype).to(query.dtype)
116+
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
117117
attn_output = torch.matmul(attn_weights, value_states)
118118
attn_output = attn_output.transpose(1, 2).contiguous()
119119

QEfficient/transformers/models/granitemoe/modeling_granitemoe.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -190,7 +190,7 @@ def eager_attention_forward(
190190
)
191191

192192
# upcast attention to fp32
193-
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=module.config.torch_dtype).to(query.dtype)
193+
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
194194
attn_output = torch.matmul(attn_weights, value_states)
195195
attn_output = attn_output.transpose(1, 2).contiguous()
196196
return attn_output, attn_weights

QEfficient/transformers/models/grok_1/modeling_grok1.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ def forward(self, hidden_states: torch.Tensor):
149149
hidden_states = hidden_states.view(-1, hidden_dim)
150150
router_logits = self.gate(hidden_states)
151151

152-
routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
152+
routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float32)
153153
routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
154154
# Creating experts mask and routing weights masked
155155
awesome_experts_mask_1 = (

QEfficient/transformers/models/llama/modeling_llama.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,7 @@ def eager_attention_forward(
114114
attention_mask, torch.tensor(MIN_MASKED_ATTENTION_VALUE, dtype=module.config.torch_dtype), attn_weights
115115
)
116116

117-
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=module.config.torch_dtype).to(query.dtype)
117+
attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
118118
attn_output = torch.matmul(attn_weights, value_states)
119119
attn_output = attn_output.transpose(1, 2).contiguous()
120120

0 commit comments

Comments (0)