@@ -150,11 +150,19 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         shared_output = self.shared_experts(hidden_states)
         # router_logits: (num_tokens, n_experts)
         router_logits, _ = self.gate(hidden_states)
-        final_hidden_states = self.experts(hidden_states=hidden_states,
-                                           router_logits=router_logits)
+        if hidden_states.dtype != torch.float16:
+            final_hidden_states = self.experts(
+                hidden_states=hidden_states,
+                router_logits=router_logits) * self.routed_scaling_factor
+        else:
+            final_hidden_states = self.experts(hidden_states=hidden_states,
+                                               router_logits=router_logits)
         if shared_output is not None:
-            final_hidden_states = final_hidden_states + shared_output \
-                * (1. / self.routed_scaling_factor)
+            if hidden_states.dtype != torch.float16:
+                final_hidden_states = final_hidden_states + shared_output
+            else:
+                final_hidden_states = final_hidden_states + shared_output \
+                    * (1. / self.routed_scaling_factor)
         if self.tp_size > 1:
             final_hidden_states = tensor_model_parallel_all_reduce(
                 final_hidden_states)
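The FP16 special-casing exists because multiplying the routed expert output by routed_scaling_factor (16.0 in the published DeepSeek-V2 config) can overflow float16, whose largest finite value is 65504. A minimal sketch of the failure mode and the workaround, with illustrative magnitudes that are not taken from the model:

```python
import torch

# Illustrative magnitudes; 16.0 matches routed_scaling_factor in the
# DeepSeek-V2 config, and 65504 is the largest finite float16 value.
routed_scaling_factor = 16.0
expert_out = torch.full((4,), 5000.0, dtype=torch.float16)

# Applying the non-FP16 path naively in fp16: 5000 * 16 = 80000 -> inf.
print(expert_out * routed_scaling_factor)  # tensor([inf, inf, inf, inf])

# FP16 path: keep the routed output unscaled and divide the shared-expert
# contribution instead, so the combined tensor is carried at
# 1/routed_scaling_factor scale and stays finite.
shared_out = torch.full((4,), 5000.0, dtype=torch.float16)
combined = expert_out + shared_out * (1.0 / routed_scaling_factor)
print(combined)  # finite, about 5312 per element
```

The FP16 result is therefore the non-FP16 result divided by routed_scaling_factor; the decoder-layer hunk below compensates for that constant.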
@@ -557,12 +565,14 @@ def forward(
         )
 
         # Fully Connected
-        if isinstance(self.mlp, DeepseekV2MoE):
+        if isinstance(self.mlp, DeepseekV2MoE) and \
+                hidden_states.dtype == torch.float16:
             hidden_states *= 1. / self.mlp.routed_scaling_factor
         hidden_states, residual = self.post_attention_layernorm(
             hidden_states, residual)
         hidden_states = self.mlp(hidden_states)
-        if isinstance(self.mlp, DeepseekV2MLP):
+        if isinstance(self.mlp, DeepseekV2MLP) and \
+                hidden_states.dtype == torch.float16:
             hidden_states *= 1. / self.routed_scaling_factor
             residual *= 1. / self.routed_scaling_factor
         return hidden_states, residual
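Dividing both hidden_states and residual by the same constant in the FP16 branch is safe for the next layer because RMSNorm is scale-invariant up to its epsilon: a uniformly down-scaled residual stream normalizes to the same activations while its raw magnitudes stay inside the fp16 range. A quick sanity check of that identity with a plain RMSNorm reimplementation (illustrative only, not vLLM's RMSNorm kernel):

```python
import torch

def rms_norm(x: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    # Plain RMSNorm without a learned weight, for illustration only.
    return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)

routed_scaling_factor = 16.0
x = torch.randn(4, 8) * 1000.0  # large activations, as in deep layers

# Scaling the whole stream by 1/routed_scaling_factor leaves the
# normalized output (the next layer's layernorm input) unchanged.
scaled = x * (1.0 / routed_scaling_factor)
assert torch.allclose(rms_norm(x), rms_norm(scaled))
```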