Commit 135bc45

fix: workaround duplicated AllGather for EP+FSDP2 (#173)
### What does this PR do?

Compute the shared expert first to work around the duplicated all-gather issue in EP+FSDP2, which appears to be a bug in PyTorch FSDP2's `fully_shard`.

Before: <img width="2230" height="302" alt="76591" src="https://github.com/user-attachments/assets/f9f4e553-5678-4fa8-9fcf-77750ad165bf" />

After: <img width="1756" height="188" alt="4480" src="https://github.com/user-attachments/assets/749b572c-214b-4121-bfad-5ec2cea7f191" />
1 parent e020de6 commit 135bc45

File tree

1 file changed: +14 additions, -6 deletions

veomni/models/transformers/deepseek_v3/modeling_deepseek.py

Lines changed: 14 additions & 6 deletions
```diff
@@ -370,6 +370,8 @@ def forward(self, hidden_states):
         denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
         topk_weight = topk_weight / denominator
         topk_weight = topk_weight * self.routed_scaling_factor  # must multiply the scaling factor
+        # Ensure routing weights keep the same dtype as hidden states so checkpoint recomputations stay consistent.
+        topk_weight = topk_weight.to(hidden_states.dtype)
 
         return topk_idx, topk_weight
 
@@ -469,13 +471,19 @@ def forward(self, hidden_states):
         topk_idx, topk_weight = self.gate(hidden_states)
         hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
 
-        final_hidden_states = torch.zeros(
-            (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
-        )
-        final_hidden_states = self.experts(hidden_states, routing_weights=topk_weight, selected_experts=topk_idx)
-
+        # we compute shared expert first to workaround the duplicated AllGather issue when using EP+FSDP2
+        # which seems to be a bug of fsdp2 fully_shard
         if self.config.n_shared_experts is not None:
-            final_hidden_states = final_hidden_states + self.shared_experts(identity)
+            shared_expert_outputs = self.shared_experts(identity)
+        else:
+            shared_expert_outputs = None
+
+        moe_outputs = self.experts(hidden_states, routing_weights=topk_weight, selected_experts=topk_idx)
+
+        if shared_expert_outputs is not None:
+            final_hidden_states = moe_outputs + shared_expert_outputs
+        else:
+            final_hidden_states = moe_outputs
 
         return final_hidden_states
```
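The second hunk is purely an ordering change: the shared expert runs before the routed experts so that FSDP2's `fully_shard` gathers each wrapped module's parameters only once per forward. A minimal, torch-free sketch of that control flow, using hypothetical stub callables (`shared_experts`, `routed_experts`, and `moe_forward` are illustrative names, not this repository's API):

```python
# Sketch of the reordered MoE forward from this commit, with plain-Python
# stubs standing in for the real PyTorch modules (hypothetical names).
call_order = []

def shared_experts(x):
    # Stand-in for the dense shared-expert MLP.
    call_order.append("shared")
    return [v * 0.5 for v in x]

def routed_experts(x):
    # Stand-in for the sparse routed-expert mixture.
    call_order.append("routed")
    return [v * 2.0 for v in x]

def moe_forward(hidden_states, n_shared_experts=2):
    # Shared expert first: mirrors the workaround in this commit.
    shared_out = shared_experts(hidden_states) if n_shared_experts is not None else None
    moe_out = routed_experts(hidden_states)
    if shared_out is None:
        return moe_out
    # Same combination as before the patch, just computed in the new order.
    return [m + s for m, s in zip(moe_out, shared_out)]

out = moe_forward([1.0, 2.0])  # shared expert runs first, then routed experts
```

In the real model the ordering matters because FSDP2 all-gathers a wrapped module's sharded parameters around its forward call; per the PR description, interleaving the shared expert after the routed experts triggered a second, redundant AllGather under EP+FSDP2.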
