@@ -417,6 +417,18 @@ def reducescatter_or_allreduce(
             sizes=None if use_dp_padding else all_rank_num_tokens)
         return outputs
 
+    def is_post_quant_all2all_supported(self):
+        if not self.use_postquant_alltoall:
+            return False
+        if self.alltoall_method_type == AlltoallMethodType.MNNVL:
+            return False
+        elif self.alltoall_method_type == AlltoallMethodType.DeepEP:
+            return self.has_nvfp4
+        elif self.alltoall_method_type == AlltoallMethodType.DeepEPLowLatency:
+            return self.has_fp8_qdq or self.has_nvfp4 or self.has_w4afp8
+        else:
+            return False
+
     def forward_chunk(
         self,
         x: Union[torch.Tensor, Fp4QuantizedTensor],
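For context, the new helper encodes a per-backend support matrix for sending already-quantized tensors through alltoall. Below is a minimal standalone sketch of that matrix; the enum values and flag names mirror the diff, while the free-function form, the defaults, and the checks at the bottom are illustrative assumptions, not code from this PR:

from enum import Enum, auto

class AlltoallMethodType(Enum):
    # Stub mirroring the enum referenced in the diff.
    MNNVL = auto()
    DeepEP = auto()
    DeepEPLowLatency = auto()

def is_post_quant_all2all_supported(method, use_postquant_alltoall,
                                    has_fp8_qdq=False, has_nvfp4=False,
                                    has_w4afp8=False):
    # Free-function restatement of the new method: the post-quant path is
    # taken only when the chosen backend can transfer the quantized format.
    if not use_postquant_alltoall:
        return False
    if method == AlltoallMethodType.MNNVL:
        return False  # MNNVL never takes the post-quant alltoall path
    elif method == AlltoallMethodType.DeepEP:
        return has_nvfp4  # DeepEP supports only NVFP4 post-quant
    elif method == AlltoallMethodType.DeepEPLowLatency:
        return has_fp8_qdq or has_nvfp4 or has_w4afp8
    else:
        return False

# DeepEP with only FP8-QDQ falls back to the pre-quant path:
assert not is_post_quant_all2all_supported(
    AlltoallMethodType.DeepEP, True, has_fp8_qdq=True)
# DeepEPLowLatency accepts FP8-QDQ, NVFP4, or W4A-FP8:
assert is_post_quant_all2all_supported(
    AlltoallMethodType.DeepEPLowLatency, True, has_fp8_qdq=True)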
@@ -493,7 +505,8 @@ def forward_chunk(
         use_allgather = not use_all_to_all
 
         # If alltoall is disabled, we also need to disable use_postquant_alltoall
-        use_postquant_alltoall = self.use_postquant_alltoall and use_all_to_all and self.has_any_quant
+        use_postquant_alltoall = use_all_to_all and self.is_post_quant_all2all_supported(
+        )
 
         # Prepare additional information for profiling in case padding is applied when using alltoall.
         # Only the non-alltoall case is considered for profiling in the warmup phase.
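The behavioral change in this hunk: the old gate enabled post-quant alltoall whenever any quantization was active, while the new gate additionally requires that the selected backend supports the active format. Continuing the stub sketch above (names hypothetical), DeepEP with only FP8-QDQ illustrates the difference:

# Old gate: any active quant format enabled post-quant alltoall.
use_all_to_all, use_postquant_flag, has_any_quant = True, True, True
old_gate = use_postquant_flag and use_all_to_all and has_any_quant

# New gate: DeepEP without NVFP4 is rejected, keeping pre-quant alltoall.
new_gate = use_all_to_all and is_post_quant_all2all_supported(
    AlltoallMethodType.DeepEP, use_postquant_flag, has_fp8_qdq=True)

assert old_gate and not new_gate  # the old gate over-enabled this combination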
@@ -613,6 +626,7 @@ def forward_chunk(
             if self.alltoall_method_type == AlltoallMethodType.MNNVL:
                 pass
             elif self.alltoall_method_type == AlltoallMethodType.DeepEP:
+                assert self.has_nvfp4, "DeepEP postquant alltoall should have nvfp4"
                 if x_sf is not None:
                     # Adapter between `x_sf` and DeepEP
                     # TODO: remove the adapter by adding dtype support to DeepEP
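The new assert records the invariant established by the gate: once use_postquant_alltoall comes from is_post_quant_all2all_supported, the DeepEP post-quant branch is reachable only when NVFP4 is enabled. A quick exhaustive check against the stub above (illustrative only, not part of the PR):

from itertools import product

# Whenever the gate passes for DeepEP, NVFP4 must have been enabled.
for flag, fp8, nvfp4, w4a in product([True, False], repeat=4):
    if is_post_quant_all2all_supported(AlltoallMethodType.DeepEP, flag,
                                       has_fp8_qdq=fp8, has_nvfp4=nvfp4,
                                       has_w4afp8=w4a):
        assert nvfp4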