Skip to content

Commit 089fd55

Browse files
authored
Add dummy all_reduce for kernel breakdown (NVIDIA#5745)
Signed-off-by: Xianjie <[email protected]>
1 parent 1b588f8 commit 089fd55

File tree

1 file changed

+22
-0
lines changed

1 file changed

+22
-0
lines changed

tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,10 @@ def __init__(
211211
if not model_config.skip_create_weights_in_init:
212212
self.create_weights()
213213

214+
# Debug function for eliminating imbalance during performance analysis.
215+
self.enable_dummy_allreduce = os.environ.get(
216+
"TRTLLM_ENABLE_DUMMY_ALLREDUCE", "0") == "1"
217+
214218
def _check_configs(self):
215219
assert self._weights_created
216220

@@ -302,6 +306,16 @@ def create_weights(self):
302306
self._weights_created = True
303307
self._check_configs()
304308

309+
def dummy_allreduce(self):
    """Run a throwaway allreduce over a tiny tensor to synchronize ranks.

    Debug helper for eliminating imbalance during performance analysis:
    all participating processes reduce a 4-element zero tensor, which acts
    as a barrier-like sync point so profiling measurements are not skewed
    by per-rank timing imbalance.

    Returns:
        The tensor produced by ``self.all_reduce`` on the dummy input.
    """
    # Tiny fixed-size CUDA tensor: the payload is irrelevant, only the
    # collective's synchronization effect matters.
    sync_probe = torch.zeros(4, dtype=torch.float32, device='cuda')
    return self.all_reduce(sync_probe)
318+
305319
def reducescatter_or_allreduce(
306320
self,
307321
inputs,
@@ -311,6 +325,8 @@ def reducescatter_or_allreduce(
311325
outputs = inputs
312326
if self.parallel_size > 1 and not self.enable_alltoall:
313327
if self.use_dp:
328+
if self.enable_dummy_allreduce:
329+
self.dummy_allreduce()
314330
outputs = reducescatter(
315331
inputs,
316332
self.mapping,
@@ -398,6 +414,8 @@ def forward_chunk(
398414

399415
if self.enable_alltoall:
400416
if self.alltoall_method_type == AlltoallMethodType.MNNVL:
417+
if self.enable_dummy_allreduce:
418+
self.dummy_allreduce()
401419
token_count = x.shape[0]
402420
alltoall_info = None
403421
x, token_selected_slots, token_final_scales, gathered_loadbalancer_local_statistic_info, alltoall_info = \
@@ -482,6 +500,8 @@ def forward_chunk(
482500

483501
if self.use_dp and self.parallel_size > 1 and not disable_fp4_allgather(
484502
) and not self.enable_alltoall:
503+
if self.enable_dummy_allreduce:
504+
self.dummy_allreduce()
485505
x, x_sf, token_selected_slots, token_final_scales, gathered_token_selected_experts_for_statistic = allgather(
486506
[
487507
x,
@@ -630,6 +650,8 @@ def forward_chunk(
630650

631651
if self.enable_alltoall:
632652
if self.alltoall_method_type == AlltoallMethodType.MNNVL:
653+
if self.enable_dummy_allreduce:
654+
self.dummy_allreduce()
633655
final_hidden_states = self.alltoall_combine(
634656
final_hidden_states, alltoall_info, token_count)
635657
elif self.alltoall_method_type == AlltoallMethodType.DeepEP:

0 commit comments

Comments
 (0)