Skip to content

Commit 1fdb29f

Browse files
authored
Synchronize the request counts for EP inference with strict matching (#3033)
Signed-off-by: Keshav Santhanam <ksanthanam@nvidia.com>
1 parent bc2eb9a commit 1fdb29f

File tree

6 files changed

+52
-29
lines changed

6 files changed

+52
-29
lines changed

megatron/core/inference/batch_dimensions_utils.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -183,6 +183,8 @@ def adjust_batch_dims_for_expert_parallelism(
183183
local_batch_dims.token_count,
184184
int(is_non_decode),
185185
int(has_explicit_chunked_prefill_req),
186+
local_batch_dims.prefill_req_count,
187+
local_batch_dims.decode_req_count,
186188
],
187189
dtype=torch.int32,
188190
device=torch.cuda.current_device(),
@@ -208,10 +210,21 @@ def adjust_batch_dims_for_expert_parallelism(
208210
return None # indicate no match, run in eager mode
209211

210212
assert not has_explicit_chunked_prefill_req
213+
214+
# If strict matching is enabled, we sync the request counts across EP ranks
215+
# to ensure the graph captures the maximum needed capacity.
216+
# TODO(ksanthanam): Add functional test for this scenario
217+
adjusted_prefill_req_count = (
218+
int(sync_tensor[3].item()) if strict else local_batch_dims.prefill_req_count
219+
)
220+
adjusted_decode_req_count = (
221+
int(sync_tensor[4].item()) if strict else local_batch_dims.decode_req_count
222+
)
223+
211224
adjusted_batch_dim = InferenceBatchDimensions(
212225
token_count=int(sync_tensor[0].item()),
213-
prefill_req_count=local_batch_dims.prefill_req_count,
214-
decode_req_count=local_batch_dims.decode_req_count,
226+
prefill_req_count=adjusted_prefill_req_count,
227+
decode_req_count=adjusted_decode_req_count,
215228
has_explicit_chunked_prefill_req=False,
216229
)
217230
return adjusted_batch_dim

megatron/core/inference/engines/dynamic_engine.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1706,7 +1706,11 @@ async def run_engine_with_coordinator(
17061706
if ep_group_has_work and local_pending_requests == 0:
17071707
# run dummy forward pass if EP group as a whole has work,
17081708
# but this rank does not have any work.
1709+
self.step_start_event.record()
17091710
self.controller.dummy_forward()
1711+
self.step_end_event.record()
1712+
self.step_end_event.synchronize()
1713+
self.step_count += 1
17101714
continue
17111715

17121716
# 3. No work in EP group

megatron/core/ssm/mamba_block.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,37 @@ def mamba_state_shapes_per_request(self) -> Optional[Tuple[Tuple[int], Tuple[int
202202
return layer.mamba_state_shapes_per_request()
203203
return None
204204

205+
def _should_call_local_cudagraph(self, *args, **kwargs):
206+
"""
207+
Check if we should call the local cudagraph path.
208+
"""
209+
if not self.training and (
210+
hasattr(self, 'cudagraph_manager')
211+
and kwargs['attention_mask'] is None
212+
and (
213+
kwargs.get('inference_context') is not None
214+
or kwargs.get('inference_params') is not None
215+
)
216+
and CudaGraphScope.full_iteration in self.config.cuda_graph_scope
217+
):
218+
if kwargs['inference_context'].is_static_batching():
219+
using_cuda_graph = kwargs['inference_context'].is_decode_only()
220+
else:
221+
using_cuda_graph = kwargs['inference_context'].using_cuda_graph_this_step()
222+
223+
if using_cuda_graph:
224+
return True
225+
return False
226+
227+
def __call__(self, *args, **kwargs):
228+
if self._should_call_local_cudagraph(*args, **kwargs):
229+
kwargs['hidden_states'] = (
230+
kwargs['hidden_states'].unwrap()
231+
if isinstance(kwargs['hidden_states'], WrappedTensor)
232+
else kwargs['hidden_states']
233+
)
234+
return super().__call__(*args, **kwargs)
235+
205236
def forward(
206237
self,
207238
hidden_states: Union[Tensor, WrappedTensor],

megatron/core/ssm/mamba_layer.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,7 @@ def _should_call_local_cudagraph(self, *args, **kwargs):
192192
hasattr(self, 'cudagraph_manager')
193193
and kwargs.get('attention_mask') is None
194194
and kwargs.get('inference_context') is not None
195+
and CudaGraphScope.full_iteration not in self.config.cuda_graph_scope
195196
):
196197
using_cuda_graph = kwargs['inference_context'].using_cuda_graph_this_step()
197198
return using_cuda_graph

megatron/core/transformer/transformer_block.py

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -606,16 +606,7 @@ def __call__(self, *args, **kwargs):
606606
if isinstance(kwargs['hidden_states'], WrappedTensor)
607607
else kwargs['hidden_states']
608608
)
609-
# dynamic_inference_decode_only is not a real argument to forward, it is only used
610-
# to differentiate the cuda graph used for decode from the one used for non-decode
611-
# inference.
612-
dynamic_inference_decode_only = kwargs['inference_context'].is_decode_only()
613-
# cudagraphmanager returns a singleton tuple, whereas the
614-
# normal forward returns a tensor, therefore we need
615-
# to extract the tensor from the tuple
616-
return super().__call__(
617-
*args, dynamic_inference_decode_only=dynamic_inference_decode_only, **kwargs
618-
)[0]
609+
return super().__call__(*args, **kwargs)[0]
619610
return super().__call__(*args, **kwargs)
620611

621612
def forward(

megatron/core/transformer/transformer_layer.py

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -506,10 +506,6 @@ def forward(self, *args, **kwargs):
506506
This method calls the core computation of a transformer layer, including
507507
self-attention, cross-attention (if applicable), and feed-forward operations.
508508
"""
509-
# Remove 'dynamic_inference_decode_only' from kwargs if present
510-
# this is only used to uniquely identify decode and non-decode cuda graph
511-
# runners in the cuda graph manager
512-
kwargs.pop("dynamic_inference_decode_only", None)
513509
hidden_states, context = self._forward_attention(*args, **kwargs)
514510
output = self._forward_mlp(
515511
hidden_states,
@@ -1203,19 +1199,6 @@ def _should_call_local_cudagraph(self, *args, **kwargs):
12031199
return True
12041200
return False
12051201

1206-
def __call__(self, *args, **kwargs):
1207-
if self._should_call_local_cudagraph(*args, **kwargs):
1208-
# Inference mode.
1209-
if kwargs.get('inference_context') is not None:
1210-
# dynamic_inference_decode_only is not a real argument to forward, it is only used
1211-
# to differentiate the cuda graph used for decode from the one used for non-decode
1212-
# inference.
1213-
kwargs["dynamic_inference_decode_only"] = kwargs[
1214-
'inference_context'
1215-
].is_decode_only()
1216-
1217-
return super().__call__(*args, **kwargs)
1218-
12191202
def get_layer_norm_weights(self):
12201203
"""
12211204
Get the weights of all layernorms (attention and MLP) in the transformer layer.

0 commit comments

Comments (0)