|
| 1 | +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. |
| 2 | + |
| 3 | +""" |
| 4 | +Batch dimensions utilities. |
| 5 | +
|
| 6 | +This module contains utilities for managing batch dimensions, |
| 7 | +including the InferenceBatchDimensions dataclass and CUDAGraphBatchDimensionBuilder for generating |
| 8 | +and matching CUDA graph batch dimensions. |
| 9 | +""" |
| 10 | + |
| 11 | +import math |
| 12 | +from dataclasses import dataclass |
| 13 | +from typing import List, Optional, Tuple |
| 14 | + |
| 15 | + |
@dataclass(order=True, frozen=True)
class InferenceBatchDimensions:
    """Immutable description of a dynamic-inference batch.

    Attributes:
        token_count : number of total input tokens
        prefill_req_count : number of prefill requests
        decode_req_count : number of decode requests

    Because of ``order=True``, instances compare field-by-field: first by
    token_count, then prefill_req_count, then decode_req_count.
    """

    token_count: int = 0
    prefill_req_count: int = 0
    decode_req_count: int = 0

    def _as_tuple(self) -> Tuple[int, int, int]:
        """Return the field values in declaration order (for hashing/equality)."""
        return (self.token_count, self.prefill_req_count, self.decode_req_count)

    def __str__(self):
        """Return a compact human-readable summary of the batch dimensions."""
        return f"[{self.token_count}]: {self.prefill_req_count} P + {self.decode_req_count} D"

    def is_applicable_for_batch_dim(
        self, real_batch_dim: "InferenceBatchDimensions", strict: bool = False
    ) -> bool:
        """Check whether this (graph) batch dimension can serve ``real_batch_dim``.

        A graph batch dimension is applicable when its token and request
        budgets cover the real batch. When ``strict`` is False, unused
        prefill slots may additionally absorb decode requests; when True,
        prefill slots only count toward prefill requests.
        """
        enough_tokens = self.token_count >= real_batch_dim.token_count
        if real_batch_dim.prefill_req_count == 0:
            # A pure-decode batch must be served by a decode-only dimension.
            return (
                enough_tokens
                and self.prefill_req_count == 0
                and self.decode_req_count >= real_batch_dim.decode_req_count
            )
        enough_prefill = self.prefill_req_count >= real_batch_dim.prefill_req_count
        if strict:
            return (
                enough_tokens
                and enough_prefill
                and self.decode_req_count >= real_batch_dim.decode_req_count
            )
        # Non-strict: spare prefill slots may also hold decode requests,
        # so only the combined request budget has to fit.
        return enough_tokens and enough_prefill and self.req_count >= real_batch_dim.req_count

    def is_valid(self, max_requests: int, max_sequence_length: int) -> bool:
        """Check whether this batch dimension fits the resource constraints.

        Args:
            max_requests: Maximum number of requests allowed
            max_sequence_length: Maximum sequence length of a single request

        Returns:
            True if the dimensions are feasible, False otherwise
        """
        total_reqs = self.prefill_req_count + self.decode_req_count
        # Request counts must be non-negative and within the request budget.
        if self.prefill_req_count < 0 or self.decode_req_count < 0:
            return False
        if total_reqs > max_requests:
            return False
        # Every request contributes at least one token.
        if self.token_count < total_reqs:
            return False
        # Prefill requests contribute at most max_sequence_length tokens each;
        # decode requests contribute exactly one token each.
        if self.token_count > self.prefill_req_count * max_sequence_length + self.decode_req_count:
            return False
        return True

    def __hash__(self):
        """Hash on the field tuple so instances can key cuda-graph lookup dicts."""
        return hash(self._as_tuple())

    def __eq__(self, other: "InferenceBatchDimensions") -> bool:
        """Field-wise equality; ``None`` always compares unequal."""
        if other is None:
            return False
        return self._as_tuple() == (
            other.token_count,
            other.prefill_req_count,
            other.decode_req_count,
        )

    @property
    def req_count(self) -> int:
        """Total number of requests (prefill + decode)."""
        return self.prefill_req_count + self.decode_req_count
| 126 | + |
class CUDAGraphBatchDimensionBuilder:
    """Builder for creating and managing CUDA graph batch dimensions.

    This class provides static methods for generating lists of CUDA graph batch dimensions
    and matching the best batch dimension for a given real batch dimension.
    """

    # Constant for rounding token counts when generating CUDA graph batch dimensions
    CUDA_GRAPH_ROUNDER = 8

    @staticmethod
    def generate_cuda_graph_batch_dimensions_list(
        tp_size: int,
        num_cuda_graphs: Optional[int],
        cuda_graph_max_tokens: int,
        cuda_graph_mixed_prefill_count: Optional[int],
        max_requests: int,
        max_tokens: int,
        max_sequence_length: int,
        use_cuda_graphs_for_non_decode_steps: bool,
    ) -> Tuple[List[InferenceBatchDimensions], Optional[List[int]]]:
        """
        Generate CUDA graph batch dimensions.

        This function constructs CUDA graph batch dimensions for different token counts
        and request patterns, then filters them based on resource constraints.
        The construction process involves:

        Construction Rules:
        1. Token count generation: Creates token counts from step_size to max_tokens,
           rounded to multiples of 8
        2. Tensor parallelism alignment: Ensures step_size is divisible by tensor parallel size
        3. Batch dimension creation: For each token count, creates three types of batch dimensions:
           - Decode-only: (token_count, 0, token_count) - all tokens used for decode requests
           - Mixed prefill+decode: (token_count, prefill_req_count, token_count - prefill_req_count)
           - Prefill-only:
             (token_count, max(prefill_req_count, ceil(token_count/(max_seq_len-1))), 0)

        Filtering Rules:
        1. Request limit: prefill_req_count + decode_req_count <= max_requests
        2. Non-negative counts: Both prefill_req_count and decode_req_count must be >= 0
        3. Token sufficiency: token_count >= prefill_req_count + decode_req_count

        Sorting Rules for Attention Metadata Construction:
        1. Batch dimensions are sorted by prefill token count (token_count - decode_req_count)
           in descending order

        Args:
            tp_size: Tensor parallel size
            num_cuda_graphs: Number of CUDA graphs to generate (None disables generation)
            cuda_graph_max_tokens: Maximum tokens for CUDA graphs
            cuda_graph_mixed_prefill_count: Number of mixed prefill requests for CUDA graphs
            max_requests: Maximum number of requests
            max_tokens: Maximum total tokens
            max_sequence_length: Maximum sequence length
            use_cuda_graphs_for_non_decode_steps: Whether to use CUDA graphs for non-decode steps

        Returns:
            Tuple containing:
            - List of InferenceBatchDimensions objects,
              sorted by prefill token count in descending order
            - Optional list of CUDA graph token counts (None when num_cuda_graphs is None)
        """

        def add_if_valid(token_count: int, prefill_req_count: int, decode_req_count: int) -> None:
            """Helper to create and append batch dimension to list only if it's valid."""
            # Invalid combinations (negative counts, over-budget requests, token
            # shortfall/overflow) are silently dropped by is_valid().
            batch_dim = InferenceBatchDimensions(token_count, prefill_req_count, decode_req_count)
            if batch_dim.is_valid(max_requests, max_sequence_length):
                cuda_graph_batch_dimensions_list.append(batch_dim)

        # Cuda graph token-counts
        # (i.e., token counts used by cuda-graph steps, both decode and non-decode).
        cuda_graph_token_counts = None
        if num_cuda_graphs is not None:

            # Ensure valid num_cuda_graphs.
            # Fall back to max_tokens when cuda_graph_max_tokens is unset or out of range.
            if (
                cuda_graph_max_tokens is None
                or cuda_graph_max_tokens > max_tokens
                or cuda_graph_max_tokens <= 0
            ):
                cuda_graph_max_tokens = max_tokens
            # Clamp to [1, cuda_graph_max_tokens]: at least one graph, at most one per token.
            num_cuda_graphs = min(max(num_cuda_graphs, 1), cuda_graph_max_tokens)

            # Cuda graph step size.
            cuda_graph_step_size = cuda_graph_max_tokens / num_cuda_graphs
            # Round the (truncated) step size up to a multiple of CUDA_GRAPH_ROUNDER.
            cuda_graph_step_size = CUDAGraphBatchDimensionBuilder.CUDA_GRAPH_ROUNDER * int(
                math.ceil(
                    int(cuda_graph_step_size) / CUDAGraphBatchDimensionBuilder.CUDA_GRAPH_ROUNDER
                )
            )
            # Make sure divisible by TP size
            cuda_graph_step_size = math.ceil(cuda_graph_step_size / tp_size) * tp_size

            # Cuda graph token counts.
            if num_cuda_graphs == 1:
                cuda_graph_token_counts = [cuda_graph_max_tokens]
            else:
                cuda_graph_token_counts = list(
                    range(cuda_graph_step_size, cuda_graph_max_tokens, cuda_graph_step_size)
                )
                # Always include the max-token graph so the largest batches can be served.
                if (
                    len(cuda_graph_token_counts) == 0
                    or cuda_graph_token_counts[-1] != cuda_graph_max_tokens
                ):
                    cuda_graph_token_counts.append(cuda_graph_max_tokens)
            # Largest token count first.
            cuda_graph_token_counts.reverse()

        cuda_graph_batch_dimensions_list = []
        if num_cuda_graphs is None:
            # CUDA graphs disabled: return an empty dimension list.
            cuda_graph_batch_dimensions_list = []
        elif (
            not cuda_graph_mixed_prefill_count
            or cuda_graph_mixed_prefill_count <= 0
            or not use_cuda_graphs_for_non_decode_steps
        ):  # decode only
            for size in cuda_graph_token_counts:
                # Decode-only graph: one token per decode request, capped by request budget.
                add_if_valid(
                    token_count=min(size, max_requests),
                    prefill_req_count=0,
                    decode_req_count=min(size, max_requests),
                )
        else:
            for size in cuda_graph_token_counts:
                # Decode-only variant.
                add_if_valid(
                    token_count=min(size, max_requests),
                    prefill_req_count=0,
                    decode_req_count=min(size, max_requests),
                )
                # Mixed variant: reserve cuda_graph_mixed_prefill_count prefill slots,
                # remaining request budget goes to decode requests.
                add_if_valid(
                    token_count=size,
                    prefill_req_count=min(cuda_graph_mixed_prefill_count, max_requests),
                    decode_req_count=min(size, max_requests)
                    - min(cuda_graph_mixed_prefill_count, max_requests),
                )
                # We need to ensure the prefill requests are shorter than the max sequence length,
                # considering the one decode token is used for prefill request construction
                prefill_only_minimal_num = max(
                    cuda_graph_mixed_prefill_count,
                    math.ceil(size / max(1, max_sequence_length - 1)),
                )
                if prefill_only_minimal_num < max_requests:
                    # Prefill-only variant.
                    add_if_valid(
                        token_count=size,
                        prefill_req_count=max(prefill_only_minimal_num, min(max_requests, size)),
                        decode_req_count=0,
                    )

        # Remove duplicates and sort by prefill token count
        cuda_graph_batch_dimensions_list = list(set(cuda_graph_batch_dimensions_list))
        cuda_graph_batch_dimensions_list.sort(
            key=lambda x: ((x.token_count - x.decode_req_count), x.decode_req_count), reverse=True
        )

        return cuda_graph_batch_dimensions_list, cuda_graph_token_counts

    @staticmethod
    def match_graph_config(
        real_batch_dim: InferenceBatchDimensions,
        cuda_graph_batch_dimensions_list: List[InferenceBatchDimensions],
        strict: bool = False,
    ) -> Optional[InferenceBatchDimensions]:
        """
        Matches the best CUDA graph batch dimension for the given real batch dimension.

        Args:
            real_batch_dim: The real batch dimension to match
            cuda_graph_batch_dimensions_list: List of available CUDA graph batch dimensions
            strict: If False, prefill slots can be used for prefill or decode requests.
                If True, prefill slots can only be used for prefill requests.

        Returns:
            The best matching CUDA graph batch dimension, or None if no applicable match is found
        """
        # first filter out batch dimensions with smaller token count, prefill req count,
        # or decode req count, as they are not applicable
        graph_batch_dims_applicable = [
            graph_batch_dim
            for graph_batch_dim in cuda_graph_batch_dimensions_list
            if graph_batch_dim.is_applicable_for_batch_dim(real_batch_dim, strict=strict)
        ]
        if len(graph_batch_dims_applicable) == 0:
            return None
        # then find the best batch dimension
        # min() uses the dataclass field ordering (token_count first), so the
        # smallest applicable graph is picked to minimize padding waste.
        best_batch_dim = min(graph_batch_dims_applicable)
        return best_batch_dim
0 commit comments