
Commit 1ec1448

suyoggupta and lucaslie committed
Fix cudagraphs, add rms norm pattern matcher (#87)
* fix overlap scheduler in AD
* cleanups
* fix nest sequences
* nits
* avoid hardcoding max beam width
* avoid hardcoding max beam width
* cudagraph fixes + rms norm
* fix test
* revert ad_executor changes
* Review comments + make sure num_pages >= max batch size
* wrapping reviewer feedback and open items

---------

Signed-off-by: Suyog Gupta <[email protected]>
Signed-off-by: Lucas Liebenwein <[email protected]>
Co-authored-by: Lucas Liebenwein <[email protected]>
1 parent c2d2065 commit 1ec1448

12 files changed: 305 additions, 26 deletions


tensorrt_llm/_torch/auto_deploy/compile/backends/torch_cudagraph.py

Lines changed: 6 additions & 4 deletions
@@ -35,10 +35,11 @@ def __init__(
         self._out_buffer_flat: List[torch.Tensor] = None
         self._args_hash: Optional[Tuple[int, ...]] = None
         self.cuda_graph_batch_sizes = (
-            cuda_graph_batch_sizes
+            sorted(cuda_graph_batch_sizes, reverse=True)
             if cuda_graph_batch_sizes is not None
             else self._get_graph_batch_sizes(self.max_batch_size)
         )
+        self._cuda_graph_mem_pool = None

     def _get_hash(self, flat_args: List[Any]) -> Tuple[int, ...]:
         return tuple(hash(a) for a in flat_args)
@@ -64,7 +65,7 @@ def _capture_one_graph(self, *args, **kwargs) -> torch.cuda.CUDAGraph:
         # capture graph now
         torch.cuda.synchronize()
         graph = torch.cuda.CUDAGraph()
-        with torch.cuda.graph(graph):
+        with torch.cuda.graph(graph, pool=self._cuda_graph_mem_pool):
             # compute output
             out = self.model(*args, **kwargs)
             # write out into output buffer up to out batch size
@@ -73,7 +74,7 @@ def _capture_one_graph(self, *args, **kwargs) -> torch.cuda.CUDAGraph:
         for o_buffer, o in zip(self._out_buffer_flat, out_flat):
             o_buffer[: o.shape[0]] = o
         torch.cuda.synchronize()
-
+        self._cuda_graph_mem_pool = self._cuda_graph_mem_pool or graph.pool()
         return graph

     @staticmethod
@@ -88,7 +89,7 @@ def _get_graph_batch_sizes(
         batch_sizes.update(range(multiplier, max_bs + 1, multiplier))

         # return as sorted list
-        return sorted(batch_sizes)
+        return sorted(batch_sizes, reverse=True)

     def capture_graph(self, *args, **kwargs):
         """Capture and pre-fetch the graph for variable batch size."""
@@ -118,6 +119,7 @@ def capture_graph(self, *args, **kwargs):

         # capture output once with max batch size to capture output buffers
         with CudaGraphWarmUpPhase():
+            ad_logger.info(f"Warm up with {self.max_batch_size=} before graph capture")
             out = self.model(*args, **kwargs)
         self._out_buffer_flat, out_spec = tree_flatten(out)
         assert out_spec == self._out_spec, "Output spec mismatch."
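
The two cudagraph changes work together: batch sizes are now captured from largest to smallest, and every capture reuses the memory pool of the first graph instead of allocating a fresh one. A minimal standalone sketch of the same idea (toy model and made-up batch sizes, not AutoDeploy code):

```python
import torch

# Sketch of the pooled, descending-order capture strategy from this commit.
model = torch.nn.Linear(64, 64).cuda()
static_input = torch.randn(8, 64, device="cuda")  # persistent input buffer (max batch)
out_buffer = torch.empty(8, 64, device="cuda")     # persistent output buffer (max batch)

# warm up once outside capture (mirrors CudaGraphWarmUpPhase in the commit)
model(static_input)
torch.cuda.synchronize()

graphs, mem_pool = {}, None
for bs in sorted([1, 2, 4, 8], reverse=True):       # largest batch first
    torch.cuda.synchronize()
    g = torch.cuda.CUDAGraph()
    with torch.cuda.graph(g, pool=mem_pool):        # reuse the pool of the first capture
        out = model(static_input[:bs])
        out_buffer[:bs] = out                       # write into the persistent buffer
    mem_pool = mem_pool or g.pool()                 # remember the pool after the first graph
    graphs[bs] = g

# replay for batch size 4: fill the static input, launch the graph, read the buffer
static_input[:4].copy_(torch.randn(4, 64, device="cuda"))
graphs[4].replay()
result = out_buffer[:4]
```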

tensorrt_llm/_torch/auto_deploy/custom_ops/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -7,6 +7,7 @@
 from .linear import *
 from .mla import *
 from .quant import *
+from .rms_norm import *
 from .torch_attention import *
 from .torch_backend_attention import *
 from .torch_moe import *

tensorrt_llm/_torch/auto_deploy/custom_ops/attention_interface.py

Lines changed: 8 additions & 2 deletions
@@ -117,14 +117,20 @@ def __post_init__(self):
         # if the provided max_num_tokens is less than the max_batch_size * max_seq_len,
         # we use the provided max_num_tokens to calculate the number of pages
         total_tokens = min(self.max_num_tokens, self.max_batch_size * max_seq_len_adjusted)
-        self._num_pages = (total_tokens) // self.page_size + (total_tokens % self.page_size > 0)
+        # Num pages can not be less than max_batch_size.
+        self._num_pages = max(
+            self.max_batch_size,
+            (total_tokens) // self.page_size + (total_tokens % self.page_size > 0),
+        )
         self.input_ids = torch.ones(self.max_batch_size, 1, dtype=torch.int)
         self.position_ids = torch.zeros(self.max_batch_size, 1, dtype=torch.long)
         self.seq_len = torch.empty(self.max_batch_size, dtype=torch.int)
         self.input_pos = torch.empty_like(self.seq_len)
         self.cache_loc = torch.empty(self.num_pages, dtype=torch.int)
         self.pages_per_seq = torch.empty_like(self.seq_len)
-
+        assert self.num_pages >= self.max_batch_size, (
+            "num_pages must be greater than max_batch_size"
+        )
         # dynamic shape descriptors for tensor args
         self._dynamic_shapes: Optional[Tuple[Dict[str, Dim]]] = None
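
For illustration, plugging hypothetical numbers into the new page-count rule:

```python
# Hypothetical numbers, just to walk through the new num_pages rule.
max_batch_size, page_size = 64, 64
total_tokens = min(2048, max_batch_size * 128)   # -> 2048
pages_for_tokens = total_tokens // page_size + (total_tokens % page_size > 0)  # -> 32
num_pages = max(max_batch_size, pages_for_tokens)  # -> 64, so every sequence can hold a page
```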

tensorrt_llm/_torch/auto_deploy/custom_ops/rms_norm.py

Lines changed: 82 additions & 0 deletions
@@ -0,0 +1,82 @@
+"""Custom operator for FlashInfer and Triton RMSNorm implementation."""
+
+import flashinfer
+import torch
+
+from .triton_kernels.rms_norm import rms_norm
+
+
+@torch.library.custom_op("auto_deploy::flashinfer_rms_norm", mutates_args=())
+def flashinfer_rmsnorm(input: torch.Tensor, weight: torch.Tensor, eps: float) -> torch.Tensor:
+    """Custom operator for FlashInfer RMSNorm implementation.
+
+    Args:
+        input: Input tensor to normalize.
+        weight: Scaling weights for the normalized output.
+        eps: Small constant for numerical stability.
+
+    Returns:
+        Normalized and scaled tensor using FlashInfer implementation.
+    """
+    # Flashinfer rmsnorm expects a 2D input
+    input_flat = input.reshape(-1, input.shape[-1])
+    rmsnorm_flat = flashinfer.norm.rmsnorm(input_flat, weight, eps)
+    return rmsnorm_flat.reshape(input.shape)
+
+
+@flashinfer_rmsnorm.register_fake
+def _(input: torch.Tensor, weight: torch.Tensor, eps: float) -> torch.Tensor:
+    """Fake implementation for the custom operator during tracing.
+
+    Args:
+        input: Input tensor to normalize.
+        weight: Scaling weights for the normalized output.
+        eps: Small constant for numerical stability.
+
+    Returns:
+        Empty tensor with same shape as input.
+    """
+    return torch.empty_like(input)
+
+
+@torch.library.custom_op("auto_deploy::triton_rms_norm", mutates_args=())
+def triton_rmsnorm(input: torch.Tensor, weight: torch.Tensor, eps: float) -> torch.Tensor:
+    """Custom operator for Triton RMSNorm implementation.
+
+    Args:
+        input: Input tensor to normalize.
+        weight: Scaling weights for the normalized output.
+        eps: Small constant for numerical stability.
+
+    Returns:
+        Normalized and scaled tensor using Triton implementation.
+    """
+    return rms_norm(input, weight, eps)
+
+
+@triton_rmsnorm.register_fake
+def _(input: torch.Tensor, weight: torch.Tensor, eps: float) -> torch.Tensor:
+    """Fake implementation for the custom operator during tracing."""
+    return torch.empty_like(input)
+
+
+@torch.library.custom_op("auto_deploy::torch_rmsnorm", mutates_args=())
+def torch_rmsnorm(input: torch.Tensor, weight: torch.Tensor, eps: float) -> torch.Tensor:
+    """Custom operator for Torch RMSNorm implementation.
+
+    Args:
+        input: Input tensor to normalize.
+        weight: Scaling weights for the normalized output.
+        eps: Small constant for numerical stability.
+    """
+    input_dtype = input.dtype
+    input = input.to(torch.float32)
+    variance = input.pow(2).mean(-1, keepdim=True)
+    input = input * torch.rsqrt(variance + eps)
+    return weight * input.to(input_dtype)
+
+
+@torch_rmsnorm.register_fake
+def _(input: torch.Tensor, weight: torch.Tensor, eps: float) -> torch.Tensor:
+    """Fake implementation for the custom operator during tracing."""
+    return torch.empty_like(input)
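
Once this module is imported (the `custom_ops` package re-exports it), the three variants are reachable through `torch.ops.auto_deploy.*` with the same signature. A small usage sketch, assuming a CUDA device; the tolerances are picked loosely for illustration:

```python
import torch

# Importing the package registers the custom ops with the PyTorch dispatcher.
from tensorrt_llm._torch.auto_deploy import custom_ops  # noqa: F401

x = torch.randn(8, 512, device="cuda", dtype=torch.float16)
w = torch.ones(512, device="cuda", dtype=torch.float16)
eps = 1e-6

ref = torch.ops.auto_deploy.torch_rmsnorm(x, w, eps)      # pure-PyTorch reference
fast = torch.ops.auto_deploy.triton_rms_norm(x, w, eps)   # Triton kernel, same signature
# torch.ops.auto_deploy.flashinfer_rms_norm(x, w, eps) is the third drop-in variant
# (requires the flashinfer package to be installed).

torch.testing.assert_close(ref, fast, rtol=1e-2, atol=1e-2)
```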

tensorrt_llm/_torch/auto_deploy/export/interface.py

Lines changed: 1 addition & 1 deletion
@@ -242,7 +242,7 @@ def _apply_patches(remaining_patches):
             yield from _apply_patches(remaining_patches[1:])

     # log applied patches
-    ad_logger.info(
+    ad_logger.debug(
         f"applying export patches: {', '.join([patch.get_patch_key() for patch in patches])}"
     )

tensorrt_llm/_torch/auto_deploy/transformations/library/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -9,6 +9,7 @@
 from .kvcache import *
 from .quantization import *
 from .quantize_moe import *
+from .rms_norm import *
 from .rope import *
 from .sharding import *

tensorrt_llm/_torch/auto_deploy/transformations/library/kvcache.py

Lines changed: 14 additions & 7 deletions
@@ -143,8 +143,13 @@ def resize_kv_cache(

     free_mem_ratio specifies the fraction of available memory to occupy.
     """
-    free_mem, total_mem = torch.cuda.mem_get_info()
-    ad_logger.info(f"Free memory: {free_mem}, Total memory: {total_mem}")
+
+    def _get_mem_info_in_mb():
+        free_mem, total_mem = torch.cuda.mem_get_info()
+        return free_mem // 1024**2, total_mem // 1024**2
+
+    free_mem, total_mem = _get_mem_info_in_mb()
+    ad_logger.info(f"Free memory (MB): {free_mem}, Total memory (MB): {total_mem}")
     current_cache_size = cm.current_cache_size_bytes()
     current_num_pages = cm.info.num_pages
     ad_logger.info(
@@ -158,14 +163,16 @@ def resize_kv_cache(
     try:
         # Let's run a forward pass to get the memory usage
         cm.info._set_max_num_tokens_sample()
-        free_mem_pre, _ = torch.cuda.mem_get_info()
-        ad_logger.info(f"Free memory before forward pass: {free_mem_pre}")
+        free_mem_pre, _ = _get_mem_info_in_mb()
+        ad_logger.info(f"Free memory before forward pass (MB): {free_mem_pre}")
+
         egm(*cm.args)
-        free_mem_post, _ = torch.cuda.mem_get_info()
-        ad_logger.info(f"Free memory after forward pass: {free_mem_post}")
+
+        free_mem_post, _ = _get_mem_info_in_mb()
+        ad_logger.info(f"Free memory after forward pass (MB): {free_mem_post}")

         memory_for_forward_pass = free_mem_pre - free_mem_post
-        ad_logger.info(f"Memory for forward pass: {memory_for_forward_pass}")
+        ad_logger.info(f"Memory for forward pass (MB): {memory_for_forward_pass}")

         new_cache_size = free_mem_post * free_mem_ratio + current_cache_size
         new_num_pages = int(new_cache_size // (current_cache_size // current_num_pages))
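
For intuition, the resize formula with invented numbers (all values kept in one unit so the division is consistent):

```python
# Invented numbers, all in bytes, to walk through the resize arithmetic.
free_mem_post = 40 * 1024**3        # free memory left after the trial forward pass
free_mem_ratio = 0.8                # fraction of free memory the cache may claim
current_cache_size = 4 * 1024**3    # size of the initially allocated KV cache
current_num_pages = 2048

new_cache_size = free_mem_post * free_mem_ratio + current_cache_size   # 36 GiB
bytes_per_page = current_cache_size // current_num_pages               # 2 MiB per page
new_num_pages = int(new_cache_size // bytes_per_page)                  # -> 18432 pages
```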
tensorrt_llm/_torch/auto_deploy/transformations/library/rms_norm.py

Lines changed: 113 additions & 0 deletions
@@ -0,0 +1,113 @@
+"""Graph transform to optimize RMSNorm execution using FlashInfer."""
+
+from functools import partial
+
+import torch
+from torch.fx import GraphModule
+
+from ...utils.logger import ad_logger
+
+# It is important to import ADPatternMatcherPass from pattern_matcher.py, not from torch._inductor.pattern_matcher
+from ...utils.pattern_matcher import ADPatternMatcherPass, register_ad_pattern
+from .._graph import canonicalize_graph
+
+_BACKEND_OPS = {
+    "flashinfer": torch.ops.auto_deploy.flashinfer_rms_norm,
+    "triton": torch.ops.auto_deploy.triton_rms_norm,
+    "torch": torch.ops.auto_deploy.torch_rmsnorm,
+}
+
+
+def _rms_norm_pattern(data: torch.Tensor, weight: torch.Tensor, eps: float) -> torch.Tensor:
+    """Implements the RMSNorm pattern for pattern matching.
+
+    Args:
+        data: Input tensor to normalize.
+        weight: Scaling weights for the normalized output.
+        eps: Small constant for numerical stability.
+
+    Returns:
+        Normalized and scaled tensor.
+    """
+    input_dtype = data.dtype
+    data = data.to(torch.float32)
+    variance = data.pow(2).mean(-1, keepdim=True)
+    data = data * torch.rsqrt(variance + eps)
+    return weight * data.to(input_dtype)
+
+
+def _rms_norm_replacement(
+    data: torch.Tensor, weight: torch.Tensor, eps: float, backend: str
+) -> torch.Tensor:
+    """Backend-specific rms_norm implementation.
+
+    Args:
+        data: Input tensor to normalize.
+        weight: Scaling weights for the normalized output.
+        eps: Small constant for numerical stability.
+        backend: Backend to use for RMSNorm computation ("flashinfer" or "triton").
+
+    Returns:
+        Normalized and scaled tensor using the specified backend implementation.
+    """
+
+    assert backend.lower() in _BACKEND_OPS, (
+        f"Invalid {backend=}; must be one of {list(_BACKEND_OPS)}"
+    )
+    return _BACKEND_OPS[backend.lower()](data, weight, eps)
+
+
+def fuse_rmsnorm(gm: GraphModule, backend: str = "triton") -> None:
+    """Matches and replaces RMSNorm patterns in the graph with FlashInfer or Triton implementation.
+
+    This function sets up pattern matching to identify RMSNorm operations in the graph
+    and replaces them with optimized implementations. It uses dummy tensors to register
+    the pattern matching rules.
+
+    Args:
+        gm: Input graph module to transform.
+        backend: Backend to use for RMSNorm computation ("flashinfer" or "triton").
+
+    Returns:
+        Transformed graph module with optimized RMSNorm operations.
+    """
+    if backend.lower() not in _BACKEND_OPS:
+        raise ValueError(f"Invalid backend, must be one of {list(_BACKEND_OPS)}, got {backend}")
+    ad_logger.info(f"Starting RMSNorm pattern matching with backend: {backend}")
+
+    graph = gm.graph
+    patterns = ADPatternMatcherPass()
+
+    # Create dummy tensors for pattern matching
+    bs = 2
+    hidden_size = 512
+
+    def dummy_args(input_dtype: torch.dtype, weight_dtype: torch.dtype, eps: float = 1e-6):
+        return [
+            torch.randn(bs, hidden_size, device="cuda", dtype=input_dtype),
+            torch.randn(hidden_size, device="cuda", dtype=weight_dtype),
+            eps,
+        ]
+
+    # Define configurations for different data types
+    configs = [
+        (torch.bfloat16, torch.bfloat16),
+        (torch.float16, torch.float16),
+        (torch.float32, torch.float32),
+    ]
+
+    # Register patterns for each configuration
+    for input_dtype, weight_dtype in configs:
+        register_ad_pattern(
+            search_fn=_rms_norm_pattern,
+            replace_fn=partial(_rms_norm_replacement, backend=backend),
+            patterns=patterns,
+            dummy_args=dummy_args(input_dtype, weight_dtype),
+            op_ignore_types={},
+            scalar_workaround={"eps": 1e-6},
+        )
+
+    cnt = patterns.apply(graph)
+    ad_logger.info(f"RMSNorm pattern count: {cnt}")
+    canonicalize_graph(gm)
+    ad_logger.debug("RMSNorm pattern matching completed.")
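
AutoDeploy invokes this pass from its inference optimizer (see `transform.py` below), but it can also be driven by hand. A sketch under assumptions: a CUDA device is available, and `torch.export` is used here only as a stand-in for AutoDeploy's own graph export; whether the pattern actually matches depends on the traced graph lining up with the registered pattern:

```python
import torch
from torch import nn

from tensorrt_llm._torch.auto_deploy.transformations.library import fuse_rmsnorm


class ToyRMSNorm(nn.Module):
    """HF-style RMSNorm module, written to mirror the registered pattern."""

    def __init__(self, hidden_size: int, eps: float = 1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.eps = eps

    def forward(self, x):
        dtype = x.dtype
        x = x.to(torch.float32)
        variance = x.pow(2).mean(-1, keepdim=True)
        x = x * torch.rsqrt(variance + self.eps)
        return self.weight * x.to(dtype)


model = ToyRMSNorm(512).to("cuda", torch.float16)
example = torch.randn(2, 8, 512, device="cuda", dtype=torch.float16)
gm = torch.export.export(model, (example,)).module()  # aten-level fx.GraphModule

fuse_rmsnorm(gm, backend="triton")  # in place: matched subgraphs now call the Triton op
print(gm.graph)                     # inspect whether auto_deploy.triton_rms_norm appears
```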

tensorrt_llm/_torch/auto_deploy/transformations/transform.py

Lines changed: 5 additions & 0 deletions
@@ -22,6 +22,7 @@
     ep_shard,
     fuse_allreduce_residual_rmsnorm,
     fuse_collectives,
+    fuse_rmsnorm,
     insert_cached_attention,
     match_attention_layout,
     match_causal_attn_mask,
@@ -163,6 +164,10 @@ def __call__(self, cm: CachedSequenceInterface) -> nn.Module:
         # check if we can fuse collectives
         fuse_collectives(egm)

+        # TODO (lucaslie): add backend selection as part of configurable inference optimizers
+        # check if we can fuse rmsnorm
+        fuse_rmsnorm(egm, "flashinfer")
+
         # visualize the final graph
         if self.ad_config.visualize:
             try:

tensorrt_llm/bench/benchmark/throughput.py

Lines changed: 3 additions & 0 deletions
@@ -388,6 +388,9 @@ def throughput_command(
             logger.warning(
                 "Ignore extended_runtime_perf_knob_config for _autodeploy backend."
             )
+        kwargs["world_size"] = kwargs.pop("tensor_parallel_size", None)
+        kwargs.pop("pipeline_parallel_size", None)
+
         llm = AutoDeployLLM(**kwargs)
     else:
         llm = LLM(**kwargs)
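
The effect of the new kwargs remapping, with made-up benchmark arguments:

```python
# Made-up benchmark kwargs to show the remapping applied for the _autodeploy backend.
kwargs = {"model": "some/model", "tensor_parallel_size": 2, "pipeline_parallel_size": 1}

kwargs["world_size"] = kwargs.pop("tensor_parallel_size", None)  # AutoDeployLLM expects world_size
kwargs.pop("pipeline_parallel_size", None)                       # dropped before constructing AutoDeployLLM

assert kwargs == {"model": "some/model", "world_size": 2}
```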
