Skip to content

Commit e1372fb

Browse files
committed
[NVIDIA#9150][feat] AutoDeploy: reviewer comments for NVIDIA#9150 (NVIDIA#9527)
Signed-off-by: Lucas Liebenwein <11156568+lucaslie@users.noreply.github.com>
1 parent 0e41ad1 commit e1372fb

File tree

4 files changed

+16
-8
lines changed

4 files changed

+16
-8
lines changed

tensorrt_llm/_torch/auto_deploy/custom_ops/fla/fla_backend_delta.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
1-
"""Cached attention op for chunked delta rule using the fla kernel library."""
1+
"""Cached attention op for delta rule using the fla kernel library.
2+
3+
Delta Rule is based on this paper: https://arxiv.org/abs/2406.06484
4+
5+
Kernels are based on this repo: https://github.com/fla-org/flash-linear-attention
6+
"""
27

38
from typing import List, Tuple
49

tensorrt_llm/_torch/auto_deploy/custom_ops/fla/fla_delta.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
1-
"""Custom ops corresponding to fla's chunked delta rule."""
1+
"""Custom ops corresponding to fla's chunked delta rule.
2+
3+
Delta Rule is based on this paper: https://arxiv.org/abs/2406.06484
4+
5+
Kernels are based on this repo: https://github.com/fla-org/flash-linear-attention
6+
"""
27

38
from typing import Optional
49

tensorrt_llm/_torch/auto_deploy/custom_ops/l2norm.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,6 @@
44

55
from tensorrt_llm._torch.modules.fla.l2norm import l2norm_fwd
66

7-
# TODO: add a pattern matcher for this such that
8-
# 1. pattern match to torch_l2norm
9-
# 2. fuse transform to map to desired backend like fla
10-
117

128
@torch.library.custom_op("auto_deploy::torch_l2norm", mutates_args=())
139
def _torch_l2norm(x: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:

tensorrt_llm/_torch/auto_deploy/custom_ops/rms_norm.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,11 +70,13 @@ def torch_rmsnorm(input: torch.Tensor, weight: torch.Tensor, eps: float) -> torc
7070
weight: Scaling weights for the normalized output.
7171
eps: Small constant for numerical stability.
7272
"""
73-
input_dtype = input.dtype
73+
# pre-allocate output to ensure same dtype+stride as input
74+
out = torch.empty_like(input)
7475
input = input.to(torch.float32)
7576
variance = input.pow(2).mean(-1, keepdim=True)
7677
input = input * torch.rsqrt(variance + eps)
77-
return (weight * input.to(input_dtype)).contiguous()
78+
out.copy_((weight * input.to(out.dtype)))
79+
return out
7880

7981

8082
@torch_rmsnorm.register_fake

0 commit comments

Comments (0)