Skip to content

Commit 33ac890

Browse files
committed
[TRTLLM-11289][perf] Eliminate contiguous copies in CuTe DSL BF16 BMM path

Add wrapper_strided to PersistentDenseGemmKernel that accepts explicit A tensor strides, enabling non-contiguous views (e.g. from .transpose()) to be passed directly to TMA without .contiguous() copies. Update the BMM runner to compute and pass A strides instead of forcing contiguous tensors, removing the direct_copy_kernel_cuda overhead between attention and BMM.

Signed-off-by: peaceh <103117813+peaceh-nv@users.noreply.github.com>
1 parent 22261e9 commit 33ac890

File tree

2 files changed

+81
-24
lines changed

2 files changed

+81
-24
lines changed

tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py

Lines changed: 22 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -3847,31 +3847,27 @@ def forward(
38473847

38483848
a_tensor, b_tensor, c_tensor = inputs
38493849

3850-
# Ensure A and B are contiguous — the kernel constructs CuTe
3851-
# layouts via make_ordered_layout assuming contiguous [B, M, K]
3852-
# and [B, N, K]. Transpose views (e.g. from .transpose(0,1))
3853-
# have swapped batch/seq strides which would cause the kernel
3854-
# to read from wrong memory locations.
3855-
a_tensor = a_tensor.contiguous()
3856-
b_tensor = b_tensor.contiguous()
3857-
3858-
# For the output, use a contiguous buffer so TMA store sees a
3859-
# standard layout; copy back afterwards if the original was
3860-
# non-contiguous.
3861-
c_needs_copy = not c_tensor.is_contiguous()
3862-
if c_needs_copy:
3863-
c_buf = torch.empty_like(c_tensor)
3864-
else:
3865-
c_buf = c_tensor
3866-
3867-
# c_buf is [B, M, N], permute to [M, N, B] for cute layout
3868-
c_tmp = c_buf.permute(1, 2, 0)
3850+
# Permute C from [B, M, N] to [M, N, B] for CuTe layout.
3851+
# from_dlpack captures the actual strides, so non-contiguous
3852+
# views (e.g. from .transpose(0,1)) are handled natively by
3853+
# TMA without an extra copy.
3854+
c_tmp = c_tensor.permute(1, 2, 0)
38693855

38703856
batch_size = a_tensor.shape[0]
38713857
m = a_tensor.shape[1]
38723858
k = a_tensor.shape[2]
38733859
n = b_tensor.shape[1]
38743860

3861+
# Compute A strides so the kernel can handle non-contiguous
3862+
# views (e.g. [M,B,K].transpose(0,1) → [B,M,K] with
3863+
# non-standard strides) without a .contiguous() copy.
3864+
# CuTe tensor is (M, K, B) so strides map as:
3865+
# M stride = a_tensor.stride(1)
3866+
# K stride = 1 (always innermost)
3867+
# B stride = a_tensor.stride(0)
3868+
a_stride_m = a_tensor.stride(1)
3869+
a_stride_batch = a_tensor.stride(0)
3870+
38753871
if not self.use_tvm_ffi:
38763872
a_ptr = make_ptr(
38773873
cutlass.BFloat16,
@@ -3926,14 +3922,16 @@ def forward(
39263922
cluster_shape_mn[0] * cluster_shape_mn[1])
39273923

39283924
compiled_gemm = cute.compile(
3929-
gemm.wrapper,
3925+
gemm.wrapper_strided,
39303926
m,
39313927
n,
39323928
k,
39333929
batch_size,
39343930
a_ptr,
39353931
b_ptr,
39363932
c_cute_tensor,
3933+
a_stride_m,
3934+
a_stride_batch,
39373935
max_active_clusters=max_active_clusters,
39383936
stream=stream,
39393937
options=f"--opt-level 2 --enable-tvm-ffi"
@@ -3953,6 +3951,8 @@ def forward(
39533951
a_tensor.data_ptr(),
39543952
b_tensor.data_ptr(),
39553953
c_tmp,
3954+
a_stride_m,
3955+
a_stride_batch,
39563956
)
39573957
else:
39583958
compiled_gemm(
@@ -3963,13 +3963,11 @@ def forward(
39633963
a_ptr,
39643964
b_ptr,
39653965
c_cute_tensor,
3966+
a_stride_m,
3967+
a_stride_batch,
39663968
stream=stream,
39673969
)
39683970

3969-
# Copy result back if original output was non-contiguous
3970-
if c_needs_copy:
3971-
c_tensor.copy_(c_buf)
3972-
39733971
# a/b: bf16, output: bf16
39743972
@torch.library.custom_op("trtllm::cute_dsl_bf16_bmm_blackwell",
39753973
mutates_args=("output", ),

tensorrt_llm/_torch/cute_dsl_kernels/blackwell/dense_gemm_persistent.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1010,3 +1010,62 @@ def wrapper(
10101010
max_active_clusters,
10111011
stream,
10121012
)
1013+
1014+
@cute.jit
1015+
def wrapper_strided(
1016+
self,
1017+
m: cutlass.Int32,
1018+
n: cutlass.Int32,
1019+
k: cutlass.Int32,
1020+
batch_size: cutlass.Int32,
1021+
a_ptr: cute.Pointer,
1022+
b_ptr: cute.Pointer,
1023+
c_tensor: cute.Tensor,
1024+
a_stride_m: cutlass.Int32,
1025+
a_stride_batch: cutlass.Int32,
1026+
max_active_clusters: cutlass.Constexpr,
1027+
stream: cuda.CUstream,
1028+
):
1029+
"""Executes the GEMM kernel with explicit A tensor strides.
1030+
1031+
Like ``wrapper`` but allows non-contiguous A tensors by accepting
1032+
the M and batch strides directly. The K stride is assumed to be 1
1033+
(row-major in K). B is always contiguous.
1034+
1035+
Args:
1036+
m: The M dimension of the GEMM problem.
1037+
n: The N dimension of the GEMM problem.
1038+
k: The K dimension of the GEMM problem.
1039+
batch_size: The batch dimension.
1040+
a_ptr: Pointer to the A tensor data.
1041+
b_ptr: Pointer to the B tensor data.
1042+
c_tensor: Output tensor as cute.Tensor.
1043+
a_stride_m: Stride of A along the M dimension (in elements).
1044+
a_stride_batch: Stride of A along the batch dimension (in elements).
1045+
max_active_clusters: Maximum number of active clusters.
1046+
stream: CUDA stream for the operation.
1047+
"""
1048+
# A with explicit strides: (M, K, batch_size), K stride = 1
1049+
a_tensor = cute.make_tensor(
1050+
a_ptr,
1051+
layout=cute.make_layout(
1052+
(m, k, batch_size),
1053+
stride=(a_stride_m, 1, a_stride_batch),
1054+
),
1055+
)
1056+
# B is always contiguous: (N, K, batch_size) with K innermost
1057+
b_tensor = cute.make_tensor(
1058+
b_ptr,
1059+
layout=cute.make_ordered_layout(
1060+
(n, k, batch_size),
1061+
order=(1, 0, 2),
1062+
),
1063+
)
1064+
1065+
self(
1066+
a_tensor,
1067+
b_tensor,
1068+
c_tensor,
1069+
max_active_clusters,
1070+
stream,
1071+
)

0 commit comments

Comments (0)