@@ -4011,3 +4011,284 @@ def _(
40114011 assert output .dtype == torch .bfloat16 , "CuTe DSL bf16 bmm output dtype must be bf16"
40124012 assert output .shape == (batch_size , m ,
40134013 n ), "CuTe DSL bf16 bmm output shape is incorrect"
4014+
4015+ # ======================================================================
4016+ # BF16 Dense Persistent GEMM (CuTe DSL) for Blackwell - Linear layers
4017+ # ======================================================================
4018+
class CuteDSLBf16BlackwellGemmRunner(TunableRunner):
    """
    CuTe DSL BF16 GEMM runner for Linear layers.

    Unlike BMM which operates on [B, M, K] @ [B, N, K] -> [B, M, N],
    GEMM operates on [M, K] @ [N, K]^T -> [M, N] (standard Linear).

    We reuse PersistentDenseGemmKernel with batch_size=1.
    """
    kernel_class = PersistentDenseGemmKernel
    # Compiled-kernel cache shared by all runner instances, keyed by
    # (use_2cta_instrs, mma_tiler_mn, cluster_shape_mn, use_tvm_ffi).
    # Shapes are NOT part of the key: the compiled wrapper receives
    # m/n/k/batch_size as runtime arguments at launch.
    kernel_cache = dict()

    # Tune over bucketed token counts on dim 0 of input 0 (the M dim).
    tuning_config = TuningConfig(
        dynamic_tensor_specs=(DynamicTensorSpec(
            0, 0, get_last_power_of_2_num_tokens_buckets,
            last_positive_power_of_2), ),
    )

    def __init__(self, use_tvm_ffi: bool = True):
        """
        Args:
            use_tvm_ffi: When True, compile with TVM-FFI enabled and launch
                by passing raw data pointers; otherwise build CuTe pointers
                and pass the current CUDA stream explicitly.
        """
        super().__init__()
        self.use_tvm_ffi = use_tvm_ffi

    def get_valid_tactics(
        self,
        inputs: List[torch.Tensor],
        profile: OptimizationProfile,
        **kwargs,
    ) -> List[tuple]:
        """
        Enumerate (use_2cta_instrs, mma_tiler_mn, cluster_shape_mn)
        combinations the kernel reports as implementable for this problem.

        Returns an empty list on non-SM100 hardware so the tuner skips
        this runner entirely.
        """
        if not is_sm_100f():
            logger.debug(
                f"CuteDSL: SM version {get_sm_version()} is not supported. "
                f"CuteDSL BF16 GEMM only supports SM 100 family. Skipping all tactics."
            )
            return []

        # input: [M, K], weight: [N, K]
        m, k = inputs[0].shape[0], inputs[0].shape[1]
        n = inputs[1].shape[0]
        batch_size = 1

        # Layouts: A is [M, K] K-major, B is [N, K] K-major
        a_major = "k"
        b_major = "k"
        c_major = "n"

        use_2cta_instrs_candi = [False, True]
        mma_tiler_mn_candi = [(64, 128), (128, 128), (256, 128)]
        cluster_shape_mn_candi = [
            (1, 1),
            (1, 2),
            (1, 4),
            (2, 1),
            (2, 2),
            (2, 4),
            (4, 1),
            (4, 2),
            (4, 4),
        ]
        # Filter the Cartesian product through the kernel's own
        # feasibility check; invalid tile/cluster combos are dropped here
        # rather than failing at compile time.
        return [
            (use_2cta_instrs, mma_tiler_mn, cluster_shape_mn)
            for use_2cta_instrs in use_2cta_instrs_candi
            for mma_tiler_mn in mma_tiler_mn_candi
            for cluster_shape_mn in cluster_shape_mn_candi
            if self.__class__.kernel_class.can_implement(
                cutlass.BFloat16,  # ab_dtype
                cutlass.Float32,  # acc_dtype
                cutlass.BFloat16,  # c_dtype
                use_2cta_instrs,
                mma_tiler_mn,
                cluster_shape_mn,
                m,
                n,
                k,
                batch_size,
                a_major,
                b_major,
                c_major,
            )
        ]

    def forward(
        self,
        inputs: List[torch.Tensor],
        tactic,
    ) -> None:
        """
        Performs bf16 dense persistent GEMM using CuTe DSL.

        Args:
            inputs (List[torch.Tensor]):
                inputs[0]: Input tensor of shape (m, k), dtype: bf16.
                inputs[1]: Weight tensor of shape (n, k), dtype: bf16.
                inputs[2]: Output tensor of shape (m, n), dtype: bf16.
                    Written in place.
            tactic: Tiling and cluster strategy, typically a tuple
                (use_2cta_instrs, mma_tiler_mn, cluster_shape_mn).
        """
        if isinstance(tactic, tuple):
            use_2cta_instrs, mma_tiler_mn, cluster_shape_mn = tactic
        else:
            # Fallback default when the tuner hands us a non-tuple
            # sentinel tactic: single-CTA 128x128 tile, 1x1 cluster.
            use_2cta_instrs, mma_tiler_mn, cluster_shape_mn = [
                False,
                (128, 128),
                (1, 1),
            ]

        a_tensor, b_tensor, c_tensor = inputs

        # Input: [M, K], Weight: [N, K], Output: [M, N]
        m, k = a_tensor.shape[0], a_tensor.shape[1]
        n = b_tensor.shape[0]
        batch_size = 1

        # Ensure inputs are contiguous
        a_tensor = a_tensor.contiguous()
        b_tensor = b_tensor.contiguous()

        # For output, use a contiguous scratch buffer if needed; results
        # are copied back into c_tensor at the end in that case.
        c_needs_copy = not c_tensor.is_contiguous()
        if c_needs_copy:
            c_buf = torch.empty_like(c_tensor)
        else:
            c_buf = c_tensor

        # Reshape to [1, M, K], [1, N, K] for the batched kernel.
        a_batched = a_tensor.unsqueeze(0)  # [1, M, K]
        b_batched = b_tensor.unsqueeze(0)  # [1, N, K]
        # c_buf is [M, N]; append a trailing batch dim -> [M, N, 1].
        # (The batch dim goes last here, matching the cute layout the
        # wrapper expects — note this differs from A/B, which carry the
        # batch dim first.)
        c_tmp = c_buf.unsqueeze(-1)  # [M, N, 1]

        if not self.use_tvm_ffi:
            # Non-FFI path: build device pointers and capture the current
            # stream up front — needed both for compilation (cache miss)
            # and for every launch.
            a_ptr = make_ptr(
                cutlass.BFloat16,
                a_batched.data_ptr(),
                cute.AddressSpace.gmem,
                assumed_align=16,
            )
            b_ptr = make_ptr(
                cutlass.BFloat16,
                b_batched.data_ptr(),
                cute.AddressSpace.gmem,
                assumed_align=16,
            )
            c_cute_tensor = cute.runtime.from_dlpack(
                c_tmp).mark_layout_dynamic(leading_dim=1)

            stream = cuda.CUstream(torch.cuda.current_stream().cuda_stream)

        cache_key = (
            use_2cta_instrs,
            mma_tiler_mn,
            cluster_shape_mn,
            self.use_tvm_ffi,
        )
        if cache_key not in self.__class__.kernel_cache:
            if self.use_tvm_ffi:
                # FFI path only needs pointers/tensor/stream for the
                # one-time compilation; launches pass raw data_ptr()s.
                a_ptr = make_ptr(
                    cutlass.BFloat16,
                    a_batched.data_ptr(),
                    cute.AddressSpace.gmem,
                    assumed_align=16,
                )
                b_ptr = make_ptr(
                    cutlass.BFloat16,
                    b_batched.data_ptr(),
                    cute.AddressSpace.gmem,
                    assumed_align=16,
                )
                c_cute_tensor = cute.runtime.from_dlpack(
                    c_tmp).mark_layout_dynamic(leading_dim=1)
                stream = cute.runtime.make_fake_stream(
                    use_tvm_ffi_env_stream=True)

            gemm = self.__class__.kernel_class(
                cutlass.Float32,  # acc_dtype
                use_2cta_instrs=use_2cta_instrs,
                mma_tiler_mn=mma_tiler_mn,
                cluster_shape_mn=cluster_shape_mn,
            )
            hardware_info = cutlass.utils.HardwareInfo()
            max_active_clusters = hardware_info.get_max_active_clusters(
                cluster_shape_mn[0] * cluster_shape_mn[1])

            compiled_gemm = cute.compile(
                gemm.wrapper,
                m,
                n,
                k,
                batch_size,
                a_ptr,
                b_ptr,
                c_cute_tensor,
                max_active_clusters=max_active_clusters,
                stream=stream,
                options="--opt-level 2 --enable-tvm-ffi"
                if self.use_tvm_ffi else "--opt-level 2",
            )
            self.__class__.kernel_cache[cache_key] = compiled_gemm
        else:
            compiled_gemm = self.__class__.kernel_cache[cache_key]

        # launch gemm kernel
        if self.use_tvm_ffi:
            compiled_gemm(
                m,
                n,
                k,
                batch_size,
                a_batched.data_ptr(),
                b_batched.data_ptr(),
                c_tmp,
            )
        else:
            compiled_gemm(
                m,
                n,
                k,
                batch_size,
                a_ptr,
                b_ptr,
                c_cute_tensor,
                stream=stream,
            )

        # Copy result back if original output was non-contiguous
        if c_needs_copy:
            c_tensor.copy_(c_buf)
4246+
# input: [M, K], weight: [N, K], output: [M, N]
@torch.library.custom_op("trtllm::cute_dsl_bf16_gemm_blackwell",
                         mutates_args=("output", ),
                         device_types="cuda")
def cute_dsl_bf16_gemm_blackwell(
    input: torch.Tensor,
    weight: torch.Tensor,
    output: torch.Tensor,
    use_tvm_ffi: bool = True,
) -> None:
    """
    CuTe DSL BF16 GEMM for Linear layers on Blackwell.

    Computes: output = input @ weight^T
    - input: [M, K] (num_tokens, in_features)
    - weight: [N, K] (out_features, in_features)
    - output: [M, N] (num_tokens, out_features)
    """
    # Blackwell-only kernel: refuse to run on any other SM family.
    if not is_sm_100f():
        raise ValueError(
            f"CuteDSL: SM version {get_sm_version()} is not supported. "
            f"CuteDSL BF16 GEMM only supports SM 100 family.")

    gemm_runner = CuteDSLBf16BlackwellGemmRunner(use_tvm_ffi=use_tvm_ffi)
    op_tensors = [input, weight, output]

    # Let the autotuner pick the best tactic for this problem shape,
    # then run the GEMM with it (output is mutated in place).
    _, best_tactic = AutoTuner.get().choose_one(
        "trtllm::cute_dsl_bf16_gemm_blackwell::gemm",
        [gemm_runner],
        CuteDSLBf16BlackwellGemmRunner.tuning_config,
        op_tensors,
    )
    gemm_runner(op_tensors, tactic=best_tactic)
4283+
@torch.library.register_fake("trtllm::cute_dsl_bf16_gemm_blackwell")
def _(
    mat_a: torch.Tensor,
    mat_b: torch.Tensor,
    output: torch.Tensor,
    use_tvm_ffi: bool = True,
) -> None:
    """
    Fake (meta) implementation for shape/dtype validation during tracing.

    Mirrors the real op's contract: mat_a is [M, K], mat_b is [N, K],
    output must be a preallocated bf16 tensor of shape [M, N]. Nothing
    is computed; the op mutates ``output`` in the real implementation.
    """
    m = mat_a.shape[0]
    n = mat_b.shape[0]
    assert output.dtype == torch.bfloat16, "CuTe DSL bf16 gemm output dtype must be bf16"
    assert output.shape == (m, n), "CuTe DSL bf16 gemm output shape is incorrect"
0 commit comments