@@ -27,10 +27,10 @@
 #include <string>
 #include <vector>
 
-#include "6kd_blockwise_gemm/sm120_fp8_gemm_1d2d.cuh"
 #include "ada_blockwise_gemm/sm89_fp8_gemm_1d1d.cuh"
 #include "fp8_blockscale_mma_utils.cuh"
 #include "fp8_blockscale_tma_utils.cuh"
+#include "sm120_blockwise_gemm/sm120_fp8_gemm_1d1d.cuh"
 #include "tensorrt_llm/common/config.h"
 #include "tensorrt_llm/common/cudaUtils.h"
 #include "tensorrt_llm/common/logger.h"
@@ -1653,14 +1653,28 @@ void gemm_dispatch_sm120(void* mat_a, void* mat_b, void* mat_d, float* scales_a, |
     using Params = typename GemmKernel::Params;
     using Arguments = typename GemmKernel::Arguments;
     using ProblemShape = typename GemmKernel::ProblemShape;
+    ProblemShape problem_shape = make_shape((int) shape_m, (int) shape_n, (int) shape_k, 1);
 
     auto ptr_A = reinterpret_cast<ElementInput*>(mat_a);
     auto ptr_B = reinterpret_cast<ElementInput*>(mat_b);
     auto ptr_SFA = reinterpret_cast<ElementBlockScale*>(scales_a);
     auto ptr_SFB = reinterpret_cast<ElementBlockScale*>(scales_b);
     auto ptr_D = reinterpret_cast<ElementOutput*>(mat_d);
-    Arguments args = {ptr_A, ptr_B, ptr_SFA, ptr_SFB, ptr_D};
-    ProblemShape problem_shape = make_shape((int) shape_m, (int) shape_n, (int) shape_k, 1);
+
+    int32_t ld_a = shape_k;
+    int32_t stride_a = shape_m * shape_k;
+    int32_t ld_b = shape_k;
+    int32_t stride_b = shape_n * shape_k;
+    int32_t ld_d = shape_n;
+    int32_t stride_d = shape_m * shape_n;
+
+    typename KT::StrideA dA = make_stride(ld_a, Int<1>{}, stride_a);
+    typename KT::StrideB dB = make_stride(ld_b, Int<1>{}, stride_b);
+    typename KT::StrideSFA dSFA = KT::deduce_sfa_layout(problem_shape).stride();
+    typename KT::StrideSFB dSFB = KT::deduce_sfb_layout(problem_shape).stride();
+    typename KT::StrideD dD = make_stride(ld_d, Int<1>{}, stride_d);
+
+    Arguments args = {ptr_A, dA, ptr_B, dB, ptr_SFA, dSFA, ptr_SFB, dSFB, ptr_D, dD};
 
     Params kernel_params = GemmKernel::to_underlying_arguments(problem_shape, args);
     auto kernel_ptr = &cutlass::device_kernel<GemmKernel>;
@@ -1914,6 +1928,65 @@ void strided_batch_gemm_dispatch_sm89(__nv_fp8_e4m3* mat_a, int ld_a, int stride |
         stride_scales_b);
 }
 
+void strided_batch_gemm_dispatch_sm120(__nv_fp8_e4m3* mat_a, int ld_a, int stride_a, __nv_fp8_e4m3* mat_b, int ld_b,
+    int stride_b, __nv_bfloat16* mat_d, int ld_d, int stride_d, float* scales_a, int stride_scales_a, float* scales_b,
+    uint32_t num_problems, uint32_t shape_m, uint32_t shape_n, uint32_t shape_k, cudaStream_t stream,
+    int num_device_sms = kNumDeviceSMs)
+{
+    if (num_device_sms < 0)
+    {
+        num_device_sms = kNumDeviceSMs = tensorrt_llm::common::getMultiProcessorCount();
+    }
+    using ElementInput = cute::float_e4m3_t;
+    using ElementOutput = cute::bfloat16_t;
+    using ElementAccum = float;
+    using ElementBlockScale = int32_t;
+    using KT = sm120_blockscaled_gemm::SM120BlockScaledBuilder<32, 128>;
+    using GemmKernel = sm120_blockscaled_gemm::SM120BlockScaledKernel<KT>;
+    using Params = typename GemmKernel::Params;
+    using Arguments = typename GemmKernel::Arguments;
+    using ProblemShape = typename GemmKernel::ProblemShape;
+    ProblemShape problem_shape = make_shape((int) shape_m, (int) shape_n, (int) shape_k, (int) num_problems);
+
+    auto ptr_A = reinterpret_cast<ElementInput*>(mat_a);
+    auto ptr_B = reinterpret_cast<ElementInput*>(mat_b);
+    auto ptr_SFA = reinterpret_cast<ElementBlockScale*>(scales_a);
+    auto ptr_SFB = reinterpret_cast<ElementBlockScale*>(scales_b);
+    auto ptr_D = reinterpret_cast<ElementOutput*>(mat_d);
+
+    typename KT::StrideA dA = make_stride(ld_a, Int<1>{}, stride_a);
+    typename KT::StrideB dB = make_stride(ld_b, Int<1>{}, stride_b);
+    typename KT::StrideSFA dSFA = KT::deduce_sfa_layout(problem_shape).stride();
+    typename KT::StrideSFB dSFB = KT::deduce_sfb_layout(problem_shape).stride();
+    typename KT::StrideD dD = make_stride(ld_d, Int<1>{}, stride_d);
+
+    Arguments args = {ptr_A, dA, ptr_B, dB, ptr_SFA, dSFA, ptr_SFB, dSFB, ptr_D, dD};
+
+    Params kernel_params = GemmKernel::to_underlying_arguments(problem_shape, args);
+    auto kernel_ptr = &cutlass::device_kernel<GemmKernel>;
+
+    cudaFuncSetAttribute(kernel_ptr, cudaFuncAttributeMaxDynamicSharedMemorySize, GemmKernel::kSmemSize);
+    auto result = cudaGetLastError();
+    TLLM_CHECK_WITH_INFO(result == cudaSuccess, "sm120 gemm kernel cannot launch: %s", cudaGetErrorString(result));
+
+    cudaLaunchConfig_t launch_config;
+    cudaLaunchAttribute attrs[1];
+    attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
+    attrs[0].val.programmaticStreamSerializationAllowed = 1;
+
+    launch_config.gridDim = GemmKernel::get_grid_shape(kernel_params);
+    launch_config.blockDim = GemmKernel::get_block_shape();
+    launch_config.dynamicSmemBytes = GemmKernel::kSmemSize;
+    launch_config.stream = stream;
+    launch_config.attrs = attrs;
+    launch_config.numAttrs = 1;
+
+    cudaLaunchKernelEx(&launch_config, kernel_ptr, kernel_params);
+
+    result = cudaGetLastError();
+    TLLM_CHECK_WITH_INFO(result == cudaSuccess, "sm120 gemm kernel runtime error: %s", cudaGetErrorString(result));
+}
+
 void fp8_stride_batch_gemm_run(__nv_bfloat16 const* mat_a, __nv_fp8_e4m3* fp8_mat_a, float* scales_a, int ld_a,
     int stride_a, int stride_scales_a, __nv_bfloat16 const* mat_b, __nv_fp8_e4m3* fp8_mat_b, float* scales_b, int ld_b,
     int stride_b, __nv_bfloat16* mat_d, int ld_d, int stride_d, uint32_t num_problems, uint32_t shape_m,
@@ -1941,12 +2014,18 @@ void fp8_stride_batch_gemm_run(__nv_bfloat16 const* mat_a, __nv_fp8_e4m3* fp8_ma |
     }
 
     int arch = tensorrt_llm::common::getSMVersion();
-    if (arch == 89 || arch == 120)
+    if (arch == 89)
     {
         strided_batch_gemm_dispatch_sm89(fp8_mat_a, ld_a, stride_a, fp8_mat_b, ld_b, stride_b, mat_d, ld_d, stride_d,
             scales_a, stride_scales_a, scales_b, num_problems, shape_m, shape_n, shape_k, stream);
         return;
     }
+    if (arch == 120)
+    {
+        strided_batch_gemm_dispatch_sm120(fp8_mat_a, ld_a, stride_a, fp8_mat_b, ld_b, stride_b, mat_d, ld_d, stride_d,
+            scales_a, stride_scales_a, scales_b, num_problems, shape_m, shape_n, shape_k, stream);
+        return;
+    }
     if (kDeepGemmEnabled)
     {
         strided_batch_gemm_dispatch(fp8_mat_a, ld_a, stride_a, fp8_mat_b, ld_b, stride_b, mat_d, ld_d, stride_d,
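Note on the stride convention used above: both the single-problem and the batched sm120 paths build CuTe stride tuples of the form `(ld, 1, batch_stride)` over an `(M/N, K, L)` shape, so A and B are K-contiguous and D is N-contiguous, with the third mode stepping between problems. The following minimal host-side sketch is not part of the change; it only reproduces the `make_stride(ld_a, Int<1>{}, stride_a)` construction for A with CuTe and prints the flat offset one coordinate maps to. The shape values are made up, and it assumes a CUTLASS/CuTe include path plus nvcc.

```cpp
// Hypothetical illustration only: how a (ld, 1, batch_stride) stride tuple
// addresses a batched, K-contiguous A tensor of shape (M, K, L).
#include <cstdio>
#include <cute/layout.hpp>

int main()
{
    using namespace cute;

    int shape_m = 4, shape_k = 8, num_problems = 2;
    int ld_a = shape_k;               // one row of A spans shape_k elements
    int stride_a = shape_m * shape_k; // one batch of A spans M * K elements

    // Same construction as StrideA in the dispatch functions above.
    auto layout_a = make_layout(
        make_shape(shape_m, shape_k, num_problems), make_stride(ld_a, Int<1>{}, stride_a));

    // Element (m=1, k=3) of batch l=1 lives at 1*ld_a + 3*1 + 1*stride_a = 8 + 3 + 32 = 43.
    std::printf("offset of (1, 3, 1): %d\n", static_cast<int>(layout_a(1, 3, 1)));
    return 0;
}
```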