Commit 8777917

CarstyYou authored and fredricz-20070104 committed

[TRTLLM-1234][feat] Add fp8 blockscaled Gemm for sm120 (NVIDIA#8844)

Signed-off-by: CarstyYou <[email protected]>
Signed-off-by: FredricZ-2007 <[email protected]>
1 parent c5e8d4f commit 8777917

File tree

5 files changed: +13 -13 lines changed


cpp/tensorrt_llm/kernels/cutlass_kernels/CMakeLists.txt

Lines changed: 1 addition & 1 deletion

@@ -205,7 +205,7 @@ set_cuda_architectures(fb_gemm_src 89 90 100f 120f)
 # ${INSTANTIATION_GENERATION_DIR}/fp8_rowwise_gemm)

 add_library(fp8_blockscale_gemm_src STATIC ${FP8_BLOCKSCALE_GEMM_SRC_CU})
-set_cuda_architectures(fp8_blockscale_gemm_src 89 90 100f)
+set_cuda_architectures(fp8_blockscale_gemm_src 89 90 100f 120f)

 set(GEMM_SWIGLU_SM90_SRC_CU
     ${CMAKE_CURRENT_SOURCE_DIR}/fused_gated_gemm/gemm_swiglu_e4m3.cu)
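The build change is the whole enabler for this commit: the fp8_blockscale_gemm_src target now also compiles for the 120f entry, matching the fb_gemm_src target shown in the hunk header. Assuming the suffix follows the usual CUDA convention, 100f/120f denote family-specific compute capabilities (SM 100 and SM 120), so without this line the kernels touched below would simply not exist in an SM 120 build.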

cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm_kernel.cuh

Lines changed: 8 additions & 9 deletions

@@ -1622,16 +1622,15 @@ void gemm_dispatch_sm89(void* mat_a, void* mat_b, void* mat_d, float* scales_a,
     dim3 grid = dim3(grid_m, grid_n, grid_k);
     dim3 block = dim3(kThreadCount, 1, 1);

-    if (kSmemSize > (48 << 10))
-    {
-        cudaFuncSetAttribute(ada_blockwise_gemm::sm89_fp8_gemm_1d1d_impl<GemmKernel>,
-            cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize);
-        auto result = cudaGetLastError();
-        TLLM_CHECK_WITH_INFO(result == cudaSuccess, "sm89 gemm kernel cannot launch: %s", cudaGetErrorString(result));
-    }
+    auto result = cudaFuncSetAttribute(ada_blockwise_gemm::sm89_fp8_gemm_1d1d_impl<GemmKernel>,
+        cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize);
+    TLLM_CHECK_WITH_INFO(result == cudaSuccess, "sm89 gemm kernel cannot launch: %s", cudaGetErrorString(result));

     ada_blockwise_gemm::sm89_fp8_gemm_1d1d_impl<GemmKernel>
         <<<grid, block, kSmemSize, stream>>>(shape_m, shape_n, shape_k, mat_a, mat_b, mat_d, scales_a, scales_b);
+
+    result = cudaGetLastError();
+    TLLM_CHECK_WITH_INFO(result == cudaSuccess, "sm89 gemm kernel runtime error: %s", cudaGetErrorString(result));
 }

 void fp8_gemm_run(__nv_fp8_e4m3* mat_a, int ld_a, __nv_fp8_e4m3* mat_b, int ld_b, __nv_bfloat16* mat_d, int ld_d,
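Besides enabling SM 120, this hunk tightens error handling: cudaFuncSetAttribute is now called unconditionally and its own return code is checked (the old code only raised the shared-memory cap when kSmemSize exceeded the 48 KiB default, and inspected cudaGetLastError() instead of the call's return value), and a second check after the asynchronous launch now catches launch failures. A minimal self-contained sketch of the same pattern, with a made-up dummy_kernel standing in for the TensorRT-LLM kernel:

// launch_check.cu - sketch of the set-attribute / launch / check pattern above.
// dummy_kernel and the sizes are hypothetical, for illustration only.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void dummy_kernel(float* out)
{
    extern __shared__ float smem[];
    smem[threadIdx.x] = static_cast<float>(threadIdx.x);
    __syncthreads();
    out[threadIdx.x] = smem[threadIdx.x];
}

int main()
{
    constexpr int kSmemSize = 64 << 10; // 64 KiB, above the 48 KiB default cap
    float* out = nullptr;
    cudaMalloc(&out, 256 * sizeof(float));

    // Check the API call's own return code. Setting a value at or below the
    // default cap is harmless, so no "> 48 KiB" guard is needed.
    cudaError_t result = cudaFuncSetAttribute(
        dummy_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize);
    if (result != cudaSuccess)
    {
        std::printf("cannot launch: %s\n", cudaGetErrorString(result));
        return 1;
    }

    dummy_kernel<<<1, 256, kSmemSize>>>(out);

    // Kernel launches are asynchronous; cudaGetLastError() picks up errors
    // detected at launch time (e.g. requesting too much shared memory).
    result = cudaGetLastError();
    if (result != cudaSuccess)
    {
        std::printf("runtime error: %s\n", cudaGetErrorString(result));
        return 1;
    }

    cudaDeviceSynchronize();
    cudaFree(out);
    return 0;
}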
@@ -1643,7 +1642,7 @@ void fp8_gemm_run(__nv_fp8_e4m3* mat_a, int ld_a, __nv_fp8_e4m3* mat_b, int ld_b
     }
 #ifndef PLACEHOLDER_KERNELS
     int arch = tensorrt_llm::common::getSMVersion();
-    if (arch == 89)
+    if (arch == 89 || arch == 120)
     {
         gemm_dispatch_sm89(mat_a, mat_b, mat_d, scales_a, scales_b, shape_m, shape_n, shape_k, stream);
         return;
@@ -1883,7 +1882,7 @@ void fp8_stride_batch_gemm_run(__nv_bfloat16 const* mat_a, __nv_fp8_e4m3* fp8_ma
     }

     int arch = tensorrt_llm::common::getSMVersion();
-    if (arch == 89)
+    if (arch == 89 || arch == 120)
     {
         strided_batch_gemm_dispatch_sm89(fp8_mat_a, ld_a, stride_a, fp8_mat_b, ld_b, stride_b, mat_d, ld_d, stride_d,
             scales_a, stride_scales_a, scales_b, num_problems, shape_m, shape_n, shape_k, stream);
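Both runtime entry points gate on tensorrt_llm::common::getSMVersion(), and SM 120 is simply routed through the existing SM 89 (Ada) dispatch path. For reference, a sketch of how such an SM version number is typically derived from device properties (an assumption; the actual getSMVersion() implementation is not part of this diff):

// sm_version.cu - presumed shape of a getSMVersion()-style helper:
// major * 10 + minor, e.g. compute capability 12.0 -> 120, 8.9 -> 89.
#include <cuda_runtime.h>

int getSMVersionSketch()
{
    int device = 0;
    cudaGetDevice(&device);
    cudaDeviceProp prop{};
    cudaGetDeviceProperties(&prop, device);
    return prop.major * 10 + prop.minor;
}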

cpp/tensorrt_llm/thop/fp8BlockScalingGemm.cpp

Lines changed: 1 addition & 0 deletions

@@ -209,6 +209,7 @@ extern torch::Tensor fp8_block_scaling_gemm(torch::Tensor const& mat1, torch::Te
     case 100: return fp8_block_scale_gemm_blackwell(mat1, mat2, mat1Scale, mat2Scale);
     case 90: return fp8_block_scaling_gemm_hopper(mat1, mat2, mat1Scale, mat2Scale);
     case 89: return fp8_block_scaling_gemm_ada(mat1, mat2, mat1Scale, mat2Scale);
+    case 120: return fp8_block_scaling_gemm_ada(mat1, mat2, mat1Scale, mat2Scale);
     default: TORCH_CHECK(false, "Unsupported SM version for FP8 block scaling GEMM");
     }
 }
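Note the mapping this switch establishes: SM 120 (Blackwell GeForce-class parts, compute capability 12.0) is dispatched to fp8_block_scaling_gemm_ada, the same path as SM 89, rather than to the SM 100 Blackwell path, which presumably depends on datacenter-only features. The CMake and kernel-side changes above are what make that reuse possible.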

tensorrt_llm/_torch/modules/attention.py

Lines changed: 1 addition & 1 deletion

@@ -648,7 +648,7 @@ def fp8_block_scaling_bmm_out(
     mat2_dequant: Optional[torch.Tensor] = None,
 ) -> torch.Tensor:
     sm_version = get_sm_version()
-    if sm_version == 90 or sm_version == 89:
+    if sm_version == 90 or sm_version == 89 or sm_version == 120:
         mat1_fp8, mat1_scale = torch.ops.trtllm.fp8_batched_quantize_1x128_permute102(
             mat1)
tests/unittest/_torch/thop/parallel/test_fp8_block_scale_gemm.py

Lines changed: 2 additions & 2 deletions

@@ -63,7 +63,7 @@ def test_fp8_block_scale_deep_gemm(dtype, m, k, n):


 @pytest.mark.skipif(
-    getSMVersion() != 100 and getSMVersion() != 89,
+    getSMVersion() != 100 and getSMVersion() != 89 and getSMVersion() != 120,
     reason="The test is for Blackwell and Ada only. Current SM is %d." %
     getSMVersion(),
 )
@@ -99,7 +99,7 @@ def test_fp8_block_scale_gemm(dtype, m, k, n):


 @pytest.mark.skipif(
-    getSMVersion() != 90 and getSMVersion() != 89,
+    getSMVersion() != 90 and getSMVersion() != 89 and getSMVersion() != 120,
     reason="The test is for Hopper and Ada only. Current SM is %d." %
     getSMVersion(),
 )
