perf: add 1x4x1 cluster shape for fp8 bmm M<16 cases (#1473)

ttyio · web-flow · commit 7e98d8b94ae6 · 2025-08-12T12:46:20.000-07:00
diff --git a/csrc/fp8_gemm_cutlass.jinja b/csrc/fp8_gemm_cutlass.jinja
@@ -20,6 +20,7 @@ namespace flashinfer {
 namespace gemm {
     INSTANCE_FP8_GEMM_TEMPLATE_SM100({{ type }}, {{ cta_m }}, {{ cta_n }}, {{ cta_k }}, 1, 1, 1, _1SM);
     INSTANCE_FP8_GEMM_TEMPLATE_SM100({{ type }}, {{ cta_m }}, {{ cta_n }}, {{ cta_k }}, 1, 2, 1, _1SM);
+    INSTANCE_FP8_GEMM_TEMPLATE_SM100({{ type }}, {{ cta_m }}, {{ cta_n }}, {{ cta_k }}, 1, 4, 1, _1SM);
     INSTANCE_FP8_GEMM_TEMPLATE_SM100({{ type }}, {{ cta_m }}, {{ cta_n }}, {{ cta_k }}, 2, 1, 1, _2SM);
     INSTANCE_FP8_GEMM_TEMPLATE_SM100({{ type }}, {{ cta_m }}, {{ cta_n }}, {{ cta_k }}, 2, 2, 1, _2SM);
 }  // namespace gemm
diff --git a/include/flashinfer/gemm/fp8_gemm_cutlass_template.h b/include/flashinfer/gemm/fp8_gemm_cutlass_template.h
@@ -83,6 +83,11 @@ size_t dispatchGemmClusterShapeSm100(__nv_fp8_e4m3 const* A, __nv_fp8_e4m3 const
                                                _2SM>(A, B, alpha, D, m, n, k, b, gemmConfig,
                                                      workspacePtr, workspaceBytes, stream);
       break;
+    case ClusterShape::ClusterShape_1x4x1:
+      return genericFp8GemmKernelLauncherSm100<T, arch, CTA_M_, CTA_N_, CTA_K_, Shape<_1, _4, _1>,
+                                               _1SM>(A, B, alpha, D, m, n, k, b, gemmConfig,
+                                                     workspacePtr, workspaceBytes, stream);
+      break;
     default:
       throw std::runtime_error("invalid config for fp8 gemm");
       break;
@@ -205,9 +210,8 @@ std::vector<CutlassGemmConfig> CutlassFp8GemmRunner<T>::getConfigs() const {
   };
 
   std::vector<ClusterShape> clusterShapes = {
-      ClusterShape::ClusterShape_1x1x1,
-      ClusterShape::ClusterShape_1x2x1,
-      ClusterShape::ClusterShape_2x1x1,
+      ClusterShape::ClusterShape_1x1x1, ClusterShape::ClusterShape_1x2x1,
+      ClusterShape::ClusterShape_1x4x1, ClusterShape::ClusterShape_2x1x1,
       ClusterShape::ClusterShape_2x2x1,
   };
   for (auto const& tile_config : tilesSm100) {