
Commit ff63ee2

[NVIDIA] Add native FP4 scaled_dot for SM120 (#8494)
### Summary

Implement native FP4 scaled matmul support for SM120, replacing the previous decomposition fallback. Supported combinations:

- mxfp4 x mxfp4
- nvfp4 x nvfp4 (the scale must be e4m3 and the scale group size is 16)

### Benchmark

E2E vLLM benchmark: Llama3-8B-Instruct, in_len=1024, out_len=1024, batch_size=128, on an RTX 5090 (thanks to @mobicham, who ran this benchmark):

```
Current main branch: mxfp4 x mxfp4: 61 sec
This PR:             mxfp4 x mxfp4: 33 sec
                     nvfp4 x nvfp4: 34.5 sec
```
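For context on the two formats above, a minimal illustrative sketch (plain NumPy, not code from this PR) of what block-scaled FP4 dequantization means numerically: mxfp4 pairs e2m1 values with power-of-two ue8m0 scales over groups of 32, while nvfp4 pairs them with e4m3 scales over groups of 16, as stated in the summary.

```python
import numpy as np

# Decoded e2m1 (fp4) magnitudes: {0, 0.5, 1, 1.5, 2, 3, 4, 6}, plus a sign bit.
E2M1_VALUES = np.array([0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0])

def dequant_row(fp4_vals, scales, group_size):
    """Multiply each group of `group_size` fp4 values by its decoded scale."""
    out = np.empty(fp4_vals.shape, dtype=np.float32)
    for g, s in enumerate(scales):
        lo, hi = g * group_size, (g + 1) * group_size
        out[lo:hi] = fp4_vals[lo:hi] * s
    return out

K = 64
vals = np.random.default_rng(0).choice(E2M1_VALUES, size=K)

# mxfp4: one ue8m0 (power-of-two) scale per 32 values.
mx = dequant_row(vals, scales=2.0 ** np.array([-1.0, 2.0]), group_size=32)
# nvfp4: one e4m3 scale per 16 values (per this PR: e4m3 scales, group size 16).
nv = dequant_row(vals, scales=np.array([0.875, 1.5, 2.0, 0.625]), group_size=16)
```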
1 parent b3e233e commit ff63ee2

File tree

5 files changed: +62 −32 lines

- lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp
- lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp
- python/test/unit/language/test_matmul.py
- test/TritonGPU/accelerate-matmul.mlir
- third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DotOpToLLVM/MMAv2.cpp


lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp

Lines changed: 0 additions & 6 deletions

```diff
@@ -1496,15 +1496,9 @@ LinearLayout chooseScaledWmmaScaleLayout(
 
 // PTX ISA - Warp-level MMA Block Scaling
 // https://docs.nvidia.com/cuda/parallel-thread-execution/#warp-level-block-scaling
-//
 // This function generates layouts for scale tensors used in scaled dot
 // operations.
-//
-// Supported .kind x scale_vec_size:
-//   mxf8f6f4 with UE8M0 scales -> .scale_vec::1X
-//
 // Implementation notes:
-// - We support only scale_vec::1X for now.
 // - We choose a fixed provider for A (thread-id-a = 0) and B (thread-id-b =
 //   0)
 // - We choose a fixed byte selector for A (byte-id-a = 0) and B (byte-id-b =
```

lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp

Lines changed: 0 additions & 10 deletions

```diff
@@ -672,15 +672,6 @@ class ScaledBlockedToMMA : public mlir::OpRewritePattern<triton::DotScaledOp> {
     if (numCTAs != 1) {
      return failure();
     }
-
-    // TODO: support mxfp4 variants.
-    if (!((dotOp.getAElemType() == ScaleDotElemType::E5M2 ||
-           dotOp.getAElemType() == ScaleDotElemType::E4M3) &&
-          (dotOp.getBElemType() == ScaleDotElemType::E5M2 ||
-           dotOp.getBElemType() == ScaleDotElemType::E4M3))) {
-      return rewriter.notifyMatchFailure(dotOp, "only E5M2/E4M3 is supported");
-    }
-
     // Skip if any scale is missing. This pattern requires both scales.
     if (!dotOp.getAScale() || !dotOp.getBScale())
       return failure();
@@ -759,7 +750,6 @@ class ScaledBlockedToMMA : public mlir::OpRewritePattern<triton::DotScaledOp> {
     };
 
     const auto mmaWarps = mmaResult.mmaEnc.getWarpsPerCTA(); // [wM, wN]
-
     // Convert scales to Linear layout
     auto convertScale = [&](Value scale, int opIdx) -> Value {
       auto ty = cast<RankedTensorType>(scale.getType());
```

python/test/unit/language/test_matmul.py

Lines changed: 4 additions & 2 deletions

```diff
@@ -1031,8 +1031,10 @@ def test_block_scale_fp4(M, N, K, BLOCK_M, BLOCK_N, BLOCK_K, VEC_SIZE, with_a_sc
     if is_cuda():
         if scale_type == "float8_e4m3fn" and not pack_along_k:
             pytest.skip("Packing along K is required for float8_e4m3fn")
-        if torch.cuda.get_device_capability()[0] != 10:
-            pytest.skip("Requires compute capability == 10")
+        if torch.cuda.get_device_capability()[0] != 10 and torch.cuda.get_device_capability()[0] != 12:
+            pytest.skip("Requires compute capability == 10 or 12")
+        if torch.cuda.get_device_capability()[0] == 12 and pack_along_k is False:
+            pytest.skip("Packing along M, N is not supported on SM120")
         if not (with_a_scale and with_b_scale):
             pytest.skip("None aScale/bScale is only tested on AMD backend for now")
     elif is_hip():
```

test/TritonGPU/accelerate-matmul.mlir

Lines changed: 7 additions & 10 deletions

```diff
@@ -699,21 +699,18 @@ module attributes {"ttg.target" = "cuda:120", "ttg.num-ctas" = 1 : i32, "ttg.num
 
 // -----
 
-// Verify that for SM_120 with FP4 inputs, tt.dot_scaled is decomposed into:
-// 1. ttg.fp4_to_fp for unpacking FP4 values
-// 2. Scale application with arith.mulf
-// 3. Regular tt.dot operation with MMA encoding
+// Verify that for SM_120 with FP4 inputs, tt.dot_scaled is preserved and
+// scales are converted to linear layout for hardware acceleration.
 
 #blocked2 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}>
 #blocked2_k = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [0, 1]}>
 
 module attributes {"ttg.target" = "cuda:120", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
-  // CHECK-LABEL: @sm120_dot_scaled_fp4_fallback
-  // CHECK-NOT: tt.dot_scaled
-  // CHECK: ttg.fp4_to_fp
-  // CHECK: tt.dot
-  // CHECK: #mma
-  tt.func public @sm120_dot_scaled_fp4_fallback(
+  // CHECK-LABEL: @sm120_dot_scaled_fp4_native
+  // CHECK-DAG: tt.dot_scaled
+  // CHECK-DAG: #linear
+  // CHECK-DAG: #linear1
+  tt.func public @sm120_dot_scaled_fp4_native(
     %a: tensor<128x32xi8, #blocked2_k>,
     %scale_a: tensor<128x2xi8, #blocked2>,
     %b: tensor<32x128xi8, #blocked2>,
```

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/DotOpToLLVM/MMAv2.cpp

Lines changed: 51 additions & 4 deletions

```diff
@@ -296,6 +296,9 @@ enum class TensorCoreType : uint8_t {
   FP32_FP8E4M3FN_FP8E5M2_FP32_SCALE_VEC_1X,
   FP32_FP8E4M3FN_FP8E4M3FN_FP32_SCALE_VEC_1X,
   //
+  FP32_FP4E2M1_FP4E2M1_FP32_SCALE_VEC_2X,
+  FP32_NVFP4_NVFP4_FP32_SCALE_VEC_4X,
+  //
   NOT_APPLICABLE,
 };
 
@@ -339,6 +342,8 @@ static Type getMmaRetType(TensorCoreType mmaType, MLIRContext *ctx) {
   case TensorCoreType::FP32_FP8E5M2_FP8E4M3FN_FP32_SCALE_VEC_1X:
   case TensorCoreType::FP32_FP8E4M3FN_FP8E5M2_FP32_SCALE_VEC_1X:
   case TensorCoreType::FP32_FP8E4M3FN_FP8E4M3FN_FP32_SCALE_VEC_1X:
+  case TensorCoreType::FP32_FP4E2M1_FP4E2M1_FP32_SCALE_VEC_2X:
+  case TensorCoreType::FP32_NVFP4_NVFP4_FP32_SCALE_VEC_4X:
     return fp32x4Ty;
   default:
     llvm::report_fatal_error("Unsupported mma type found");
@@ -367,6 +372,15 @@ static TensorCoreType getMmaTypeDotScaled(DotScaledOp op, RankedTensorType aTy,
         llvm::isa<Float8E4M3FNType>(bTy.getElementType())) {
       return TensorCoreType::FP32_FP8E4M3FN_FP8E4M3FN_FP32_SCALE_VEC_1X;
     }
+    if (op.getBElemType() == ScaleDotElemType::E2M1 &&
+        op.getAElemType() == ScaleDotElemType::E2M1) {
+      if (isa<mlir::Float8E4M3FNType>(
+              op.getBScale().getType().getElementType())) {
+        return TensorCoreType::FP32_NVFP4_NVFP4_FP32_SCALE_VEC_4X;
+      } else {
+        return TensorCoreType::FP32_FP4E2M1_FP4E2M1_FP32_SCALE_VEC_2X;
+      }
+    }
   }
   return TensorCoreType::NOT_APPLICABLE;
 }
@@ -493,6 +507,14 @@ inline static const std::map<TensorCoreType, std::string> mmaInstrPtxScaled = {
      "mma.sync.aligned.m16n8k32.row.col."
      "kind::mxf8f6f4.block_scale.scale_vec::"
      "1X.f32.e4m3.e4m3.f32.ue8m0"},
+    {TensorCoreType::FP32_FP4E2M1_FP4E2M1_FP32_SCALE_VEC_2X,
+     "mma.sync.aligned.m16n8k64.row.col."
+     "kind::mxf4nvf4.block_scale.scale_vec::"
+     "2X.f32.e2m1.e2m1.f32.ue8m0"},
+    {TensorCoreType::FP32_NVFP4_NVFP4_FP32_SCALE_VEC_4X,
+     "mma.sync.aligned.m16n8k64.row.col."
+     "kind::mxf4nvf4.block_scale.scale_vec::"
+     "4X.f32.e2m1.e2m1.f32.ue4m3"},
 };
 
 static void callMmaTuringInt8(PTXBuilder &builder, int b,
```
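As a hedged Python summary (illustrative only, not repo code) of the instruction selection added above: both FP4 variants use the same kind::mxf4nvf4 m16n8k64 MMA shape, and the scale element type decides between the mxfp4 and nvfp4 encodings.

```python
# Illustrative restatement of the new getMmaTypeDotScaled / mmaInstrPtxScaled
# entries for e2m1 x e2m1 operands, keyed on the scale element type.
def fp4_mma_instruction(scale_dtype: str) -> str:
    if scale_dtype == "e4m3":  # nvfp4 path -> scale_vec::4X, ue4m3 scales
        return ("mma.sync.aligned.m16n8k64.row.col."
                "kind::mxf4nvf4.block_scale.scale_vec::4X."
                "f32.e2m1.e2m1.f32.ue4m3")
    # mxfp4 path (ue8m0 scales) -> scale_vec::2X
    return ("mma.sync.aligned.m16n8k64.row.col."
            "kind::mxf4nvf4.block_scale.scale_vec::2X."
            "f32.e2m1.e2m1.f32.ue8m0")

print(fp4_mma_instruction("e4m3"))
```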
```diff
@@ -890,13 +912,12 @@ LogicalResult convertMMADotScaled(triton::DotScaledOp op,
   TensorCoreType mmaType =
       getMmaTypeDotScaled(op, aTensorTy, bTensorTy, dTensorTy);
 
-  NumRegisters numRegisters = {2, 1, 2};
-
   SmallVector<Value> unpackedAScale =
       unpackLLElements(op.getLoc(), adaptor.getAScale(), rewriter);
   SmallVector<Value> unpackedBScale =
       unpackLLElements(op.getLoc(), adaptor.getBScale(), rewriter);
 
+  NumRegisters numRegisters = {2, 1, 2};
   EmitMmaCallback emit = [&](PTXBuilder &builder, int b, int m, int n, int k,
                              mlir::triton::PTXInstr &mma, unsigned numMmaRets,
                              unsigned colsPerThread, unsigned batchOffset,
@@ -906,8 +927,34 @@ LogicalResult convertMMADotScaled(triton::DotScaledOp op,
     auto tb = TritonLLVMOpBuilder(op.getLoc(), rewriter);
     auto i32 = IntegerType::get(op->getContext(), 32);
 
-    Value aScaleValue = tb.zext(i32, unpackedAScale[m * repK + k]);
-    Value bScaleValue = tb.zext(i32, unpackedBScale[n * repK + k]);
+    auto packElements = [&](ArrayRef<Value> bytes, int loc,
+                            int numBytes) -> Value {
+      Value packed = tb.zext(i32, bytes[loc]);
+      for (int i = 1; i < numBytes; ++i) {
+        Value byte = tb.zext(i32, bytes[loc + i]);
+        Value shifted = tb.shl(byte, tb.i32_val(i * 8));
+        packed = tb.or_(packed, shifted);
+      }
+      return packed;
+    };
+
+    int scaleVecMode;
+    if (mmaInstrPtxScaled.at(mmaType).find("1X") != std::string::npos) {
+      scaleVecMode = 1;
+    } else if (mmaType ==
+               TensorCoreType::FP32_FP4E2M1_FP4E2M1_FP32_SCALE_VEC_2X) {
+      scaleVecMode = 2;
+    } else if (mmaType == TensorCoreType::FP32_NVFP4_NVFP4_FP32_SCALE_VEC_4X) {
+      scaleVecMode = 4;
+    } else {
+      llvm_unreachable("Unsupported scale vector mode!");
+    }
+    Value aScaleValue =
+        packElements(unpackedAScale, m * repK * scaleVecMode + k * scaleVecMode,
+                     scaleVecMode);
+    Value bScaleValue =
+        packElements(unpackedBScale, n * repK * scaleVecMode + k * scaleVecMode,
+                     scaleVecMode);
 
     BaseOffset base{numRegisters.m * m, numRegisters.n * n, numRegisters.k * k};
     callMmaScaled(builder, b, base, mma, numMmaRets, colsPerThread, aTable,
```
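To make the index arithmetic above concrete, here is a pure-Python model of the packing scheme (illustrative only; `pack_scales` and `rep_k` are simplified stand-ins for the `packElements` lambda and `repK`): for scale_vec::2X / ::4X, consecutive per-tile scale bytes are packed little-endian into the single 32-bit scale operand handed to the MMA.

```python
# Model of the scale-byte packing added in convertMMADotScaled.
def pack_scales(scale_bytes, m, k, rep_k, scale_vec_mode):
    # Same index math as the C++ code: scale_vec_mode bytes per (m, k) tile.
    base = m * rep_k * scale_vec_mode + k * scale_vec_mode
    packed = 0
    for i in range(scale_vec_mode):
        packed |= scale_bytes[base + i] << (8 * i)  # little-endian into an i32
    return packed

# Example: nvfp4 (scale_vec::4X) packs 4 consecutive e4m3 scale bytes.
scales = list(range(32))  # stand-in for the unpacked scale bytes
assert pack_scales(scales, m=1, k=0, rep_k=2, scale_vec_mode=4) == (
    8 | (9 << 8) | (10 << 16) | (11 << 24))
```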
