[FRONTEND][BACKEND] Plumb the fast_math attribute from the dot_scaled frontend to LLVM codegen. When it is set, the NaN scale (0xff) check is skipped. (triton-lang#5582)

ptillet · web-flow · commit 22ac44735eab · 2025-01-11T23:33:33.000-08:00
diff --git a/include/triton/Conversion/TritonGPUToLLVM/Utility.h b/include/triton/Conversion/TritonGPUToLLVM/Utility.h
@@ -480,7 +480,8 @@ SmallVector<Value> convertMxfp4x2ToBf16x2(RewriterBase &rewriter, Location loc,
                                           ArrayRef<Value> values);
 
 // Scale a mxfp4 value by a given scale.
-Value mxfpScaleBf16(RewriterBase &rewriter, Location loc, Value v, Value scale);
+Value mxfpScaleBf16(RewriterBase &rewriter, Location loc, Value v, Value scale,
+                    bool fastMath);
 
 } // namespace LLVM
 
diff --git a/include/triton/Dialect/Triton/IR/TritonOps.td b/include/triton/Dialect/Triton/IR/TritonOps.td
@@ -690,7 +690,8 @@ def TT_DotScaledOp : TT_Op<"dot_scaled", [Pure,
       Optional<RankedTensorOf<[I8]>>:$lhs_scale,
       Optional<RankedTensorOf<[I8]>>:$rhs_scale,
       TT_ScaleDotElemTypeAttr:$lhs_type,
-      TT_ScaleDotElemTypeAttr:$rhs_type
+      TT_ScaleDotElemTypeAttr:$rhs_type,
+      BoolAttr:$fastMath
     );
 
     let results = (outs TT_FloatTensor:$d);
diff --git a/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td b/include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td
@@ -292,10 +292,13 @@ def TTG_UpcastMXFPOp : TTG_Op<"upcast_mxfp", [Pure]> {
     Compute the bf16 encoded in the given mxfp number as per
     https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
   }];
-  let arguments = (ins
-                   TT_Tensor:$src,
-                   TT_Tensor:$scale,
-                   TT_ScaleDotElemTypeAttr:$fp_type);
+  let arguments = (
+    ins
+    TT_Tensor:$src,
+    TT_Tensor:$scale,
+    TT_ScaleDotElemTypeAttr:$fp_type,
+    BoolAttr:$fastMath
+  );
   let results = (outs TT_Tensor:$result);
 
   let assemblyFormat = [{
diff --git a/lib/Conversion/TritonGPUToLLVM/Utility.cpp b/lib/Conversion/TritonGPUToLLVM/Utility.cpp
@@ -904,13 +904,15 @@ SmallVector<Value> convertMxfp4x2ToBf16x2(RewriterBase &rewriter, Location loc,
   return results;
 }
 
-Value mxfpScaleBf16(RewriterBase &rewriter, Location loc, Value v,
-                    Value scale) {
+Value mxfpScaleBf16(RewriterBase &rewriter, Location loc, Value v, Value scale,
+                    bool fastMath) {
   Value vBf16 = bitcast(v, bf16_ty);
-  Value nanBf16 = bitcast(i16_val(0x7fff), bf16_ty);
-  Value scaleIsNan = icmp_eq(scale, i8_val(0xff));
   Value scaleBf16 = bitcast(shl(zext(i16_ty, scale), i16_val(7)), bf16_ty);
   Value scaledBf16 = fmul(vBf16, scaleBf16);
+  if (fastMath)
+    return scaledBf16;
+  Value nanBf16 = bitcast(i16_val(0x7fff), bf16_ty);
+  Value scaleIsNan = icmp_eq(scale, i8_val(0xff));
   // Account for NaN in the scale as per the mxfp specification.
   return select(scaleIsNan, nanBf16, scaledBf16);
 };
diff --git a/lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp b/lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp
@@ -398,6 +398,7 @@ class DecomposeScaledBlocked
     auto scale = scaledDotOp.getLhsScale();
     auto aType = scaledDotOp.getLhsType();
     auto bType = scaledDotOp.getRhsType();
+    bool fastMath = scaledDotOp.getFastMath();
 
     auto rank = oldRetType.getShape().size();
     if (rank != 2)
@@ -510,15 +511,17 @@ class DecomposeScaledBlocked
       newScaleEncoding = LinearEncodingAttr::get(ctx, std::move(newLL));
     }
 
-    a = createArg(rewriter, a, 0, aType, newAEncoding, scale, newScaleEncoding);
+    a = createArg(rewriter, a, 0, aType, newAEncoding, scale, newScaleEncoding,
+                  fastMath);
 
     Operation *newDot = nullptr;
     if (versionMajor == 2) {
       // Upcast B operand
       assert(bType != ScaleDotElemType::E2M1 && "NYI: rhs scale for fp4");
       auto newBEncoding = DotOperandEncodingAttr::get(ctx, 1, mmaEnc, bKWidth);
       b = createArg(rewriter, b, 1, bType, newBEncoding,
-                    /*scale=*/std::nullopt, /*scaleEncoding=*/std::nullopt);
+                    /*scale=*/std::nullopt, /*scaleEncoding=*/std::nullopt,
+                    fastMath);
       newDot = rewriter.create<DotOp>(scaledDotOp.getLoc(), newRetType, a, b,
                                       newAcc);
     } else {
@@ -541,7 +544,7 @@ class DecomposeScaledBlocked
   createArg(mlir::PatternRewriter &rewriter, TypedValue<RankedTensorType> v,
             int idx, ScaleDotElemType type, std::optional<Attribute> vEncoding,
             std::optional<TypedValue<RankedTensorType>> opt_scale,
-            std::optional<Attribute> scaleEncoding) const {
+            std::optional<Attribute> scaleEncoding, bool fastMath) const {
     auto ctx = rewriter.getContext();
     // Create a new tensor with a given encoding or remove the encoding
     auto maybeWithEncoding =
@@ -576,7 +579,7 @@ class DecomposeScaledBlocked
       auto retTy = triton::gpu::UpcastMXFPOp::deduceOutputType(
           ret, type, Builder(v.getContext()).getBF16Type());
       ret = rewriter.create<triton::gpu::UpcastMXFPOp>(v.getLoc(), retTy, ret,
-                                                       scale, type);
+                                                       scale, type, fastMath);
     }
     return ret;
   }
@@ -589,6 +592,7 @@ class DecomposeScaledBlocked
     auto scale = scaledDotOp.getLhsScale();
     auto aType = scaledDotOp.getLhsType();
     auto bType = scaledDotOp.getRhsType();
+    bool fastMath = scaledDotOp.getFastMath();
 
     // create a DotOp to be passed in to getMMAVersionSafe
     // We don't pass encodings as we just want to get the type and shape
@@ -597,15 +601,16 @@ class DecomposeScaledBlocked
     // end up in the graph
     RankedTensorType aTType =
         createArg(rewriter, a, 0, aType, /*vEncoding=*/std::nullopt, scale,
-                  /*scaleEncoding=*/std::nullopt)
+                  /*scaleEncoding=*/std::nullopt, fastMath)
             .getType();
     auto aTypeNoEnc =
         RankedTensorType::get(aTType.getShape(), aTType.getElementType());
     a = rewriter.create<ConvertLayoutOp>(scaledDotOp.getLoc(), aTypeNoEnc, a);
 
     RankedTensorType bTType =
         createArg(rewriter, b, 1, bType, /*vEncoding=*/std::nullopt,
-                  /*scale=*/std::nullopt, /*scaleEncoding=*/std::nullopt)
+                  /*scale=*/std::nullopt, /*scaleEncoding=*/std::nullopt,
+                  fastMath)
             .getType();
     auto bTypeNoEnc =
         RankedTensorType::get(bTType.getShape(), bTType.getElementType());
@@ -752,7 +757,7 @@ static Operation *transposeDotOp(DotScaledOp dotOp) {
   Value result = builder.create<DotScaledOp>(
       dotOp.getLoc(), cTransposed.getType(), rhsTransposed, lhsTransposed,
       cTransposed, dotOp.getRhsScale(), dotOp.getLhsScale(), dotOp.getRhsType(),
-      dotOp.getLhsType());
+      dotOp.getLhsType(), dotOp.getFastMath());
   Operation *transposedResult =
       builder.create<TransOp>(result.getLoc(), result, transOrder);
   dotOp.replaceAllUsesWith(transposedResult);
diff --git a/python/src/ir.cc b/python/src/ir.cc
@@ -1513,10 +1513,12 @@ void init_triton_ir(py::module &&m) {
               std::optional<mlir::Value> &lhs_scale,
               ScaleDotElemType lhs_format, mlir::Value &rhs,
               std::optional<mlir::Value> &rhs_scale,
-              ScaleDotElemType rhs_format, mlir::Value &c) -> mlir::Value {
-             return self.create<DotScaledOp>(
-                 c.getType(), lhs, rhs, c, lhs_scale.value_or(Value()),
-                 rhs_scale.value_or(Value()), lhs_format, rhs_format);
+              ScaleDotElemType rhs_format, bool fast_math,
+              mlir::Value &c) -> mlir::Value {
+             return self.create<DotScaledOp>(c.getType(), lhs, rhs, c,
+                                             lhs_scale.value_or(Value()),
+                                             rhs_scale.value_or(Value()),
+                                             lhs_format, rhs_format, fast_math);
            })
       .def("create_floor",
            [](TritonOpBuilder &self, Value &val) -> Value {
diff --git a/python/triton/language/core.py b/python/triton/language/core.py
@@ -1733,7 +1733,8 @@ def dot(input, other, acc=None, input_precision=None, allow_tf32=None, max_num_i
 
 
 @builtin
-def dot_scaled(lhs, lhs_scale, lhs_format, rhs, rhs_scale, rhs_format, acc=None, out_dtype=float32, _builder=None):
+def dot_scaled(lhs, lhs_scale, lhs_format, rhs, rhs_scale, rhs_format, fast_math=False, acc=None, out_dtype=float32,
+               _builder=None):
     """
     Returns the matrix product of two blocks in microscaling format.
 
@@ -1763,7 +1764,8 @@ def dot_scaled(lhs, lhs_scale, lhs_format, rhs, rhs_scale, rhs_format, acc=None,
     """
     out_dtype = _constexpr_to_value(out_dtype)
     assert out_dtype == float32, "Only float32 is supported for out_dtype at the moment"
-    return semantic.dot_scaled(lhs, lhs_scale, lhs_format, rhs, rhs_scale, rhs_format, acc, out_dtype, _builder)
+    return semantic.dot_scaled(lhs, lhs_scale, lhs_format, rhs, rhs_scale, rhs_format, fast_math, acc, out_dtype,
+                               _builder)
 
 
 # -----------------------
diff --git a/python/triton/language/semantic.py b/python/triton/language/semantic.py
@@ -1562,7 +1562,8 @@ def _bitcast_to_fp_type(val: tl.tensor, float_format: str, builder: ir.builder):
 
 
 def dot_scaled(lhs: tl.tensor, lhs_scale: tl.tensor, lhs_format: str, rhs: tl.tensor, rhs_scale: Optional[tl.tensor],
-               rhs_format: str, acc: tl.tensor | None, out_dtype: tl.dtype, builder: ir.builder) -> tl.tensor:
+               rhs_format: str, fast_math: bool, acc: tl.tensor | None, out_dtype: tl.dtype,
+               builder: ir.builder) -> tl.tensor:
     assert lhs.type.is_block() and rhs.type.is_block()
     #TODO: validate types.
     lhs_rank = len(lhs.shape)
@@ -1601,7 +1602,7 @@ def dot_scaled(lhs: tl.tensor, lhs_scale: tl.tensor, lhs_format: str, rhs: tl.te
     lhs_scale_handle = None if lhs_scale_is_none else lhs_scale.handle
     return tl.tensor(
         builder.create_dot_scaled(lhs.handle, lhs_scale_handle, lhs_format_enum, rhs.handle, rhs_scale_handle,
-                                  rhs_format_enum, acc_handle), ret_ty)
+                                  rhs_format_enum, fast_math, acc_handle), ret_ty)
 
 
 # ===----------------------------------------------------------------------===//
diff --git a/test/Conversion/tritongpu_to_llvm.mlir b/test/Conversion/tritongpu_to_llvm.mlir
@@ -2110,7 +2110,15 @@ tt.func @upcast_mxfp(%arg0: tensor<32x32xi8, #ttg.dot_op<{opIdx = 0, parent = #m
   // CHECK-COUNT-4: llvm.inline_asm
   // CHECK-COUNT-2: nvvm.shfl.sync
   // CHECK-COUNT-32: llvm.fmul
-  %0 = ttg.upcast_mxfp %arg0, %arg1 fp_type = e2m1 : tensor<32x32xi8, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>, tensor<32x2xi8, #linear> -> tensor<32x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
+  // CHECK: llvm.icmp
+  // CHECK: llvm.select
+  %0 = ttg.upcast_mxfp %arg0, %arg1 fp_type = e2m1 {fastMath = false} : tensor<32x32xi8, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>, tensor<32x2xi8, #linear> -> tensor<32x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
+  // CHECK-COUNT-4: llvm.inline_asm
+  // CHECK-COUNT-2: nvvm.shfl.sync
+  // CHECK-COUNT-32: llvm.fmul
+  // CHECK-NOT: llvm.icmp
+  // CHECK-NOT: llvm.select
+  %1 = ttg.upcast_mxfp %arg0, %arg1 fp_type = e2m1 {fastMath = true} : tensor<32x32xi8, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>, tensor<32x2xi8, #linear> -> tensor<32x64xbf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
   tt.return
 }
 
diff --git a/test/TritonGPU/accelerate-matmul.mlir b/test/TritonGPU/accelerate-matmul.mlir
@@ -204,10 +204,10 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
     %b_bf16: tensor<64x128xbf16, #blocked>
     ) -> tensor<128x128xf32, #blocked> {
     // CHECK: ttg.convert_layout {{.*}} : tensor<128x2xi8, #blocked1> -> tensor<128x2xi8, {{.*}}>
-    // CHECK: ttg.upcast_mxfp {{.*}}, {{.*}} fp_type = e2m1 : tensor<128x32xi8, #ttg.dot_op<{{.*}}>>, tensor<128x2xi8, {{.*}}> -> tensor<128x64xbf16, #ttg.dot_op<{{.*}}>>
-    // CHECK: ttng.warp_group_dot
+    // CHECK: ttg.upcast_mxfp {{.*}}, {{.*}} fp_type = e2m1 {fastMath = false} : tensor<128x32xi8, #ttg.dot_op<{{.*}}>>, tensor<128x2xi8, {{.*}}> -> tensor<128x64xbf16, #ttg.dot_op<{{.*}}>>
+    // CHECK-NEXT: ttng.warp_group_dot {{.*}}
     %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #blocked>
-    %result = tt.dot_scaled %a scale %scale, %b_bf16, %cst lhs = e2m1 rhs = bf16 : tensor<128x32xi8, #blocked2>, tensor<128x2xi8, #blocked1> * tensor<64x128xbf16, #blocked> -> tensor<128x128xf32, #blocked>
+    %result = tt.dot_scaled %a scale %scale, %b_bf16, %cst lhs = e2m1 rhs = bf16 {fastMath = false} : tensor<128x32xi8, #blocked2>, tensor<128x2xi8, #blocked1> * tensor<64x128xbf16, #blocked> -> tensor<128x128xf32, #blocked>
     tt.return %result : tensor<128x128xf32, #blocked>
   }
 
@@ -220,9 +220,9 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
     ) -> tensor<128x128xf32, #blocked> {
     %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #blocked>
     // CHECK: ttg.convert_layout {{.*}} : tensor<128x2xi8, #blocked1> -> tensor<128x2xi8, #[[LINEAR]]>
-    // CHECK: ttg.upcast_mxfp {{.*}}, {{.*}} fp_type = e2m1 : tensor<128x32xi8, #ttg.dot_op<{{.*}}>>, tensor<128x2xi8, #[[LINEAR]]> -> tensor<128x64xbf16, #ttg.dot_op<{{.*}}>>
+    // CHECK: ttg.upcast_mxfp {{.*}}, {{.*}} fp_type = e2m1 {fastMath = true} : tensor<128x32xi8, #ttg.dot_op<{{.*}}>>, tensor<128x2xi8, #[[LINEAR]]> -> tensor<128x64xbf16, #ttg.dot_op<{{.*}}>>
     // CHECK: tt.dot
-    %result = tt.dot_scaled %a scale %scale, %b_fp8, %cst lhs = e2m1 rhs = e4m3 : tensor<128x32xi8, #blocked2>, tensor<128x2xi8, #blocked1> * tensor<64x128xf8E4M3FN, #blocked> -> tensor<128x128xf32, #blocked>
+    %result = tt.dot_scaled %a scale %scale, %b_fp8, %cst lhs = e2m1 rhs = e4m3 {fastMath = true} : tensor<128x32xi8, #blocked2>, tensor<128x2xi8, #blocked1> * tensor<64x128xf8E4M3FN, #blocked> -> tensor<128x128xf32, #blocked>
     tt.return %result : tensor<128x128xf32, #blocked>
   }
 }
@@ -246,7 +246,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
     %0 = scf.for %arg4 = %c0_i32 to %c100_i32 step %c1_i32 iter_args(%arg5 = %cst) -> (tensor<128x32xf32, #blocked1>)  : i32 {
       // CHECK-DAG: tt.trans %{{.*}} {order = array<i32: 1, 0>} : tensor<128x64xf8E4M3FN, #{{.*}}> -> tensor<64x128xf8E4M3FN, #{{.*}}>
       // CHECK-DAG: tt.trans %a{{.*}} {order = array<i32: 1, 0>} : tensor<32x32xi8, #{{.*}}> -> tensor<32x32xi8, #{{.*}}>
-      %3 = tt.dot_scaled %arg0, %arg1 scale %arg2, %arg5 lhs = e4m3 rhs = e2m1 : tensor<128x64xf8E4M3FN, #blocked> * tensor<32x32xi8, #blocked1>, tensor<32x2xi8, #blocked2> -> tensor<128x32xf32, #blocked1>
+      %3 = tt.dot_scaled %arg0, %arg1 scale %arg2, %arg5 lhs = e4m3 rhs = e2m1 {fastMath = false}: tensor<128x64xf8E4M3FN, #blocked> * tensor<32x32xi8, #blocked1>, tensor<32x2xi8, #blocked2> -> tensor<128x32xf32, #blocked1>
       // CHECK: tt.dot
       // CHECK-NOT: tt.trans
       // CHECK: scf.yield
diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp
@@ -613,7 +613,7 @@ class ScaledBlockedToMFMA final : public OpRewritePattern<triton::DotScaledOp> {
         ctx, {1, 1}, threadsPerWarp, blockWarpsPerCTA, {1, 0}, ctaLayout);
 
     auto upcastMXFP = [&](TensorValue v, TensorValue scale,
-                          ScaleDotElemType elemType) -> Value {
+                          ScaleDotElemType elemType, bool fastMath) -> Value {
       if (!scale)
         return v;
 
@@ -629,11 +629,13 @@ class ScaledBlockedToMFMA final : public OpRewritePattern<triton::DotScaledOp> {
       auto outputType =
           ttg::UpcastMXFPOp::deduceOutputType(v, elemType, outputElemType);
       return rewriter.create<ttg::UpcastMXFPOp>(dotOp.getLoc(), outputType, v,
-                                                convOp, elemType);
+                                                convOp, elemType, fastMath);
     };
 
-    Value scaledA = upcastMXFP(a, aScale, dotOp.getLhsType());
-    Value scaledB = upcastMXFP(b, bScale, dotOp.getRhsType());
+    Value scaledA =
+        upcastMXFP(a, aScale, dotOp.getLhsType(), dotOp.getFastMath());
+    Value scaledB =
+        upcastMXFP(b, bScale, dotOp.getRhsType(), dotOp.getFastMath());
     auto newDot = rewriter.create<DotOp>(dotOp.getLoc(), newRetType, scaledA,
                                          scaledB, newAcc);
     rewriter.replaceOpWithNewOp<ttg::ConvertLayoutOp>(dotOp, oldRetType,
diff --git a/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/UpcastMXFPToLLVM.cpp b/third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/UpcastMXFPToLLVM.cpp
@@ -149,8 +149,8 @@ class UpcastMXFPOpPattern : public ConvertOpToLLVMPattern<UpcastMXFPOp> {
             for (int k = 0; k < kWidth; ++k) {
               auto idx =
                   32 * i + 16 * mxfp + rep * 2 * kWidth + subTile * kWidth + k;
-              xVals[idx] =
-                  LLVM::mxfpScaleBf16(rewriter, loc, xVals[idx], si[subTile]);
+              xVals[idx] = LLVM::mxfpScaleBf16(rewriter, loc, xVals[idx],
+                                               si[subTile], op.getFastMath());
             }
           }
         }