
Commit 33e7dc2

[BACKEND] Implement BF16x3 trick (#7592)
**Update:** I have found that for better performance we should use 3 to 6 BF16 dot products, but not more. My findings are at: https://gist.github.com/plotfi/72554bd410ea55d8ae67b501c69b2766. The short version is that the Triton Bench tutorial matmul with F32 inputs benefits by 60-70% using 3 BF16 dots, or 10-15% using 6 BF16 dots. I think this is sufficient to move forward with as a replacement for TF32 on MI350, and it is in line with what hipBLASLt does: https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipblaslt/tensilelite/Tensile/Components/LocalRead.py#L288-L330. There is a similar implementation in XLA as well: https://github.com/openxla/xla/blob/e33f93fb7220d408811afdc926cf10baaf49c64e/xla/backends/gpu/codegen/triton/dot_algorithms.cc#L152

--------

Implements emulation of a 32-bit floating point dot operation using 3 BF16s. This is based on https://arxiv.org/abs/1904.06376 and works because the mantissas of 3 BF16s add up to the mantissa of an fp32.

Storing 1 fp32 in 3 bf16s:

```python
import torch

def BF16(v):
    return v.to(torch.bfloat16)

def FP32(v):
    return v.to(torch.float32)

def BF16x3(v):
    b0 = BF16(v)
    b1 = BF16(v - FP32(b0))
    b2 = BF16(v - FP32(b0) - FP32(b1))
    return (b0, b1, b2)

original = torch.rand(1, 1, dtype=torch.float32)
bf16x3 = BF16x3(original)
```

Emulating multiplication of two fp32s:

```python
def mul_bf16x3(a, b, c):
    a0, a1, a2 = BF16x3(a)
    b0, b1, b2 = BF16x3(b)
    c = c + (a0 * b0)  # low low
    c = c + (a1 * b0)  # mid low
    c = c + (a0 * b1)  # low mid
    c = c + (a1 * b1)  # mid mid
    c = c + (a0 * b2)  # low hi
    c = c + (a2 * b0)  # hi low
    c = c + (a1 * b2)  # mid hi
    c = c + (a2 * b1)  # hi mid
    c = c + (a2 * b2)  # hi hi
    return c

a = torch.rand(1, 1, dtype=torch.float32)
b = torch.rand(1, 1, dtype=torch.float32)
c = torch.zeros(1, 1, dtype=torch.float32)  # accumulator
result = mul_bf16x3(a, b, c)
```

The BF16x3 emulation is used when invoking tl.dot with input precision 'BF16x3'. The pass is implemented in a GPU-agnostic manner, but it is needed primarily because the MI350 lacks TF32 support. That part is a work in progress and will be based on this patch.
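For context on the user-facing side, here is a minimal usage sketch (mine, not part of this commit) that exercises the new input precision from the Python API; the single-block kernel, shapes, and launch parameters are assumptions for illustration only:

```python
import torch
import triton
import triton.language as tl

@triton.jit
def small_dot_kernel(a_ptr, b_ptr, c_ptr,
                     M: tl.constexpr, N: tl.constexpr, K: tl.constexpr):
    # Illustrative single-block kernel: load full f32 tiles and let the pass
    # decompose the dot into BF16 dots via input_precision="bf16x3".
    offs_m = tl.arange(0, M)
    offs_n = tl.arange(0, N)
    offs_k = tl.arange(0, K)
    a = tl.load(a_ptr + offs_m[:, None] * K + offs_k[None, :])
    b = tl.load(b_ptr + offs_k[:, None] * N + offs_n[None, :])
    acc = tl.zeros((M, N), dtype=tl.float32)
    acc = tl.dot(a, b, acc, input_precision="bf16x3")  # or "bf16x6"
    tl.store(c_ptr + offs_m[:, None] * N + offs_n[None, :], acc)

a = torch.randn(32, 32, device="cuda", dtype=torch.float32)
b = torch.randn(32, 32, device="cuda", dtype=torch.float32)
c = torch.empty_like(a)
small_dot_kernel[(1,)](a, b, c, M=32, N=32, K=32)
print((c - a @ b).abs().max())  # expect a small error vs. full fp32
```

With `input_precision="bf16x3"` the new pass decomposes the f32 `tl.dot`; the test changes in this commit check that on NVIDIA this lowers to bf16 MMA instructions rather than tf32 ones.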
1 parent 3251bb8 commit 33e7dc2

File tree

12 files changed: +243 -51 lines changed


include/triton/Dialect/Triton/IR/TritonAttrDefs.td

Lines changed: 3 additions & 1 deletion
@@ -129,7 +129,9 @@ def TT_InputPrecisionAttr : I32EnumAttr<
     [
       I32EnumAttrCase<"TF32", 0, "tf32">,
       I32EnumAttrCase<"TF32x3", 1, "tf32x3">,
-      I32EnumAttrCase<"IEEE", 2, "ieee">
+      I32EnumAttrCase<"IEEE", 2, "ieee">,
+      I32EnumAttrCase<"BF16x3", 3, "bf16x3">,
+      I32EnumAttrCase<"BF16x6", 4, "bf16x6">
     ]>{
   let cppNamespace = "::mlir::triton";
 }

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 3 additions & 1 deletion
@@ -664,9 +664,11 @@ def TT_DotOp : TT_Op<"dot", [Pure,
 
   let description = [{
     $d = matrix_multiply($a, $b) + $c. $inputPrecision describes how to exercise the TC
-    when the inputs are f32. It can be one of: tf32, tf32x3, ieee.
+    when the inputs are f32. It can be one of: tf32, tf32x3, ieee, bf16x3, bf16x6.
     tf32: use TC with tf32 ops.
     tf32x3: implement the 3xTF32 trick. For more info see the pass in F32DotTC.cpp
+    bf16x3: implement the 3xBF16 trick. For more info see the pass in F32DotTC.cpp
+    bf16x6: implement the 6xBF16 trick. For more info see the pass in F32DotTC.cpp
     ieee: don't use TC, implement dot in software.
     If the GPU does not have Tensor cores or the inputs are not f32, this flag is ignored.
   }];

include/triton/Dialect/TritonGPU/Transforms/Passes.td

Lines changed: 12 additions & 5 deletions
@@ -177,15 +177,22 @@ def TritonGPUPartitionScheduling : Pass<"tritongpu-partition-scheduling", "mlir:
 }
 
 def TritonGPUF32DotTC : Pass<"tritongpu-F32DotTC", "mlir::ModuleOp"> {
-  let summary = "3xTF32 trick";
+  let summary = "Emulate dot-product tensor core precision using TF32s or BF16s";
 
   let description = [{
-    Decompose fp32 `DotOp` instructions into 4 pointwise ops and 3 fp16 `DotOp`s
-    to allow using TensorCores. See https://github.com/NVIDIA/cutlass/discussions/385
+    Generic pass to emulate/decompose f32 `DotOp` instructions.
+    * Decompose fp32 `DotOp` instructions into 4 pointwise ops and 3 fp16 `DotOp`s
+      to allow using TensorCores. See https://github.com/NVIDIA/cutlass/discussions/385.
+    * Decompose fp32 `DotOp` instructions into BF16 operations.
+      See https://arxiv.org/abs/1904.06376
   }];
 
-  let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
-                           "mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect"];
+  let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect"];
+  let options = [
+    Option<"emuTF32", "emu-tf32",
+           "bool", /*default*/"false",
+           "whether to handle InputPrecision TF32xN for Nvidia GPUs">
+  ];
 }
 
 def TritonGPUPrefetch : Pass<"tritongpu-prefetch", "mlir::ModuleOp"> {

lib/Dialect/TritonGPU/Transforms/F32DotTC.cpp

Lines changed: 136 additions & 35 deletions
@@ -2,15 +2,134 @@
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "triton/Dialect/TritonGPU/Transforms/Passes.h"
 
-namespace mlir {
-namespace triton {
-namespace gpu {
+namespace mlir::triton::gpu {
 
 #define GEN_PASS_DEF_TRITONGPUF32DOTTC
 #include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc"
 
 namespace {
 
+template <typename T>
+auto convertValue(Value value, const FloatType &scalarToType,
+                  PatternRewriter &rewriter) -> mlir::Value {
+  auto fromType = cast<RankedTensorType>(value.getType());
+  auto toType = fromType.cloneWith(std::nullopt, scalarToType);
+  return rewriter.create<T>(value.getLoc(), toType, value).getResult();
+}
+
+auto splitF32(Value input, unsigned N, PatternRewriter &rewriter)
+    -> llvm::SmallVector<Value, 3> {
+  llvm::SmallVector<Value, 3> splitInputs;
+  for (unsigned i = 0; i < N; ++i) {
+    Value inputAsBF16 =
+        convertValue<arith::TruncFOp>(input, rewriter.getBF16Type(), rewriter);
+    if (i != N - 1) {
+      Value inputAsF32 = convertValue<arith::ExtFOp>(
+          inputAsBF16, rewriter.getF32Type(), rewriter);
+      input = rewriter.create<arith::SubFOp>(input.getLoc(), input, inputAsF32);
+    }
+    splitInputs.push_back(inputAsBF16);
+  }
+  return splitInputs;
+}
+
+bool isF32(Value operand) {
+  return cast<RankedTensorType>(operand.getType()).getElementType().isF32();
+};
+
+Value zeroLike(Value c, PatternRewriter &rewriter) {
+  return rewriter.create<SplatOp>(c.getLoc(), c.getType(),
+                                  rewriter.create<arith::ConstantOp>(
+                                      c.getLoc(), rewriter.getF32FloatAttr(0)));
+};
+
+Value dot(Value lhs, Value rhs, Value acc, PatternRewriter &rewriter,
+          InputPrecision precision = InputPrecision::IEEE,
+          uint32_t maxNumImpreciseAcc = 0) {
+  return rewriter.create<DotOp>(lhs.getLoc(), lhs, rhs, acc, precision,
+                                maxNumImpreciseAcc);
+};
+
+Value replaceNansWithZeros(Value value, PatternRewriter &rewriter) {
+  auto nans = rewriter.create<arith::CmpFOp>(
+      value.getLoc(), arith::CmpFPredicate::UNO, value, value);
+  auto zero = zeroLike(value, rewriter);
+  return rewriter.create<arith::SelectOp>(value.getLoc(), nans, zero, value);
+};
+
+unsigned getBF16Count(triton::InputPrecision precision) {
+  switch (precision) {
+  default:
+    return 0;
+  case InputPrecision::BF16x3:
+    // BF16x3 only needs the first 2 values derived from splitting an F32
+    return 2;
+  case InputPrecision::BF16x6:
+    return 3;
+  }
+}
+
+// Implements 3xBF16 https://arxiv.org/abs/1904.06376
+// See also
+// https://github.com/openxla/xla/blob/e33f93fb7220d408811afdc926cf10baaf49c64e/xla/backends/gpu/codegen/triton/dot_algorithms.cc#L152
+// As well as
+// https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipblaslt/tensilelite/Tensile/Components/LocalRead.py#L288-L330
+struct BF16xN : public OpRewritePattern<DotOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(DotOp dotOp,
+                                PatternRewriter &rewriter) const override {
+    // BF16 indices and count
+    const unsigned hi = 0;
+    const unsigned mid = 1;
+    const unsigned lo = 2;
+    const unsigned N = getBF16Count(dotOp.getInputPrecision());
+
+    if (!isF32(dotOp.getA()) || !isF32(dotOp.getB()) || !N)
+      return failure();
+
+    // Starting Values: a(0), a(1), a(2), b(0), b(1), b(2) and zero accumulator
+    const auto lhs_parts = splitF32(dotOp.getA(), N, rewriter);
+    const auto rhs_parts = splitF32(dotOp.getB(), N, rewriter);
+    auto result = zeroLike(dotOp.getC(), rewriter);
+
+    switch (dotOp.getInputPrecision()) {
+    default:
+      assert(false && "BF16DotTCPass expects BF16x6 or BF16x3");
+      return failure();
+
+      // clang-format off
+      // NOTE: 9 dots possible; handled like so if not for lack of speedup:
+      // case InputPrecision::BF16x9:
+      //   result = dot(lhs_parts[lo], rhs_parts[lo], result, rewriter);
+      //   result = dot(lhs_parts[mid], rhs_parts[lo], result, rewriter);
+      //   result = dot(lhs_parts[lo], rhs_parts[mid], result, rewriter);
+      // clang-format on
+
+    case InputPrecision::BF16x6:
+      result = dot(lhs_parts[mid], rhs_parts[mid], result, rewriter);
+
+      result = dot(lhs_parts[lo], rhs_parts[hi], result, rewriter);
+      result = dot(lhs_parts[hi], rhs_parts[lo], result, rewriter);
+
+    case InputPrecision::BF16x3:
+      result = dot(lhs_parts[mid], rhs_parts[hi], result, rewriter);
+      result = dot(lhs_parts[hi], rhs_parts[mid], result, rewriter);
+      result = replaceNansWithZeros(result, rewriter);
+
+      // NOTE: For BF16x1 bail without replaceNansWithZeros
+      // case InputPrecision::BF16x1: break;
+    }
+
+    result = dot(lhs_parts[hi], rhs_parts[hi], result, rewriter);
+    result =
+        rewriter.create<arith::AddFOp>(dotOp.getLoc(), result, dotOp.getC());
+
+    rewriter.replaceOp(dotOp, result);
+    return success();
+  }
+};
+
 // nb. We call the trick TF32x3 as C++ disallows variables starting with numbers
 // Implement 3xTF32 trick https://github.com/NVIDIA/cutlass/discussions/385
 // For a, b f32
@@ -28,11 +147,6 @@ class TF32x3 : public OpRewritePattern<DotOp> {
 
   LogicalResult matchAndRewrite(DotOp dotOp,
                                 PatternRewriter &rewriter) const override {
-
-    auto isF32 = [](Value operand) {
-      return cast<RankedTensorType>(operand.getType()).getElementType().isF32();
-    };
-
     if (!(dotOp.getInputPrecision() == InputPrecision::TF32x3 &&
           isF32(dotOp.getA()) && isF32(dotOp.getB()))) {
       return failure();
@@ -47,41 +161,25 @@ class TF32x3 : public OpRewritePattern<DotOp> {
                                           ArrayRef<Value>{value})
           .getResult()[0];
     };
-    auto zeroLike = [&](Value c) -> Value {
-      return rewriter.create<SplatOp>(
-          dotOp->getLoc(), c.getType(),
-          rewriter.create<arith::ConstantOp>(dotOp->getLoc(),
-                                             rewriter.getF32FloatAttr(0)));
-    };
     auto add = [&](Value a, Value b) -> Value {
      return rewriter.create<arith::AddFOp>(dotOp.getLoc(), a, b);
     };
    auto sub = [&](Value a, Value b) -> Value {
      return rewriter.create<arith::SubFOp>(dotOp.getLoc(), a, b);
    };
-    auto dot = [&](Value a, Value b, Value c) -> Value {
-      return rewriter.create<DotOp>(dotOp->getLoc(), c.getType(), a, b, c,
-                                    InputPrecision::TF32,
-                                    dotOp.getMaxNumImpreciseAcc());
-    };
-    auto replaceNansWithZeros = [&](Value value) -> Value {
-      auto nans = rewriter.create<arith::CmpFOp>(
-          dotOp->getLoc(), arith::CmpFPredicate::UNO, value, value);
-      auto zero = zeroLike(value);
-      return rewriter.create<arith::SelectOp>(dotOp->getLoc(), nans, zero,
-                                              value);
-    };
 
     auto aBig = f32ToTF32(dotOp.getA());
     auto aSmall = sub(dotOp.getA(), aBig);
 
     auto bBig = f32ToTF32(dotOp.getB());
     auto bSmall = sub(dotOp.getB(), bBig);
 
-    auto zero = zeroLike(dotOp.getC());
+    auto zero = zeroLike(dotOp.getC(), rewriter);
 
-    auto dot1 = dot(aSmall, bBig, zero);
-    auto dot2 = dot(aBig, bSmall, dot1);
+    auto dot1 = dot(aSmall, bBig, zero, rewriter, InputPrecision::TF32,
+                    dotOp.getMaxNumImpreciseAcc());
+    auto dot2 = dot(aBig, bSmall, dot1, rewriter, InputPrecision::TF32,
+                    dotOp.getMaxNumImpreciseAcc());
 
     // If lhs is 1.0, we will have lhs_high = 1.0 and lhs_low = 0.0.
     // If rhs is +infinity, we will have:
@@ -90,8 +188,9 @@ class TF32x3 : public OpRewritePattern<DotOp> {
     // We would get the wrong result if we sum these partial products. Instead,
     // we must override any accumulated result if the last partial product is
     // non-finite.
-    auto dot2withZeroedNans = replaceNansWithZeros(dot2);
-    auto dot3 = dot(aBig, bBig, dot2withZeroedNans);
+    auto dot2withZeroedNans = replaceNansWithZeros(dot2, rewriter);
+    auto dot3 = dot(aBig, bBig, dot2withZeroedNans, rewriter,
+                    InputPrecision::TF32, dotOp.getMaxNumImpreciseAcc());
 
     auto sum = add(dot3, dotOp.getC());
 
@@ -103,18 +202,20 @@ class TF32x3 : public OpRewritePattern<DotOp> {
 } // anonymous namespace
 
 struct F32DotTCPass : public impl::TritonGPUF32DotTCBase<F32DotTCPass> {
+  using impl::TritonGPUF32DotTCBase<F32DotTCPass>::TritonGPUF32DotTCBase;
   void runOnOperation() override {
     MLIRContext *context = &getContext();
     ModuleOp m = getOperation();
 
     RewritePatternSet decomposePatterns(context);
-    decomposePatterns.add<TF32x3>(context);
+    if (this->emuTF32) {
+      decomposePatterns.add<TF32x3>(context);
+    }
+    decomposePatterns.add<BF16xN>(context);
    if (applyPatternsGreedily(m, std::move(decomposePatterns)).failed()) {
      signalPassFailure();
    }
  }
 };
 
-} // namespace gpu
-} // namespace triton
-} // namespace mlir
+} // namespace mlir::triton::gpu
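To make the term selection above easier to follow, here is a host-side sketch (an illustration written for this note, not code from the commit) of the same decomposition: BF16x3 splits each f32 operand into N=2 bf16 terms and issues 3 dots, BF16x6 uses N=3 terms and 6 dots, and in both cases NaNs from the earlier partial products are zeroed before the final hi*hi dot is accumulated.

```python
import torch

def split_f32(x, n):
    # Mirrors splitF32: repeatedly truncate to bf16 and subtract the residual.
    parts, rem = [], x
    for i in range(n):
        p = rem.to(torch.bfloat16)
        if i != n - 1:
            rem = rem - p.to(torch.float32)
        parts.append(p.to(torch.float32))  # widen back; each part is bf16-exact
    return parts  # parts[0]=hi, parts[1]=mid, (parts[2]=lo when n == 3)

def dot_bf16xN(a, b, precision="bf16x3"):
    n = 2 if precision == "bf16x3" else 3
    hi, mid, lo = 0, 1, 2
    A, B = split_f32(a, n), split_f32(b, n)
    acc = torch.zeros(a.shape[0], b.shape[1])
    if precision == "bf16x6":
        acc = acc + A[mid] @ B[mid]
        acc = acc + A[lo] @ B[hi]
        acc = acc + A[hi] @ B[lo]
    acc = acc + A[mid] @ B[hi]
    acc = acc + A[hi] @ B[mid]
    acc = torch.where(torch.isnan(acc), torch.zeros_like(acc), acc)
    acc = acc + A[hi] @ B[hi]
    return acc

a, b = torch.randn(64, 64), torch.randn(64, 64)
for p in ("bf16x3", "bf16x6"):
    err = (dot_bf16xN(a, b, p) - a @ b).abs().max()
    print(p, float(err))
```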

python/src/ir.cc

Lines changed: 2 additions & 0 deletions
@@ -308,6 +308,8 @@ void init_triton_ir(py::module &&m) {
       .value("TF32", InputPrecision::TF32)
       .value("TF32x3", InputPrecision::TF32x3)
       .value("IEEE", InputPrecision::IEEE)
+      .value("BF16x3", InputPrecision::BF16x3)
+      .value("BF16x6", InputPrecision::BF16x6)
       .export_values();
 
   py::enum_<ScaleDotElemType>(m, "ScaleDotElemTypeTY", py::module_local())

python/src/passes.cc

Lines changed: 1 addition & 1 deletion
@@ -71,7 +71,7 @@ void init_triton_passes_ttgpuir(py::module &&m) {
   ADD_PASS_WRAPPER_0("add_accelerate_matmul", createTritonGPUAccelerateMatmul);
   ADD_PASS_WRAPPER_0("add_reorder_instructions",
                      createTritonGPUReorderInstructions);
-  ADD_PASS_WRAPPER_0("add_f32_dot_tc", createTritonGPUF32DotTC);
+  ADD_PASS_OPTION_WRAPPER_1("add_f32_dot_tc", createTritonGPUF32DotTC, bool);
   ADD_PASS_OPTION_WRAPPER_1("add_optimize_dot_operands",
                             createTritonGPUOptimizeDotOperands, bool);
   ADD_PASS_WRAPPER_0("add_remove_layout_conversions",

python/test/unit/language/test_core.py

Lines changed: 11 additions & 3 deletions
@@ -3084,7 +3084,7 @@ def get_test_dot_base_cases():
     return [(*shape, 4, False, False, epilogue, input_precision, in_dtype, out_dtype, 1, None)
             for shape in [(64, 64, 64), (32, 32, 32), (16, 16, 16)]
             for epilogue in ['none', 'trans', 'add-matrix', 'add-rows', 'add-cols', 'softmax', 'chain-dot']
-            for input_precision in ['tf32', 'tf32x3', 'ieee']
+            for input_precision in ['tf32', 'tf32x3', 'ieee', 'bf16x3', 'bf16x6']
             for in_dtype, out_dtype in [('float16', 'float16'), ('float16',
                                                                  'float32'), ('float32',
                                                                               'float32'), ('float64', 'float64')]
@@ -3209,6 +3209,8 @@ def test_dot(M, N, K, num_warps, col_a, col_b, epilogue, input_precision, in_dty
     if is_interpreter():
         if in_dtype == 'bfloat16':
             pytest.skip("bfloat16 is not supported in the interpreter")
+        if input_precision == "bf16x3" or input_precision == "bf16x6":
+            pytest.skip(f"input_precision {input_precision} is not supported in the interpreter")
     else:
         if not is_hip() and K < 16:
             pytest.skip("small dots are supported only on HIP at the moment")
@@ -3238,7 +3240,8 @@ def test_dot(M, N, K, num_warps, col_a, col_b, epilogue, input_precision, in_dty
                 pytest.skip(f"{in_dtype} only supported on CDNA4 and gfx12")
             if in_dtype in ("float8e5b16", "float8e4b8") and not is_hip_cdna3():
                 pytest.skip(f"{in_dtype} only supported on CDNA3")
-            if not ((input_precision == "ieee") or (input_precision == "tf32" and is_hip_cdna3())):
+            if not ((input_precision in ("bf16x3", "bf16x6")) or (input_precision == "ieee") or
+                    (input_precision == "tf32" and is_hip_cdna3())):
                 pytest.skip(f"{input_precision} not supported on HIP")
             if kpack == 2 and in_dtype == 'int8' and K < 64:
                 pytest.skip("kpack too large for K")
@@ -3426,7 +3429,12 @@ def kernel(X, stride_xm, stride_xk, Y, stride_yk, stride_yn, W, stride_wn, strid
 
     if in_dtype == 'float32' and input_precision != "ieee":
         if is_tcgen5:
-            assert re.search(r'tcgen05.mma.cta_group::1.kind::tf32', ptx)
+            if input_precision in ("bf16x3", "bf16x6"):
+                assert re.search(r'tcgen05.mma.cta_group::1.kind::f16', ptx)
+            else:
+                assert re.search(r'tcgen05.mma.cta_group::1.kind::tf32', ptx)
+        elif input_precision in ("bf16x3", "bf16x6"):
+            assert re.search(r'[mma|wgmma.mma_async].sync.aligned.m\d+n\d+k16(?:.row.col)?.f32.bf16.bf16', ptx)
         else:
             assert re.search(r'[mma|wgmma.mma_async].sync.aligned.m\d+n\d+k8(?:.row.col)?.f32.tf32.tf32', ptx)
     elif in_dtype == 'float16' and out_dtype == tl.float32:

python/triton/language/semantic.py

Lines changed: 4 additions & 0 deletions
@@ -1467,6 +1467,10 @@ def _str_to_dot_input_precision(self, input_precision):
         input_precision = input_precision.upper()
         if input_precision == "TF32X3":
             input_precision = "TF32x3"
+        if input_precision == "BF16X3":
+            input_precision = "BF16x3"
+        if input_precision == "BF16X6":
+            input_precision = "BF16x6"
         return getattr(ir.INPUT_PRECISION, input_precision)
 
     def dot(self, lhs: TensorTy, rhs: TensorTy, acc: TensorTy, input_precision: Optional[str],
