Skip to content

Commit bde92ef

Browse files
authored
[AMD] Improve math.fdiv FTZ lowering for f32 inputs (triton-lang#5474)
This commit lowers math.fdiv to a truly approximated div operation, which helps to save register usage and improve performance.
1 parent 3b4d632 commit bde92ef

File tree

2 files changed

+96
-2
lines changed

2 files changed

+96
-2
lines changed

test/Conversion/amd/fdivide.mlir

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
// RUN: triton-opt %s -split-input-file --convert-triton-amdgpu-to-llvm="arch=gfx942" | FileCheck %s
2+
3+
// Verify that an f32 divide is lowered to the fast AMDGPU div sequence
// (div.scale -> rcp -> fmul -> div.fmas -> div.fixup) with no plain llvm.fdiv.
#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
  tt.func public @test_fdiv_f32(%arg0: tensor<64xf32, #blocked>, %arg1: tensor<64xf32, #blocked>) attributes {noinline = false} {
    // CHECK-LABEL: test_fdiv_f32
    // CHECK: llvm.amdgcn.div.scale.f32
    // CHECK: llvm.amdgcn.div.scale.f32
    // CHECK: llvm.amdgcn.rcp.f32
    // CHECK: llvm.fmul
    // CHECK: llvm.amdgcn.div.fmas.f32
    // CHECK: llvm.amdgcn.div.fixup.f32
    // CHECK-NOT: llvm.fdiv
    %0 = arith.divf %arg0, %arg1 : tensor<64xf32, #blocked>
    tt.return
  }
}
// -----
// Verify that a non-f32 (f64) divide keeps the IEEE-compliant llvm.fdiv lowering.
#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
  tt.func public @test_fdiv_f64(%arg0: tensor<64xf64, #blocked>, %arg1: tensor<64xf64, #blocked>) attributes {noinline = false} {
    // CHECK-LABEL: test_fdiv_f64
    // CHECK: llvm.fdiv
    %0 = arith.divf %arg0, %arg1 : tensor<64xf64, #blocked>
    tt.return
  }
}
// -----
// Verify that the precise (round-to-nearest) divide is NOT replaced by the
// approximate sequence and still lowers to llvm.fdiv.
#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [1], order = [0]}>
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
  tt.func public @test_div_rn(%arg0: tensor<64xf32, #blocked>, %arg1: tensor<64xf32, #blocked>) attributes {noinline = false} {
    // CHECK-LABEL: test_div_rn
    // CHECK: llvm.fdiv
    %0 = tt.precise_divf %arg0, %arg1 : tensor<64xf32, #blocked>
    tt.return
  }
}

third_party/amd/lib/TritonAMDGPUToLLVM/ElementwiseOpToLLVM.cpp

Lines changed: 55 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1081,9 +1081,62 @@ struct FDivOpConversion
10811081
ConversionPatternRewriter &rewriter,
10821082
Type elemTy, MultipleOperandsRange operands,
10831083
Location loc) const {
1084+
// For non-F32 inputs, the op is lowered to LLVM::FDivOp, which is an
// IEEE-compliant DIV operation.
1086+
if (elemTy.getIntOrFloatBitWidth() != 32)
1087+
return {rewriter.create<LLVM::FDivOp>(loc, elemTy, operands[0][0],
1088+
operands[0][1])};
1089+
1090+
auto b = TritonLLVMOpBuilder(loc, rewriter);
10841091

1085-
return {rewriter.create<LLVM::FDivOp>(loc, elemTy, operands[0][0],
1086-
operands[0][1])};
1092+
// The algorithm comes from
1093+
// https://github.com/llvm/llvm-project/blob/bda7aadf/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp#L4980-L5065
1094+
// with the Newton-Raphson refinement removed, to perform a faster,
1095+
// approximated DIV operation, aligning with the `div.full.f32` instruction
1096+
// on the NV backend.
1097+
Value &lhs = operands[0][0];
1098+
Value &rhs = operands[0][1];
1099+
MLIRContext *ctx = rewriter.getContext();
1100+
Type divScaleResType = struct_ty({elemTy, i1_ty});
1101+
1102+
// The `llvm.amdgcn.div.scale.f32` instruction's signature is
1103+
// (src0, src1, src2) -> (ret0, ret1), where
1104+
//
1105+
// src0: The numerator or lhs of FDivOp.
1106+
// src1: The denominator or rhs of FDivOp.
1107+
// src2: A boolean indicating which operand to scale. If true, lhs is
1108+
// scaled; Otherwise, rhs is scaled.
1109+
//
1110+
// ret0: The scaled operand.
1111+
// ret1: The VCC register indicating whether post-scaling is required.
1112+
auto denominatorScaleOp = LLVM::createLLVMIntrinsicCallOp(
1113+
rewriter, loc, "llvm.amdgcn.div.scale.f32", divScaleResType,
1114+
{lhs, rhs, b.false_val()});
1115+
Value denominatorScaled = b.extract_val(denominatorScaleOp.getResult(0), 0);
1116+
auto numeratorScaleOp = LLVM::createLLVMIntrinsicCallOp(
1117+
rewriter, loc, "llvm.amdgcn.div.scale.f32", divScaleResType,
1118+
{lhs, rhs, b.true_val()});
1119+
Value numeratorScaled = b.extract_val(numeratorScaleOp.getResult(0), 0);
1120+
Value vcc = b.extract_val(numeratorScaleOp.getResult(0), 1);
1121+
1122+
Value rcp =
1123+
LLVM::createLLVMIntrinsicCallOp(rewriter, loc, "llvm.amdgcn.rcp.f32",
1124+
elemTy, {denominatorScaled})
1125+
.getResult(0);
1126+
1127+
Value approxDiv = b.fmul(numeratorScaled, rcp);
1128+
1129+
// Since the Newton-Raphson refinement is skipped, we pass 0 instead of
// the refined approximations as the inputs.
1131+
auto fmas = LLVM::createLLVMIntrinsicCallOp(
1132+
rewriter, loc, "llvm.amdgcn.div.fmas.f32", elemTy,
1133+
{b.f32_val(0), b.f32_val(0), approxDiv, vcc})
1134+
.getResult(0);
1135+
1136+
return {LLVM::createLLVMIntrinsicCallOp(rewriter, loc,
1137+
"llvm.amdgcn.div.fixup.f32", elemTy,
1138+
{fmas, rhs, lhs})
1139+
.getResult(0)};
10871140
}
10881141
};
10891142

0 commit comments

Comments
 (0)