[AMD] Use LLVM ops for fp16<->fp32 casts (triton-lang#5859)

antiagainst · web-flow · commit de0f7543b0b1 · 2025-02-07T20:33:46.000-06:00
Inline assembly can block optimizations in the LLVM backend.
diff --git a/test/Conversion/amd/fp_to_fp.mlir b/test/Conversion/amd/fp_to_fp.mlir
@@ -4,7 +4,7 @@
 #blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
   tt.func @f16_to_f32(%arg0: tensor<8x8xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>) {
-    // CHECK-COUNT-8: llvm.inline_asm asm_dialect {{.*}}v_cvt_f32_f16 {{.*}}: (f16) -> f32
+    // CHECK-COUNT-8: llvm.fpext %{{.+}} : f16 to f32
     %0 = tt.fp_to_fp %arg0 : tensor<8x8xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked}>> -> tensor<8x8xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked}>>
     tt.return
   }
@@ -21,3 +21,18 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
     tt.return
   }
 }
+
+// -----
+
+//  CHECK-LABEL: f32_to_f16
+#blocked2 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
+  tt.func @f32_to_f16(%arg0: tensor<8x8xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>>) {
+    // CHECK-COUNT-8: llvm.intr.experimental.constrained.fptrunc %{{.+}} tonearest ignore : f32 to f16
+    %0 = tt.fp_to_fp %arg0, rounding = rtne : tensor<8x8xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>> -> tensor<8x8xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>>
+    // CHECK-COUNT-8: llvm.inline_asm asm_dialect {{.*}}s_setreg_imm32_b32{{.+}}v_cvt_f16_f32{{.+}}s_setreg_imm32_b32{{.+}} : (f32) -> f16
+
+    %1 = tt.fp_to_fp %arg0, rounding = rtz : tensor<8x8xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>> -> tensor<8x8xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>>
+    tt.return
+  }
+}
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ElementwiseOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ElementwiseOpToLLVM.cpp
@@ -5,6 +5,7 @@
 #include "triton/Analysis/Allocation.h"
 #include "triton/Conversion/TritonGPUToLLVM/ElementwiseOpToLLVMBase.h"
 #include "triton/Conversion/TritonGPUToLLVM/PatternTritonGPUOpToLLVM.h"
+#include "triton/Conversion/TritonGPUToLLVM/Utility.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
 
 using namespace mlir;
@@ -173,12 +174,8 @@ Fp16_to_Fp8E4M3FN_RTNE(Location loc, ConversionPatternRewriter &rewriter,
 
 static Value cvtFp16ToFp32(Location loc, ConversionPatternRewriter &rewriter,
                            const Value &v) {
-  GCNBuilder builder;
-  auto &cvt = *builder.create("v_cvt_f32_f16");
-  auto res = builder.newOperand("=v");
-  auto operand = builder.newOperand(v, "v");
-  cvt(res, operand);
-  return builder.launch(rewriter, loc, f32_ty, false);
+  TritonLLVMOpBuilder b(loc, rewriter);
+  return b.fpext(f32_ty, v);
 }
 
 // convert fp8 to fp32
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp
@@ -532,20 +532,25 @@ int32_t getCtrlBitsForCacheModifierOnTarget(
 
 Value cvtFp32ToFp16(Location loc, RewriterBase &rewriter, const Value &v,
                     triton::RoundingMode rounding) {
+  if (rounding == triton::RoundingMode::RTNE) {
+    LLVM::RoundingMode rm = LLVM::RoundingMode::NearestTiesToEven;
+    return rewriter.create<LLVM::ConstrainedFPTruncIntr>(
+        loc, f16_ty, v, rm, LLVM::FPExceptionBehavior::Ignore);
+  }
+
+  // TODO: Investigate the test failure seen with RTZ
+  // LLVM::ConstrainedFPTruncIntr, then drop the inline assembly here as well.
+  assert(rounding == triton::RoundingMode::RTZ);
   GCNBuilder builder;
 
   auto &cvt = *builder.create("v_cvt_f16_f32");
   auto res = builder.newOperand("=v");
   auto operand = builder.newOperand(v, "v");
-  if (rounding == triton::RoundingMode::RTZ) {
-    auto &setRTZ = *builder.create("s_setreg_imm32_b32 0x1801, 0xc");
-    setRTZ();
-  }
+  auto &setRTZ = *builder.create("s_setreg_imm32_b32 0x1801, 0xc");
+  setRTZ();
   cvt(res, operand);
-  if (rounding == triton::RoundingMode::RTZ) {
-    auto &resetRTZ = *builder.create("s_setreg_imm32_b32 0x1801, 0x0");
-    resetRTZ();
-  }
+  auto &resetRTZ = *builder.create("s_setreg_imm32_b32 0x1801, 0x0");
+  resetRTZ();
   return builder.launch(rewriter, loc, f16_ty, false);
 }