
Commit 1c28e08

[AMD] Use more efficient fp32 to bf16 type conversion (triton-lang#5633)
This PR uses a more efficient approach for the fp32 to bf16 type conversion in the HIP backend. According to a simple unit test, the number of VGPRs used decreases from 18 to 10.
1 parent 0ecb172 commit 1c28e08
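
For context, the new RTNE lowering works on the raw fp32 bits: it adds 0x7FFF plus the parity bit of what will become the bf16 LSB (round to nearest, ties to even), and special-cases NaN with an unordered float compare. Below is a minimal scalar sketch of that bit manipulation in C++, written for this edit rather than taken from the commit; the helper name fp32_to_bf16_rtne is made up, and each step is annotated with the GCN instruction it models.

#include <cstdint>
#include <cstring>

uint16_t fp32_to_bf16_rtne(float f) {
  uint32_t u;
  std::memcpy(&u, &f, sizeof(u));                // bitcast(v, i32_ty)
  bool is_nan = f != f;                          // v_cmp_u_f32 f, f (unordered)
  uint32_t lsb = (u >> 16) & 1u;                 // v_bfe_u32 u, 16, 1
  uint32_t rounded = u + lsb + 0x7FFFu;          // v_add3_u32 u, lsb, 0x7FFF
  uint32_t res = is_nan ? 0x7FFF0000u : rounded; // v_cndmask_b32
  return static_cast<uint16_t>(res >> 16);       // lshr + trunc
}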

File tree

2 files changed: +72 -22 lines changed
Lines changed: 16 additions & 0 deletions
@@ -0,0 +1,16 @@
+// RUN: triton-opt %s --convert-triton-amdgpu-to-llvm='arch=gfx942' | FileCheck %s
+
+// CHECK-LABEL: llvm.func @fp32_to_bf16
+// CHECK: llvm.inline_asm {{.*}} "v_cmp_u_f32 $0, $1, $2", "=s,v,v"
+// CHECK: llvm.inline_asm {{.*}} "v_bfe_u32 $0, $1, $2, $3", "=v,v,v,v"
+// CHECK: llvm.inline_asm {{.*}} "v_add3_u32 $0, $1, $2, $3", "=v,v,v,v"
+// CHECK: llvm.inline_asm {{.*}} "v_cndmask_b32 $0, $1, $2, $3", "=v,v,v,s"
+
+#blocked = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [64], warpsPerCTA = [4], order = [0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @fp32_to_bf16(
+      %arg: tensor<256xf32, #blocked>) {
+    %8 = arith.truncf %arg : tensor<256xf32, #blocked> to tensor<256xbf16, #blocked>
+    tt.return
+  }
+}

third_party/amd/lib/TritonAMDGPUToLLVM/ElementwiseOpToLLVM.cpp

Lines changed: 56 additions & 22 deletions
@@ -433,36 +433,70 @@ static Value convertBf16ToFp32(Location loc,
   return bitcast(shifted, f32_ty);
 }
 
+static Value buildGCNInstruction(Location loc, RewriterBase &rewriter,
+                                 StringRef instrName,
+                                 ArrayRef<StringRef> constraints,
+                                 ArrayRef<Value> vals, Type retType) {
+  assert(constraints.size() == vals.size() + 1);
+  assert(vals.size() == 2 || vals.size() == 3);
+  GCNBuilder builder;
+  GCNInstr &instr = *builder.create(instrName.str());
+  GCNBuilder::Operand *out = builder.newOperand(constraints[0]);
+  SmallVector<GCNBuilder::Operand *> operands;
+  for (int i = 0; i < vals.size(); ++i) {
+    operands.push_back(builder.newOperand(vals[i], constraints[i + 1]));
+  }
+
+  if (vals.size() == 2) {
+    instr(out, operands[0], operands[1]);
+  } else {
+    instr(out, operands[0], operands[1], operands[2]);
+  }
+
+  return builder.launch(rewriter, loc, retType, false);
+}
+
 static Value convertFp32ToBf16(Location loc,
                                ConversionPatternRewriter &rewriter,
                                const Value &v, const RoundingMode rounding) {
+  auto as_int32 = bitcast(v, i32_ty);
   if (rounding == RoundingMode::RTZ) {
-    auto as_int32 = bitcast(v, i32_ty);
     auto shifted = lshr(i32_ty, as_int32, i32_val(16));
     auto truncated = trunc(i16_ty, shifted);
     return bitcast(truncated, bf16_ty);
   }
-  // Otherwise it is (rounding == RoundingMode::RTNE)
-  auto as_uint32 = bitcast(v, i32_ty);
-  auto check_exponent =
-      and_(i32_ty, xor_(i32_ty, as_uint32, i32_val(0xffffffff)),
-           i32_val(0x7f800000));
-  auto exponent_not_all1s = icmp_ne(check_exponent, i32_val(0));
-  auto exponent_all1s = icmp_eq(check_exponent, i32_val(0));
-  auto rounded =
-      add(i32_ty, i32_val(0x7fff),
-          and_(i32_ty, lshr(i32_ty, as_uint32, i32_val(16)), i32_val(1)));
-  rounded = add(i32_ty, rounded, as_uint32);
-  auto res = select(exponent_not_all1s, rounded, as_uint32);
-
-  auto preserve_nan =
-      and_(i1_ty, exponent_all1s,
-           icmp_ne(and_(i32_ty, as_uint32, i32_val(0xffff)), i32_val(0)));
-  auto nan = or_(i32_ty, as_uint32, i32_val(0x10000));
-  res = select(preserve_nan, nan, res);
-
-  auto shifted = lshr(i32_ty, res, i32_val(16));
-  auto truncated = trunc(i16_ty, shifted);
+
+  // This implementation is a faster version of the fp32 to bf16 type
+  // conversion. It is from CK:
+  // https://github.com/cgmillette/composable_kernel/commit/24e75bef6aa5
+  // It uses fewer VGPRs and fewer instructions than the previous
+  // implementation.
+  SmallVector<StringRef> constraints0 = {"=s", "v", "v"};
+  SmallVector<Value> vals0 = {v, v};
+  Value isNan = buildGCNInstruction(loc, rewriter, "v_cmp_u_f32", constraints0,
+                                    vals0, i64_ty);
+
+  Value v16 = i32_val(16);
+  Value v1 = i32_val(1);
+  SmallVector<StringRef> constraints1 = {"=v", "v", "v", "v"};
+  SmallVector<Value> vals1 = {v, v16, v1};
+  Value tmp = buildGCNInstruction(loc, rewriter, "v_bfe_u32", constraints1,
+                                  vals1, i32_ty);
+
+  SmallVector<StringRef> constraints2 = {"=v", "v", "v", "v"};
+  Value v7FFF = i32_val(0x7FFF);
+  SmallVector<Value> vals2 = {v, tmp, v7FFF};
+  Value tmp1 = buildGCNInstruction(loc, rewriter, "v_add3_u32", constraints2,
+                                   vals2, i32_ty);
+
+  SmallVector<StringRef> constraints3 = {"=v", "v", "v", "s"};
+  Value vNan = i32_val(0x7FFF0000);
+  SmallVector<Value> vals3 = {tmp1, vNan, isNan};
+  Value cndMask = buildGCNInstruction(loc, rewriter, "v_cndmask_b32",
+                                      constraints3, vals3, i32_ty);
+
+  Value shifted = lshr(i32_ty, cndMask, v16);
+  Value truncated = trunc(i16_ty, shifted);
   return bitcast(truncated, bf16_ty);
 }
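
As a sanity check on the rewrite, the removed LLVM-IR sequence and the new inline-asm sequence can be compared with scalar C++ models. The sketch below is written for this edit and ships nowhere in the commit; names like old_model and new_model are invented. The two paths agree on every ordered input; on NaN they differ only in payload, since the old path preserves payload bits while the new one canonicalizes to 0x7FFF, so the check merely requires that both still produce a bf16 NaN.

#include <cassert>
#include <cstdint>
#include <cstring>

// Model of the removed LLVM-IR sequence (exponent check + selects).
static uint16_t old_model(uint32_t u) {
  uint32_t check_exponent = (u ^ 0xFFFFFFFFu) & 0x7F800000u; // 0 iff exponent all 1s
  uint32_t rounded = u + 0x7FFFu + ((u >> 16) & 1u);
  uint32_t res = (check_exponent != 0) ? rounded : u;
  if (check_exponent == 0 && (u & 0xFFFFu) != 0)
    res = u | 0x10000u; // keep NaN-ness that lives only in the low 16 bits
  return static_cast<uint16_t>(res >> 16);
}

// Model of the new v_cmp_u_f32 / v_bfe_u32 / v_add3_u32 / v_cndmask_b32 chain.
static uint16_t new_model(uint32_t u) {
  float f;
  std::memcpy(&f, &u, sizeof(f));
  uint32_t rounded = u + ((u >> 16) & 1u) + 0x7FFFu;
  return static_cast<uint16_t>(((f != f) ? 0x7FFF0000u : rounded) >> 16);
}

static bool is_bf16_nan(uint16_t r) {
  return (r & 0x7F80u) == 0x7F80u && (r & 0x7Fu) != 0;
}

int main() {
  // Stride through the fp32 bit space; a full 2^32 sweep also passes but
  // takes longer.
  for (uint64_t bits = 0; bits <= 0xFFFFFFFFull; bits += 9973) {
    uint32_t u = static_cast<uint32_t>(bits);
    float f;
    std::memcpy(&f, &u, sizeof(f));
    if (f != f)
      assert(is_bf16_nan(old_model(u)) && is_bf16_nan(new_model(u)));
    else
      assert(old_model(u) == new_model(u));
  }
  return 0;
}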

0 commit comments