[AMD] Enable packed Bf8/Fp8->Bf16 conversions for gfx950 (triton-lang#6291)

yiqian1 · web-flow · commit 3121ad5a32b0 · 2025-03-24T21:41:28.000-07:00
Support Bf8/Fp8->Bf16 conversions with the ROCDL wrappers
`rocdl.cvt.scalef32.pk.bf16.*` on gfx950.
diff --git a/test/Conversion/amd/fp_to_fp.mlir b/test/Conversion/amd/fp_to_fp.mlir
@@ -78,13 +78,19 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
     %0 = tt.fp_to_fp %arg0 : tensor<8x8xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>> -> tensor<8x8xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>>
 
     // CHECK-GFX950-COUNT-4: rocdl.cvt.scalef32.pk.f16.bf8
-    %2 = tt.fp_to_fp %arg0 : tensor<8x8xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>> -> tensor<8x8xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>>
+    %1 = tt.fp_to_fp %arg0 : tensor<8x8xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>> -> tensor<8x8xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>>
+
+    // CHECK-GFX950-COUNT-4: rocdl.cvt.scalef32.pk.bf16.bf8
+    %2 = tt.fp_to_fp %arg0 : tensor<8x8xf8E5M2, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>> -> tensor<8x8xbf16, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>>
 
     // CHECK-GFX950-COUNT-4: rocdl.cvt.scalef32.pk.f32.fp8
     %3 = tt.fp_to_fp %arg1 : tensor<8x8xf8E4M3FN, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>> -> tensor<8x8xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>>
 
     // CHECK-GFX950-COUNT-4: rocdl.cvt.scalef32.pk.f16.fp8
-    %5 = tt.fp_to_fp %arg1 : tensor<8x8xf8E4M3FN, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>> -> tensor<8x8xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>>
+    %4 = tt.fp_to_fp %arg1 : tensor<8x8xf8E4M3FN, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>> -> tensor<8x8xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>>
+
+    // CHECK-GFX950-COUNT-4: rocdl.cvt.scalef32.pk.bf16.fp8
+    %5 = tt.fp_to_fp %arg1 : tensor<8x8xf8E4M3FN, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>> -> tensor<8x8xbf16, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>>
     tt.return
   }
 }
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ElementwiseOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ElementwiseOpToLLVM.cpp
@@ -31,8 +31,10 @@ cvtScalePkUpcastFromFp8(Location loc, ConversionPatternRewriter &rewriter,
   auto b = TritonLLVMOpBuilder(loc, rewriter);
   auto fp8x4VecTy = vec_ty(i8_ty, 4);
   Value fp8x4Vec = b.undef(fp8x4VecTy);
-  fp8x4Vec = b.insert_element(fp8x4VecTy, fp8x4Vec, v0, b.i32_val(0));
-  fp8x4Vec = b.insert_element(fp8x4VecTy, fp8x4Vec, v1, b.i32_val(1));
+  auto idx0 = b.i32_val(0);
+  auto idx1 = b.i32_val(1);
+  fp8x4Vec = b.insert_element(fp8x4VecTy, fp8x4Vec, v0, idx0);
+  fp8x4Vec = b.insert_element(fp8x4VecTy, fp8x4Vec, v1, idx1);
   auto i32v = b.bitcast(fp8x4Vec, i32_ty);
 
   auto resType = i32_ty;
@@ -41,18 +43,22 @@ cvtScalePkUpcastFromFp8(Location loc, ConversionPatternRewriter &rewriter,
                 std::is_same_v<convertOp, ROCDL::CvtScaleF32PkF32Bf8Op>) {
     resType = i64_ty;
     dstType = f32_ty;
-  } else {
+  } else if constexpr (std::is_same_v<convertOp,
+                                      ROCDL::CvtScaleF32PkF16Fp8Op> ||
+                       std::is_same_v<convertOp,
+                                      ROCDL::CvtScaleF32PkF16Bf8Op>) {
     resType = i32_ty;
     dstType = f16_ty;
+  } else {
+    resType = i32_ty;
+    dstType = bf16_ty;
   }
   Value scale = b.f32_val(1);
   Value select = b.false_val();
   auto result = rewriter.create<convertOp>(loc, resType, i32v, scale, select);
   auto retVecTy = vec_ty(dstType, 2);
   auto retVec = b.bitcast(result, retVecTy);
   SmallVector<Value> ret(2);
-  auto idx0 = b.i32_val(0);
-  auto idx1 = b.i32_val(1);
   ret[0] = b.extract_element(dstType, retVec, idx0);
   ret[1] = b.extract_element(dstType, retVec, idx1);
   return ret;
@@ -77,8 +83,10 @@ cvtScalePkDowncastToFp8(Location loc, ConversionPatternRewriter &rewriter,
   } else {
     Type v2F16Ty = vec_ty(v0.getType(), 2);
     Value srcVec = b.undef(v2F16Ty);
-    srcVec = b.insert_element(v2F16Ty, srcVec, v0, b.i32_val(0));
-    srcVec = b.insert_element(v2F16Ty, srcVec, v1, b.i32_val(1));
+    auto idx0 = b.i32_val(0);
+    auto idx1 = b.i32_val(1);
+    srcVec = b.insert_element(v2F16Ty, srcVec, v0, idx0);
+    srcVec = b.insert_element(v2F16Ty, srcVec, v1, idx1);
     result = rewriter.create<convertOp>(loc, v2I16Ty, v2I16Vec, srcVec, scale,
                                         select);
   }
@@ -698,9 +706,10 @@ ConverterT Fp8E5M2FNUZ_to_Fp16(AMD::ISAFamily isaFamily) {
                                             : Fp8E5M2FNUZ_to_Fp16_SW;
 }
 
-static SmallVector<Value> Fp8E5M2_to_Bf16(Location loc,
-                                          ConversionPatternRewriter &rewriter,
-                                          const SmallVector<Value> &v) {
+// OCP Bf8 -> Bf16
+static SmallVector<Value>
+Fp8E5M2_to_Bf16_SW(Location loc, ConversionPatternRewriter &rewriter,
+                   const SmallVector<Value> &v) {
   auto b = TritonLLVMOpBuilder(loc, rewriter);
   auto fp8x4VecTy = vec_ty(i8_ty, 4);
   Value a0 = b.undef(fp8x4VecTy);
@@ -761,6 +770,19 @@ static SmallVector<Value> Fp8E5M2_to_Bf16(Location loc,
           b.extract_element(bf16_ty, out1, b.i32_val(1))};
 }
 
+static SmallVector<Value>
+Fp8E5M2_to_Bf16_HW(Location loc, ConversionPatternRewriter &rewriter,
+                   const SmallVector<Value> &v) {
+  assert(v.size() == 2);
+  return cvtScalePkUpcastFromFp8<ROCDL::CvtScaleF32PkBf16Bf8Op>(loc, rewriter,
+                                                                v[0], v[1]);
+}
+
+ConverterT Fp8E5M2_to_Bf16(AMD::ISAFamily isaFamily) {
+  return isaFamily == AMD::ISAFamily::CDNA4 ? Fp8E5M2_to_Bf16_HW
+                                            : Fp8E5M2_to_Bf16_SW;
+}
+
 // Bf16 -> OCP Bf8
 static SmallVector<Value>
 Bf16_to_Fp8E5M2_SW(Location loc, ConversionPatternRewriter &rewriter,
@@ -869,9 +891,9 @@ static SmallVector<Value> Bf16_to_Fp8E4M3FN(Location loc,
 }
 
 // fp8e4m3fn to bf16
-static SmallVector<Value> Fp8E4M3FN_to_Bf16(Location loc,
-                                            ConversionPatternRewriter &rewriter,
-                                            const SmallVector<Value> &v) {
+static SmallVector<Value>
+Fp8E4M3FN_to_Bf16_SW(Location loc, ConversionPatternRewriter &rewriter,
+                     const SmallVector<Value> &v) {
   auto b = TritonLLVMOpBuilder(loc, rewriter);
   auto fp8x4VecTy = vec_ty(i8_ty, 4);
   Value a0 = b.undef(fp8x4VecTy);
@@ -904,6 +926,19 @@ static SmallVector<Value> Fp8E4M3FN_to_Bf16(Location loc,
           b.extract_element(bf16_ty, out0, b.i32_val(1))};
 }
 
+static SmallVector<Value>
+Fp8E4M3FN_to_Bf16_HW(Location loc, ConversionPatternRewriter &rewriter,
+                     const SmallVector<Value> &v) {
+  assert(v.size() == 2);
+  return cvtScalePkUpcastFromFp8<ROCDL::CvtScaleF32PkBf16Fp8Op>(loc, rewriter,
+                                                                v[0], v[1]);
+}
+
+ConverterT Fp8E4M3FN_to_Bf16(AMD::ISAFamily isaFamily) {
+  return isaFamily == AMD::ISAFamily::CDNA4 ? Fp8E4M3FN_to_Bf16_HW
+                                            : Fp8E4M3FN_to_Bf16_SW;
+}
+
 // fp8e4m3fnuz to bf16
 static SmallVector<Value>
 Fp8E4M3FNUZ_to_Bf16(Location loc, ConversionPatternRewriter &rewriter,
@@ -1130,9 +1165,10 @@ struct FpToFpOpConversion
              Fp16_to_Fp8E5M2_RTNE(isaFamily)},
             {{F16TyID, F8E5M2TyID, RoundingMode::RTZ}, Fp16_to_Fp8E5M2_RTZ},
             // F8 -> BF16
-            {{F8E5M2TyID, BF16TyID, undefRounding}, Fp8E5M2_to_Bf16},
+            {{F8E5M2TyID, BF16TyID, undefRounding}, Fp8E5M2_to_Bf16(isaFamily)},
             {{F8E5M2FNUZTyID, BF16TyID, undefRounding}, Fp8E5M2FNUZ_to_Bf16},
-            {{F8E4M3FNTyID, BF16TyID, undefRounding}, Fp8E4M3FN_to_Bf16},
+            {{F8E4M3FNTyID, BF16TyID, undefRounding},
+             Fp8E4M3FN_to_Bf16(isaFamily)},
             {{F8E4M3FNUZTyID, BF16TyID, undefRounding}, Fp8E4M3FNUZ_to_Bf16},
             // BF16 -> F8
             {{BF16TyID, F8E5M2TyID, RoundingMode::RTNE},
@@ -1197,16 +1233,11 @@ struct FpToFpOpConversion
     }
 
     // numElements = 4 for conversions:
-    // ocp bf8->bf16, or
-    // ocp bf8->fp32/fp16 on non-CDNA4, or
+    // ocp bf8->fp32/fp16/bf16 on non-CDNA4, or
     // fp32/bf16/fp16->ocp bf8 on non-CDNA4
     // fp32/bf16/fp16->ocp bf8 (RTZ) on CDNA4
     size_t numElements = 2;
     if ((llvm::isa<Float8E5M2Type>(srcElementType) &&
-         llvm::isa<BFloat16Type>(dstElementType)) ||
-        (llvm::isa<Float8E5M2Type>(srcElementType) &&
-         (llvm::isa<Float16Type>(dstElementType) ||
-          llvm::isa<Float32Type>(dstElementType)) &&
          isaFamily != AMD::ISAFamily::CDNA4) ||
         (llvm::isa<Float8E5M2Type>(dstElementType) &&
          isaFamily != AMD::ISAFamily::CDNA4) ||