[AMD] Fix packed f32 to fp16 cast for single value (triton-lang#6545)

antiagainst · web-flow · commit c5fed8e1ca66 · 2025-04-20T12:53:48.000-07:00
We need to check whether we have only a
single value before assuming a size-2 vector
and using the packed version.
diff --git a/test/Conversion/amd/fp_to_fp.mlir b/test/Conversion/amd/fp_to_fp.mlir
@@ -1,5 +1,5 @@
-// RUN: triton-opt %s --split-input-file --convert-triton-amdgpu-to-llvm=arch=gfx942 | FileCheck --check-prefix=GFX942 %s
-// RUN: triton-opt %s --split-input-file --convert-triton-amdgpu-to-llvm=arch=gfx950 | FileCheck --check-prefix=GFX950 %s
+// RUN: triton-opt %s --split-input-file --convert-triton-amdgpu-to-llvm=arch=gfx942 | FileCheck --check-prefixes=COMMON,GFX942 %s
+// RUN: triton-opt %s --split-input-file --convert-triton-amdgpu-to-llvm=arch=gfx950 | FileCheck --check-prefixes=COMMON,GFX950 %s
 
 //  CHECK-LABEL: f16_to_f32
 #blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
@@ -32,15 +32,30 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
     // GFX942-COUNT-8: llvm.fptrunc %{{.+}} : f32 to f16
     // GFX950-COUNT-4: llvm.fptrunc %{{.+}} : vector<2xf32> to vector<2xf16>
     %0 = tt.fp_to_fp %arg0, rounding = rtne : tensor<8x8xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>> -> tensor<8x8xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>>
-    // GFX942-COUNT-4: rocdl.cvt.pkrtz
-    // GFX950-COUNT-4: rocdl.cvt.pkrtz
+    // COMMON-COUNT-4: rocdl.cvt.pkrtz
     %1 = tt.fp_to_fp %arg0, rounding = rtz : tensor<8x8xf32, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>> -> tensor<8x8xf16, #ttg.dot_op<{opIdx = 0, parent = #blocked2}>>
     tt.return
   }
 }
 
 // -----
 
+//  CHECK-LABEL: f32_to_f16_single_value
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [2, 2], order = [1, 0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @f32_to_f16_single_value(%arg0: tensor<1x128xf32, #blocked>) {
+    // COMMON: llvm.fptrunc %{{.+}} : f32 to f16
+    // COMMON-NOT: llvm.fptrunc
+    %0 = tt.fp_to_fp %arg0, rounding = rtne : tensor<1x128xf32, #blocked> -> tensor<1x128xf16, #blocked>
+    // COMMON: rocdl.cvt.pkrtz
+    // COMMON-NOT: rocdl.cvt.pkrtz
+    %1 = tt.fp_to_fp %arg0, rounding = rtz : tensor<1x128xf32, #blocked> -> tensor<1x128xf16, #blocked>
+    tt.return
+  }
+}
+
+// -----
+
 //  CHECK-LABEL: downcast_to_f8
 #blocked2 = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} {
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ElementwiseOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ElementwiseOpToLLVM.cpp
@@ -596,17 +596,20 @@ convertFp32ToFp16RTZ(Location loc, ConversionPatternRewriter &rewriter,
 // Fp32->Fp16/Bf16 (RTNE) in GFX950
 static SmallVector<Value>
 convertFp32ToFp16RTNE(Location loc, ConversionPatternRewriter &rewriter,
-                      const SmallVector<Value> &v, Type outElemTy) {
-  assert(v.size() == 2);
+                      ArrayRef<Value> v, Type outElemTy) {
   auto b = TritonLLVMOpBuilder(loc, rewriter);
+  if (v.size() == 1)
+    return {b.fptrunc(outElemTy, v.front())};
+
+  assert(v.size() == 2);
   auto inVecTy = vec_ty(f32_ty, 2);
   auto retVecTy = vec_ty(outElemTy, 2);
   Value inVec = b.undef(inVecTy);
   auto idx0 = b.i32_val(0);
   auto idx1 = b.i32_val(1);
   inVec = b.insert_element(inVecTy, inVec, v[0], idx0);
   inVec = b.insert_element(inVecTy, inVec, v[1], idx1);
-  Value retVec = rewriter.create<LLVM::FPTruncOp>(loc, retVecTy, inVec);
+  Value retVec = b.fptrunc(retVecTy, inVec);
   SmallVector<Value> ret(2);
   ret[0] = b.extract_element(outElemTy, retVec, idx0);
   ret[1] = b.extract_element(outElemTy, retVec, idx1);
@@ -680,6 +683,7 @@ static SmallVector<Value> Fp32_to_F16_RTNE(Location loc,
                                            Type inElemTy, Type outElemTy,
                                            MultipleOperandsRange operands,
                                            AMD::ISAFamily isaFamily) {
+  // For CDNA4 we can potentially use packed v_cvt_pk_[b]f16_f32 instructions.
   if (isaFamily == AMD::ISAFamily::CDNA4) {
     SmallVector<Value> inVals;
     size_t numElem = std::min(size_t(2), operands.size());