Skip to content

Commit 77ba5d7

Browse files
authored
[AMD] Enable ds_read_tr for fp4 packed along K (#7481)
This was defensively disabled in a previous commit but has since been verified to work correctly. FP4, when packed along the K dimension, needs to use ds_read_tr8 when it is loaded from shared memory and a transpose is needed. This is because the packing must stay the same, so we operate on the FP4 values as if they were i8 types; this way we don't change the packing order. Note: the LIT test I've added shows the previous behaviour in comparison to the current one. The old code explicitly checked for dot_scaled usage, so I've written the test around that to demonstrate the new behaviour — although the new behaviour no longer needs to look at dot_scaled at all.
1 parent c944014 commit 77ba5d7

File tree

2 files changed

+14
-8
lines changed

2 files changed

+14
-8
lines changed

test/Conversion/amd/ds_transpose.mlir

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -367,4 +367,15 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
367367
tt.return
368368
}
369369

370+
// CHECK-LABEL: ds_transpose_fp4_mfma_32
371+
tt.func @ds_transpose_fp4_mfma_32(%arg0: !ttg.memdesc<128x128xi8, #shared, #smem, mutable>, %arg1: !ttg.memdesc<128x128xi8, #shared1, #smem, mutable>, %arg2: !ttg.memdesc<128x128xf32, #shared1, #smem, mutable>) {
372+
// CHECK-COUNT-32: rocdl.ds.read.tr8.b64 %{{.*}} : <3> -> vector<2xi32>
373+
// CHECK-NOT: rocdl.ds.read.tr8.b64 %{{.*}} : <3> -> vector<2xi32>
374+
%1 = ttg.local_load %arg0 : !ttg.memdesc<128x128xi8, #shared, #smem, mutable> -> tensor<128x128xi8, #ttg.dot_op<{opIdx = 0, parent = #mma32, kWidth = 16}>>
375+
%2 = ttg.local_load %arg1 : !ttg.memdesc<128x128xi8, #shared1, #smem, mutable> -> tensor<128x128xi8, #ttg.dot_op<{opIdx = 1, parent = #mma32, kWidth = 16}>>
376+
%cst_2 = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma32>
377+
%3 = tt.dot_scaled %1, %2, %cst_2 lhs = e2m1 rhs = e2m1 {fastMath = false} : tensor<128x128xi8, #ttg.dot_op<{opIdx = 0, parent = #mma32, kWidth = 16}>> * tensor<128x128xi8, #ttg.dot_op<{opIdx = 1, parent = #mma32, kWidth = 16}>> -> tensor<128x128xf32, #mma32>
378+
ttg.local_store %3, %arg2 : tensor<128x128xf32, #mma32> -> !ttg.memdesc<128x128xf32, #shared1, #smem, mutable>
379+
tt.return
380+
}
370381
}

third_party/amd/lib/TritonAMDGPUToLLVM/MemoryOpToLLVM.cpp

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -213,14 +213,9 @@ struct TransLocalLoadOpConversion
213213
auto bitwidth = typeConverter->convertType(dstTy.getElementType())
214214
.getIntOrFloatBitWidth();
215215

216-
// Triton does not natively support the FP4 type, so it is packed and
217-
// represented as an i8. Currently, the only way to distinguish FP4 from an
218-
// actual int8 is by checking whether the localLoad is used in a scaled dot
219-
// operation, as int8 is never used in one.
220-
bool isFP4 = isUsedByDotScaledOp(localLoad) && bitwidth == 8 &&
221-
dstTy.getElementType().isInteger();
222-
223-
if (isFP4 || (bitwidth != 16 && bitwidth != 8)) {
216+
// FP4 is represented as i8 and, when packed along K, can be
217+
// transposed using ds_read_tr8 which doesn't change packing.
218+
if (bitwidth != 16 && bitwidth != 8) {
224219
return false;
225220
}
226221

0 commit comments

Comments
 (0)