
Commit 152ef2d

[AMD] Enable shared->MFMA dot operand conversion through LinearLayout (#4983)
This PR:
- Introduces a fallback from the normal TTG->LLVM converter for cases where it does not support a given local_load.
- Enables conversion of the MFMA dot operand layout to Linear Layout in the local_load pattern.
1 parent 258a5bc commit 152ef2d
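
The fallback works through the standard pattern-application flow: when the AMD-specific local_load lowering cannot handle an op, it returns failure(), and the dialect-conversion driver tries the remaining applicable patterns, so the LinearLayout-based lowering in MemoryOpToLLVM.cpp can pick the op up. Below is a minimal, self-contained C++ model of that priority-ordered fallback (illustrative only; the struct, the converter list, and the simplified signatures are stand-ins, not the Triton or MLIR API):

#include <functional>
#include <iostream>
#include <optional>
#include <string>
#include <vector>

// Stand-in for the op being lowered; not a Triton/MLIR type.
struct LocalLoadSketch {
  bool fitsOneMfmaInstr;
};

// Each converter returns a lowered result, or std::nullopt to decline so the
// next one is tried (modeling a pattern that returns failure()).
using Converter =
    std::function<std::optional<std::string>(const LocalLoadSketch &)>;

int main() {
  // Priority-ordered converters: AMD-specific lowering first, then the
  // generic LinearLayout-based lowering as the fallback.
  std::vector<Converter> converters = {
      [](const LocalLoadSketch &op) -> std::optional<std::string> {
        if (!op.fitsOneMfmaInstr)
          return std::nullopt; // decline: tensor smaller than one MFMA instr
        return "AMD-specific shared->dot-operand lowering";
      },
      [](const LocalLoadSketch &) -> std::optional<std::string> {
        return "generic LinearLayout lowering"; // always-applicable fallback
      },
  };

  LocalLoadSketch smallTensorLoad{/*fitsOneMfmaInstr=*/false};
  for (const auto &convert : converters) {
    if (auto lowered = convert(smallTensorLoad)) {
      std::cout << *lowered << "\n"; // prints the fallback lowering
      break;
    }
  }
}

Running the model with a load that does not fit one MFMA instruction prints the fallback result, mirroring how the small-tensor cases in this commit end up in the LinearLayout path.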

File tree

6 files changed: +129 −44 lines changed


lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp

Lines changed: 44 additions & 36 deletions
@@ -109,27 +109,30 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern<LocalLoadOp> {
       : ConvertOpToLLVMPattern(typeConverter, benefit), targetInfo(targetInfo) {
   }

+  // FIXME [Dot LL]
+  // Do for all DotOperandEncodingAttr once we have LLs for all of them
+  static bool isSupportedDotOpLayout(Attribute layout) {
+    if (auto dot = dyn_cast<DotOperandEncodingAttr>(layout)) {
+      if (auto mma = dyn_cast<NvidiaMmaEncodingAttr>(dot.getParent())) {
+        return mma.isAmpere() && dot.getKWidth() == 8;
+      }
+      if (isa<AMDMfmaEncodingAttr>(dot.getParent()))
+        return true;
+    }
+    return false;
+  };
+
   LogicalResult
   matchAndRewrite(LocalLoadOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     MemDescType srcTy = op.getSrc().getType();
     RankedTensorType dstTy = op.getType();
     Attribute srcLayout = srcTy.getEncoding();
     Attribute dstLayout = dstTy.getEncoding();
-    // FIXME [Dot LL]
-    // Do for all DotOperandEncodingAttr once we have LLs for all of them
-    auto isAmpereLargeKWidth = [](Attribute layout) {
-      if (auto dot = dyn_cast<DotOperandEncodingAttr>(layout)) {
-        if (auto mma = dyn_cast<NvidiaMmaEncodingAttr>(dot.getParent())) {
-          return mma.isAmpere() && dot.getKWidth() == 8;
-        }
-      }
-      return false;
-    };
     if (isa<SharedEncodingAttr>(srcLayout) &&
         (isa<BlockedEncodingAttr, MmaEncodingTrait, SliceEncodingAttr>(
             dstLayout) ||
-         isAmpereLargeKWidth(dstLayout))) {
+         isSupportedDotOpLayout(dstLayout))) {
       return lowerSharedToDistributed(op, adaptor, getTypeConverter(),
                                       rewriter);
     }
@@ -167,10 +170,10 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern<LocalLoadOp> {
     auto srcTy = op.getSrc().getType();
     auto dstTy = op.getResult().getType();
     auto dstShape = dstTy.getShape();
-    assert(dstShape.size() <= 2 &&
-           "Unexpected rank of ConvertLayout(shared->blocked)");
     auto srcSharedLayout = cast<SharedEncodingAttr>(srcTy.getEncoding());
     auto dstLayout = dstTy.getEncoding();
+    assert((dstShape.size() <= 2 || isSupportedDotOpLayout(dstLayout)) &&
+           "Unexpected rank of ConvertLayout(shared->distributed)");
     auto inOrd = getOrder(srcSharedLayout);

     auto smemObj = LLVM::getSharedMemoryObjectFromStruct(
@@ -184,31 +187,36 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern<LocalLoadOp> {
     // FIXME [Dot LL]
     // Ampere case
     // In this case, we need to pack the outputs into i32
-    if (isa<DotOperandEncodingAttr>(dstTy.getEncoding())) {
-      if (elemLlvmTy.isInteger(8)) {
-        auto concat = [&](Value a1, Value a2, Value a3, Value a4) {
-          return or_(or_(zext(i32_ty, a1), shl(zext(i32_ty, a2), i32_val(8))),
-                     or_(shl(zext(i32_ty, a3), i32_val(16)),
-                         shl(zext(i32_ty, a4), i32_val(24))));
-        };
-        SmallVector<Value> outVals32(outVals.size() / 4);
-        for (int i = 0; i < outVals32.size(); ++i) {
-          outVals32[i] = concat(outVals[4 * i], outVals[4 * i + 1],
-                                outVals[4 * i + 2], outVals[4 * i + 3]);
-        }
-        outVals = outVals32;
-      } else {
-        assert(elemLlvmTy.isBF16() && "Unexpected element type");
-        auto concat = [&](Value a, Value b) {
-          return or_(zext(i32_ty, bitcast(a, i16_ty)),
-                     shl(zext(i32_ty, bitcast(b, i16_ty)), i32_val(16)));
-        };
+    if (auto dotOp = dyn_cast<DotOperandEncodingAttr>(dstTy.getEncoding())) {
+      if (auto parent = dyn_cast<NvidiaMmaEncodingAttr>(dotOp.getParent())) {
+        if (parent.isAmpere()) {
+          if (elemLlvmTy.isInteger(8)) {
+            auto concat = [&](Value a1, Value a2, Value a3, Value a4) {
+              return or_(
+                  or_(zext(i32_ty, a1), shl(zext(i32_ty, a2), i32_val(8))),
+                  or_(shl(zext(i32_ty, a3), i32_val(16)),
+                      shl(zext(i32_ty, a4), i32_val(24))));
+            };
+            SmallVector<Value> outVals32(outVals.size() / 4);
+            for (int i = 0; i < outVals32.size(); ++i) {
+              outVals32[i] = concat(outVals[4 * i], outVals[4 * i + 1],
+                                    outVals[4 * i + 2], outVals[4 * i + 3]);
+            }
+            outVals = outVals32;
+          } else {
+            assert(elemLlvmTy.isBF16() && "Unexpected element type");
+            auto concat = [&](Value a, Value b) {
+              return or_(zext(i32_ty, bitcast(a, i16_ty)),
+                         shl(zext(i32_ty, bitcast(b, i16_ty)), i32_val(16)));
            };

-        SmallVector<Value> outVals32(outVals.size() / 2);
-        for (int i = 0; i < outVals32.size(); ++i) {
-          outVals32[i] = concat(outVals[2 * i], outVals[2 * i + 1]);
+            SmallVector<Value> outVals32(outVals.size() / 2);
+            for (int i = 0; i < outVals32.size(); ++i) {
+              outVals32[i] = concat(outVals[2 * i], outVals[2 * i + 1]);
+            }
+            outVals = outVals32;
+          }
         }
-        outVals = outVals32;
       }
     }
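
The concat lambdas above build one i32 from four i8 values (or from two bf16 values bitcast to i16). The same bit layout in plain, runnable C++, kept separate from the IR-builder calls (a sketch of the arithmetic only; the helper names are illustrative):

#include <cstdint>
#include <cstdio>

// Pack four 8-bit values into one 32-bit word, least-significant byte first,
// mirroring the i8 concat lambda: a1 | a2 << 8 | a3 << 16 | a4 << 24.
uint32_t concatI8(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4) {
  return uint32_t(a1) | (uint32_t(a2) << 8) | (uint32_t(a3) << 16) |
         (uint32_t(a4) << 24);
}

// Pack two 16-bit patterns (e.g. bf16 values bitcast to i16) into one 32-bit
// word, mirroring the bf16 concat lambda: a | b << 16.
uint32_t concatBf16Bits(uint16_t a, uint16_t b) {
  return uint32_t(a) | (uint32_t(b) << 16);
}

int main() {
  std::printf("%08x\n", (unsigned)concatI8(0x11, 0x22, 0x33, 0x44)); // 44332211
  std::printf("%08x\n", (unsigned)concatBf16Bits(0x3f80, 0x4000));   // 40003f80
}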

Lines changed: 8 additions & 7 deletions
@@ -1,18 +1,19 @@
-// RUN: triton-opt %s --split-input-file --convert-triton-amdgpu-to-llvm=arch=gfx942 | FileCheck %s
+// RUN: triton-opt %s --split-input-file --convert-triton-amdgpu-to-llvm=arch=gfx942 --mlir-print-debuginfo --mlir-pretty-debuginfo| FileCheck %s

 #blocked = #triton_gpu.blocked<{sizePerThread = [4, 1], threadsPerWarp = [2, 16], warpsPerCTA = [4, 1], order = [1, 0]}>
 #mma = #triton_gpu.amd_mfma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [2, 4], instrShape = [16, 16], isTransposed = false}>
 #shared = #triton_gpu.shared<{vec = 16, perPhase = 4, maxPhase = 1, order = [1, 0], hasLeadingOffset = false}>
 module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, triton_gpu.shared = 544 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
   // CHECK-LABEL: @local_load_offset
   tt.func @local_load_offset(%arg0: tensor<16x16xf16, #mma>) {
-    %0 = triton_gpu.convert_layout %arg0 {allocation.offset = 0 : i32} : tensor<16x16xf16, #mma> -> tensor<16x16xf16, #blocked>
-    %1 = triton_gpu.local_alloc %0 {allocation.offset = 0 : i32} : (tensor<16x16xf16, #blocked>) -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory>
+    %0 = triton_gpu.convert_layout %arg0 {allocation.offset = 0 : i32} : tensor<16x16xf16, #mma> -> tensor<16x16xf16, #blocked> loc(#loc1)
+    %1 = triton_gpu.local_alloc %0 {allocation.offset = 0 : i32} : (tensor<16x16xf16, #blocked>) -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> loc(#loc2)
     // This catches base ptr calculation in the computeBasePtr, checks if the gep has correct element type.
-    // CHECK: llvm.sub
-    // CHECK-NEXT: llvm.getelementptr
-    // CHECK-SAME: (!llvm.ptr<3>, i32) -> !llvm.ptr<3>, f16
-    %2 = triton_gpu.local_load %1 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>>
+    // CHECK: llvm.getelementptr {{.*}} (!llvm.ptr<3>, i32) -> !llvm.ptr<3>, f16 local_load:3:0
+    %2 = triton_gpu.local_load %1 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>> loc(#loc3)
     tt.return
   }
 }
+#loc1 = loc("conert_layout":1:0)
+#loc2 = loc("local_alloc":2:0)
+#loc3 = loc("local_load":3:0)

test/Conversion/amd/tritongpu_to_llvm.mlir

Lines changed: 28 additions & 0 deletions
@@ -34,3 +34,31 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
     tt.return
   }
 }
+
+// -----
+
+// Smoke test to check that mfma 32 and dot operand layouts can work with small tensors, for example with shape 16x16
+#mfma = #triton_gpu.amd_mfma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [2, 2], instrShape = [32, 32], isTransposed = true}>
+#dotop0 = #triton_gpu.dot_op<{opIdx = 0, parent = #mfma, kWidth=4}>
+#dotop1 = #triton_gpu.dot_op<{opIdx = 1, parent = #mfma, kWidth=4}>
+#shared = #triton_gpu.shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0], hasLeadingOffset = false}>
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 64 : i32} {
+  // CHECK-LABEL: small_mfma_tensor_conversions
+  tt.func public @small_mfma_tensor_conversions(%arg0: tensor<16x16xf16, #mfma>, %arg1: tensor<16x16x!tt.ptr<f32>, #mfma>) {
+    // CHECK-NOT: triton_gpu.convert_layout
+    %0 = triton_gpu.local_alloc %arg0 : (tensor<16x16xf16, #mfma>) -> !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory>
+    // CHECK-4: store {{.*}} vector<4xf16>
+    %1 = triton_gpu.local_load %0 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #dotop0>
+    // CHECK-2: load {{.*}} vector<4xf16>
+    %2 = triton_gpu.local_load %0 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #dotop1>
+    // CHECK-8: load {{.*}} vector<1xf16>
+    %3 = triton_gpu.local_load %0 : !tt.memdesc<16x16xf16, #shared, #triton_gpu.shared_memory> -> tensor<16x16xf16, #mfma>
+    // CHECK-4: load {{.*}} vector<4xf16>
+    %4 = tt.fp_to_fp %3 : tensor<16x16xf16, #mfma> -> tensor<16x16xf32, #mfma>
+
+    %5 = tt.dot %1, %2, %4 : tensor<16x16xf16, #dotop0> * tensor<16x16xf16, #dotop1> -> tensor<16x16xf32, #mfma>
+    // Store result to prevent DCE from removing all conversion related code
+    %6 = triton_gpu.local_alloc %5 : (tensor<16x16xf32, #mfma>) -> !tt.memdesc<16x16xf32, #shared, #triton_gpu.shared_memory>
+    tt.return
+  }
+}
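
For scale, the tile in this smoke test is tiny: 16x16 f16 is only 256 elements, so even if a single 64-lane wavefront covered the whole tile, each lane would hold just 4 elements — which is exactly the small-tensor regime this PR routes through the LinearLayout path. A trivial back-of-envelope check of that count (an illustrative sketch, not part of the commit):

#include <cstdio>

int main() {
  // Hypothetical per-lane element count for the 16x16 tile in the test above:
  // 256 f16 elements spread across one 64-lane wavefront -> 4 per lane.
  const int rows = 16, cols = 16, lanesPerWave = 64;
  const int elemsPerLane = rows * cols / lanesPerWave;
  std::printf("f16 elements per lane: %d\n", elemsPerLane); // prints 4
}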

test/TritonGPU/combine.mlir

Lines changed: 36 additions & 0 deletions
@@ -2649,3 +2649,39 @@ module attributes {"triton_gpu.num-warps" = 4 : i32, "triton_gpu.num-ctas" = 1 :
     tt.return
   }
 }
+
+// -----
+
+// Minimized reproducer for compiler crash during remove layouts conversions pass:
+// If dot result transformed into tensor with shape smaller than one MFMA instruction size, it triggers various asserts.
+// This is a smoke test that checks that compiler do not crash.
+//
+// CHECK-LABEL: small_tensor_mfma
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 4], warpsPerCTA = [1, 1], order = [0, 1]}>
+#mma = #triton_gpu.amd_mfma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 1], instrShape = [32, 32], isTransposed = true}>
+#mma1 = #triton_gpu.amd_mfma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [1, 1], instrShape = [16, 16], isTransposed = true}>
+module attributes {"triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 64 : i32} {
+  tt.func public @small_tensor_mfma(%arg0: !tt.ptr<f32>) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<32x32xf32, #mma>
+    %cst_0 = arith.constant dense<1.230000e+02> : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>
+    %cst_1 = arith.constant dense<1.230000e+02> : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>
+    %cst_2 = arith.constant dense<1.230000e+02> : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1, kWidth = 4}>>
+    %cst_3 = arith.constant dense<1.230000e+02> : tensor<32x16xf32, #mma1>
+    %0 = tt.dot %cst_0, %cst_1, %cst : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<32x32xf32, #mma>
+    %1 = triton_gpu.convert_layout %0 : tensor<32x32xf32, #mma> -> tensor<32x32xf32, #blocked>
+    %2 = "tt.reduce" (%1) ({
+    ^bb0(%arg1: f32, %arg2: f32):
+      %3 = arith.addf %arg1, %arg2 : f32
+      tt.reduce.return %3 : f32
+    }) {axis = 1 : i32} : (tensor<32x32xf32, #blocked>) -> tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>>
+    %4 = tt.expand_dims %2 {axis = 1 : i32} : tensor<32xf32, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<32x1xf32, #blocked>
+    %5 = tt.broadcast %4 : tensor<32x1xf32, #blocked> -> tensor<32x16xf32, #blocked>
+    %6 = triton_gpu.convert_layout %5 : tensor<32x16xf32, #blocked> -> tensor<32x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1, kWidth = 4}>>
+    %7 = tt.dot %cst_2, %6, %cst_3 : tensor<32x32xf32, #triton_gpu.dot_op<{opIdx = 0, parent = #mma1, kWidth = 4}>> * tensor<32x16xf32, #triton_gpu.dot_op<{opIdx = 1, parent = #mma1, kWidth = 4}>> -> tensor<32x16xf32, #mma1>
+    %addr = tt.splat %arg0 : !tt.ptr<f32> -> tensor<32x16x!tt.ptr<f32>, #blocked>
+    %8 = triton_gpu.convert_layout %7 : tensor<32x16xf32, #mma1> -> tensor<32x16xf32, #blocked>
+    tt.store %addr, %8 : tensor<32x16x!tt.ptr<f32>, #blocked>
+    tt.return
+  }
+}

third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM.cpp

Lines changed: 7 additions & 1 deletion
@@ -50,7 +50,11 @@ struct LocalLoadOpConversion
   }

 private:
-  // shared -> dot_operand if the result layout is mfma
+  /// Lower ttg.local_load in dot operand layout if the operand parent layout is
+  /// MFMA or WMMA.
+  ///
+  /// \returns value with packed loaded values or empty value if this local_load
+  /// is not supproted.
   Value lowerSharedToDotOperandMMA(
       triton::gpu::LocalLoadOp op, triton::gpu::LocalLoadOpAdaptor adaptor,
       const LLVMTypeConverter *typeConverter,
@@ -104,6 +108,8 @@ struct LocalLoadOpConversion
     isOuter = K == 1;
     Value res = lowerSharedToDotOperandMMA(op, adaptor, typeConverter, rewriter,
                                            dotOperandLayout, isOuter);
+    if (!res)
+      return failure();
     rewriter.replaceOp(op, res);
     return success();
   }

third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM/SharedToDotOperandMFMA.cpp

Lines changed: 6 additions & 0 deletions
@@ -231,6 +231,12 @@ Value convertLayout(int opIdx, ConversionPatternRewriter &rewriter,
     mfmaInstrK = elemsPerInstr[kDimIdx];
   }

+  if (mfmaInstrNonK > shape[nonKDimIdx] || mfmaInstrK > shape[kDimIdx]) {
+    // This pattern does not support cases tensor shape is smaller than
+    // one instruction size, it will be processed by LinearLayout converter
+    return Value();
+  }
+
   auto numReps = mfmaLayout.getRepForOperand(shape, kWidth, opIdx);
   auto numRepNonK = numReps[nonKDimIdx];
   auto numRepK = numReps[kDimIdx];
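
The early exit added above compares the per-operand MFMA instruction tile against the tensor shape and returns an empty Value when the tensor is smaller than one instruction; the calling pattern in ConvertLayoutOpToLLVM.cpp then returns failure(), and the LinearLayout-based converter handles the op instead. A small standalone C++ sketch of that guard-and-decline contract (the names and std::optional stand in for the real MLIR Value/LogicalResult types):

#include <array>
#include <cstdint>
#include <cstdio>
#include <optional>

struct Lowered { int64_t numTiles; }; // stand-in for the packed result Value

// Decline (analogous to `return Value();`) when the tensor is smaller than one
// MFMA instruction along either dimension; otherwise report how many
// instruction tiles cover the tensor.
std::optional<Lowered> convertLayoutSketch(std::array<int64_t, 2> instrShape,
                                           std::array<int64_t, 2> tensorShape) {
  if (instrShape[0] > tensorShape[0] || instrShape[1] > tensorShape[1])
    return std::nullopt;
  return Lowered{(tensorShape[0] / instrShape[0]) *
                 (tensorShape[1] / instrShape[1])};
}

int main() {
  // 32x32 MFMA instruction, 16x16 tensor: decline, so the calling pattern
  // returns failure() and the LinearLayout converter takes over.
  if (!convertLayoutSketch({32, 32}, {16, 16}))
    std::puts("unsupported here -> pattern falls back");
  // 32x32 instruction, 64x64 tensor: handled by this converter.
  if (auto res = convertLayoutSketch({32, 32}, {64, 64}))
    std::printf("lowered, %lld instruction tiles\n", (long long)res->numTiles);
}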

0 commit comments
