Skip to content

Commit 93bb9e4

Browse files
authored
[GPU] Add col_major optional attribute to MMAAttr (#19860)
For MMA intrinsics with symmetric register layouts, we have the freedom to choose between row-major and column-major output layouts by simply swapping the input operands. This adds a `col_major` flag to `MMAAttr` that allows specifying whether the result should be produced column major.
1 parent 6d25b91 commit 93bb9e4

File tree

6 files changed

+135
-7
lines changed

6 files changed

+135
-7
lines changed

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.cpp

Lines changed: 35 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,21 @@ MMASingleSubgroupLayout getSingleSubgroupLayout(MMAIntrinsic intrinsic,
345345
return {};
346346
}
347347

348+
MMASingleSubgroupLayout getSingleSubgroupLayout(MMAIntrinsic intrinsic,
349+
MMAFragment fragment,
350+
bool colMajor) {
351+
MMASingleSubgroupLayout baseLayout =
352+
getSingleSubgroupLayout(intrinsic, fragment);
353+
assert(baseLayout.element.size() == 2 && "expected 2d layout");
354+
if (colMajor) {
355+
std::swap(baseLayout.element[0], baseLayout.element[1]);
356+
std::swap(baseLayout.thread[0], baseLayout.thread[1]);
357+
std::swap(baseLayout.outer[0], baseLayout.outer[1]);
358+
std::swap(baseLayout.tstrides[0], baseLayout.tstrides[1]);
359+
}
360+
return baseLayout;
361+
}
362+
348363
// Struct describing the shape of a MMA operation, but not the detailed layout.
349364
struct OpaqueMmaLayout {
350365
int64_t mSize = 0;
@@ -388,7 +403,11 @@ static OpaqueMmaLayout getOpaqueMMALayout(MLIRContext *context,
388403
MMASingleSubgroupLayout getSingleSubgroupLayout(MmaInterfaceAttr mmaKind,
389404
MMAFragment fragment) {
390405
if (auto mmaAttr = dyn_cast<MMAAttr>(mmaKind)) {
391-
return getSingleSubgroupLayout(mmaAttr.getIntrinsic(), fragment);
406+
// |colMajor| indicates that the accumulator layout should be returned
407+
// column major.
408+
return getSingleSubgroupLayout(mmaAttr.getIntrinsic(), fragment,
409+
fragment == MMAFragment::Acc &&
410+
mmaAttr.getColMajor());
392411
}
393412
if (auto vmmaAttr = dyn_cast<VirtualMMAAttr>(mmaKind)) {
394413
return getSingleSubgroupLayout(vmmaAttr.getIntrinsic(), fragment);
@@ -401,6 +420,10 @@ MMASingleSubgroupLayout getSingleSubgroupLayout(MmaInterfaceAttr mmaKind,
401420
// MMA Attributes
402421
//===----------------------------------------------------------------------===//
403422

423+
MMAAttr MMAAttr::get(MLIRContext *context, MMAIntrinsic type) {
424+
return Base::get(context, type, /*colMajor=*/false);
425+
}
426+
404427
std::tuple<Type, Type, Type> MMAAttr::getABCElementTypes() const {
405428
return IREE::GPU::getABCElementTypes(getContext(), getIntrinsic());
406429
}
@@ -468,7 +491,7 @@ SmallVector<VirtualMMAIntrinsic> MMAAttr::getVirtualIntrinsics() const {
468491

469492
static Value createMmaOp(OpBuilder &builder, Location loc,
470493
MMAIntrinsic intrinsic, Type resultType, Value lhs,
471-
Value rhs, Value acc) {
494+
Value rhs, Value acc, bool colMajor = false) {
472495
auto getVecOrSingleElem = [&](Value vec) -> Value {
473496
bool one = llvm::cast<VectorType>(vec.getType()).getNumElements() == 1;
474497
return one ? builder.create<vector::ExtractOp>(loc, vec, 0) : vec;
@@ -478,6 +501,13 @@ static Value createMmaOp(OpBuilder &builder, Location loc,
478501
// MFMA intrinsics want single-element operands of element type, not vector.
479502
lhs = getVecOrSingleElem(lhs);
480503
rhs = getVecOrSingleElem(rhs);
504+
505+
// Because the thread layout of the lhs and rhs are transpositions of one
506+
// another for all MFMA variants, to produce a column major result we can
507+
// simply swap the operands to the MFMA.
508+
if (colMajor) {
509+
std::swap(lhs, rhs);
510+
}
481511
return builder
482512
.create<amdgpu::MFMAOp>(loc, resultType, layout.mSize, layout.nSize,
483513
layout.kSize, getBlockSize(intrinsic), lhs, rhs,
@@ -507,7 +537,7 @@ FailureOr<Value> MMAAttr::buildMmaOperation(OpBuilder &builder, Location loc,
507537
return failure();
508538
}
509539
if (Value value = createMmaOp(builder, loc, getIntrinsic(), resultType, lhs,
510-
rhs, acc)) {
540+
rhs, acc, getColMajor())) {
511541
return value;
512542
}
513543
return failure();
@@ -592,8 +622,8 @@ LogicalResult MMAAttr::populateOperandOffsetsSizesStrides(
592622
SmallVector<OpFoldResult> &offsets, SmallVector<OpFoldResult> &sizes,
593623
SmallVector<OpFoldResult> &strides) const {
594624

595-
MMASingleSubgroupLayout subgroupLayout =
596-
getSingleSubgroupLayout(getIntrinsic(), fragment);
625+
MMASingleSubgroupLayout subgroupLayout = getSingleSubgroupLayout(
626+
getIntrinsic(), fragment, fragment == MMAFragment::Acc && getColMajor());
597627
SmallVector<OpFoldResult> canonicalOffsets;
598628
SmallVector<OpFoldResult> canonicalSizes;
599629
if (failed(populateCanonicalOffsetsSizesAndStrides(

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,10 @@ int64_t getKSize(MMAIntrinsic intrinsic);
7272
MMASingleSubgroupLayout getSingleSubgroupLayout(MMAIntrinsic intrinsic,
7373
MMAFragment fragment);
7474

75+
MMASingleSubgroupLayout getSingleSubgroupLayout(MMAIntrinsic intrinsic,
76+
MMAFragment fragment,
77+
bool colMajor);
78+
7579
MMASingleSubgroupLayout getSingleSubgroupLayout(VirtualMMAIntrinsic intrinsic,
7680
MMAFragment fragment);
7781

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.td

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -150,13 +150,25 @@ def IREEGPU_MMAAttr : AttrDef<IREEGPU_Dialect, "MMA", [
150150
The |intrinsic| field specifies which particular MMA intrinsic this refers
151151
to, with each intrinsic implicating a specific MNK shape and operand types.
152152
See IREEGPUEnums.td for the definition of the intrinsics.
153+
154+
If set to true, |col_major| indicates that the result should be produced
155+
column major. This is equivalent to instead computing:
156+
157+
```
158+
C^T += B^T x A^T
159+
```
153160
}];
154161

155162
let parameters = (ins
156-
EnumParameter<IREEGPU_MMAIntrinsic>:$intrinsic
163+
EnumParameter<IREEGPU_MMAIntrinsic>:$intrinsic,
164+
DefaultValuedParameter<"bool", "false">:$col_major
157165
);
158166

159-
let assemblyFormat = "`<` params `>`";
167+
let assemblyFormat = "`<` $intrinsic (`,` `col_major` `=` $col_major^)? `>`";
168+
169+
let builders = [
170+
AttrBuilder<(ins "MMAIntrinsic":$intrinsic)>
171+
];
160172

161173
let extraClassDeclaration = [{
162174
int64_t getBlockSize() const;

compiler/src/iree/compiler/Codegen/Dialect/GPU/IR/test/iree_gpu_attrs.mlir

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,15 @@ module {
1818
// CHECK-LABEL: func @test_mfma_f16_32x32x8_f32
1919
// CHECK-SAME: mma_types = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16>
2020

21+
module {
22+
func.func @test_col_major_mfma_f16_16x16x16_f32() attributes {
23+
mma_types = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16, col_major = true>} {
24+
return
25+
}
26+
}
27+
// CHECK-LABEL: func @test_col_major_mfma_f16_16x16x16_f32
28+
// CHECK-SAME: mma_types = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16, col_major = true>
29+
2130
module {
2231
func.func @test_WMMAR3_f16_16x16x16_f32() attributes {
2332
mma_types = #iree_gpu.mma_layout<WMMAR3_F32_16x16x16_F16>} {

compiler/src/iree/compiler/Codegen/Dialect/GPU/TransformExtensions/test/lower_multi_mma.mlir

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,40 @@ module attributes { transform.with_named_sequence } {
6868

6969
// -----
7070

71+
#contraction_accesses = [
72+
affine_map<() -> ()>,
73+
affine_map<() -> ()>,
74+
affine_map<() -> ()>
75+
]
76+
func.func @lower_col_major_multi_mma_mfma_32x32x8(%lhs: vector<4xf16>, %rhs: vector<4xf16>, %acc: vector<16xf32>) -> vector<16xf32> {
77+
%0 = iree_gpu.multi_mma %lhs, %rhs, %acc {
78+
indexing_maps = #contraction_accesses,
79+
iterator_types = [],
80+
kind = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16, col_major = true>
81+
} : vector<4xf16>, vector<4xf16> into vector<16xf32>
82+
return %0 : vector<16xf32>
83+
}
84+
85+
module attributes { transform.with_named_sequence } {
86+
transform.named_sequence @__transform_main(%root: !transform.any_op {transform.readonly}) {
87+
%func = transform.structured.match ops{["func.func"]} in %root : (!transform.any_op) -> !transform.any_op
88+
transform.apply_patterns to %func {
89+
transform.apply_patterns.iree.lower_multi_mma
90+
} : !transform.any_op
91+
transform.yield
92+
}
93+
}
94+
95+
// CHECK-LABEL: func @lower_col_major_multi_mma_mfma_32x32x8
96+
// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: vector<4xf16>
97+
// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: vector<4xf16>
98+
// CHECK-SAME: %[[ACC:[A-Za-z0-9]+]]: vector<16xf32>
99+
// CHECK: amdgpu.mfma %[[RHS]] * %[[LHS]] + %[[ACC]]
100+
// CHECK-SAME: blocks = 1 : i32, k = 8 : i32, m = 32 : i32, n = 32 : i32
101+
// CHECK-SAME: blgp = none : vector<4xf16>, vector<4xf16>, vector<16xf32>
102+
103+
// -----
104+
71105
#contraction_accesses = [
72106
affine_map<() -> ()>,
73107
affine_map<() -> ()>,

compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms/test/distribute_mma_to_lanes.mlir

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,45 @@ module {
107107

108108
// -----
109109

110+
#contraction_accesses = [
111+
affine_map<(i, j, k) -> (i, k)>,
112+
affine_map<(i, j, k) -> (k, j)>,
113+
affine_map<(i, j, k) -> (i, j)>
114+
]
115+
module {
116+
func.func @col_major_matmul_32x32x8(%arg0: tensor<2x8x32x8xf16>, %arg1: tensor<8x2x32x8xf16>, %arg2: tensor<2x2x32x4x8xf32>) -> tensor<2x2x32x4x8xf32> {
117+
%mm = iree_gpu.multi_mma %arg0, %arg1, %arg2 {
118+
indexing_maps = #contraction_accesses,
119+
iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>],
120+
kind = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16, col_major = true>,
121+
rhs_permutation = array<i64: 1, 0>
122+
} : tensor<2x8x32x8xf16>, tensor<8x2x32x8xf16> into tensor<2x2x32x4x8xf32>
123+
return %mm : tensor<2x2x32x4x8xf32>
124+
}
125+
}
126+
127+
// CHECK-DAG: #[[$MAP:.+]] = affine_map<(d0, d1, d2) -> (d0, d2)>
128+
// CHECK-DAG: #[[$MAP1:.+]] = affine_map<(d0, d1, d2) -> (d2, d1)>
129+
// CHECK-DAG: #[[$MAP2:.+]] = affine_map<(d0, d1, d2) -> (d0, d1)>
130+
131+
// CHECK-LABEL: func @col_major_matmul_32x32x8
132+
// CHECK-SAME: %[[LHS:[A-Za-z0-9]+]]: tensor<2x8x32x8xf16>
133+
// CHECK-SAME: %[[RHS:[A-Za-z0-9]+]]: tensor<8x2x32x8xf16>
134+
// CHECK: scf.forall (%[[LANEID:.+]]) in (64) shared_outs(%[[ACC:.+]] = {{.*}}) -> (tensor<2x2x32x4x8xf32>)
135+
// CHECK-DAG: %[[ID:.+]]:3 = affine.delinearize_index %[[LANEID]] into (2, 32)
136+
// CHECK-DAG: %[[IDY:.+]] = affine.linearize_index disjoint [%[[ID]]#1, %c0] by (2, 4)
137+
// CHECK-DAG: %[[LHS_SLICE:.+]] = tensor.extract_slice %[[LHS]][0, 0, %[[ID]]#2, %[[IDY]]] [2, 8, 1, 4]
138+
// CHECK-DAG: %[[RHS_SLICE:.+]] = tensor.extract_slice %[[RHS]][0, 0, %[[ID]]#2, %[[IDY]]] [8, 2, 1, 4]
139+
// CHECK-DAG: %[[ACC_SLICE:.+]] = tensor.extract_slice %[[ACC]][0, 0, %[[ID]]#2, 0, %[[IDY]]] [2, 2, 1, 4, 4]
140+
// CHECK: %[[MMA:.+]] = iree_gpu.multi_mma %[[LHS_SLICE]], %[[RHS_SLICE]], %[[ACC_SLICE]]
141+
// CHECK-SAME: indexing_maps = [#[[$MAP]], #[[$MAP1]], #[[$MAP2]]]
142+
// CHECK-SAME: kind = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16, col_major = true>
143+
// CHECK-SAME: : tensor<2x8x1x4xf16>, tensor<8x2x1x4xf16> into tensor<2x2x1x4x4xf32>
144+
// CHECK: tensor.parallel_insert_slice %[[MMA]] into %[[ACC]][0, 0, %[[ID]]#2, 0, %[[IDY]]] [2, 2, 1, 4, 4]
145+
// CHECK: mapping = [#iree_gpu.lane_id<0>]
146+
147+
// -----
148+
110149
#contraction_accesses = [
111150
affine_map<(i, j, k) -> (i, k)>,
112151
affine_map<(i, j, k) -> (k, j)>,

0 commit comments

Comments (0)