Skip to content

Commit a9c7ec1

Browse files
authored
[Util][GPU] Add TiedOpInterface implementation for iree_gpu.multi_mma (#18626)
This PR is part of what was originally #18608. It implements the TiedOpInterface for the iree_gpu.multi_mma op. This is a temporary solution for handling multi_mma ops before dispatch-workgroup creation, and is only needed right now because we rely on early materialization. It will enable e2e matmul tests with GPU data tiling while that feature is still being developed; this change can be dropped once we switch to late materialization. --------- Signed-off-by: Max Dawkins <[email protected]>
1 parent b7ac442 commit a9c7ec1

File tree

4 files changed

+67
-0
lines changed

4 files changed

+67
-0
lines changed

compiler/src/iree/compiler/DispatchCreation/test/convert_region_to_workgroups.mlir

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,3 +47,40 @@ util.func public @foo(%argA: tensor<?x?xf32>, %argB: tensor<5x10xf32>, %argC: te
4747
// CHECK: util.return %[[r0]], %[[r1]]
4848
util.return %r0, %r1 : tensor<?x?xf32>, tensor<5x11xf32>
4949
}
50+
51+
// -----
52+
53+
// TODO(Max191): Remove this test once GPU data tiling stops using early
// materialization.
// Checks that a flow.dispatch.region wrapping an iree_gpu.multi_mma converts
// to flow.dispatch.workgroups, with the accumulator operand (%arg2) bound as
// a readwrite tensor — i.e. the TiedOpInterface ties the op's result to its
// acc operand instead of allocating a separate writeonly output.
util.func public @multi_mma(
    %arg0: tensor<4x16x8x4x16x2x4xf16>,
    %arg1: tensor<4x16x4x2x4x16x2x4xf16>,
    %arg2: tensor<4x4x8x4x2x4x16x4xf32>) -> (tensor<4x4x8x4x2x4x16x4xf32>) {
  %9 = flow.dispatch.region -> (tensor<4x4x8x4x2x4x16x4xf32>) {
    %13 = iree_gpu.multi_mma %arg0, %arg1, %arg2 {
      indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>,
                       affine_map<(d0, d1, d2) -> (d1, d2)>,
                       affine_map<(d0, d1, d2) -> (d0, d1)>],
      iterator_types = [#iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<parallel>, #iree_gpu.iterator_type<reduction>],
      kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x16_F16, unroll_m = 8, unroll_n = 2, unroll_n_to_subgroups = 4, unroll_k = 2>}
      : tensor<4x16x8x4x16x2x4xf16>, tensor<4x16x4x2x4x16x2x4xf16> into tensor<4x4x8x4x2x4x16x4xf32>
    flow.return %13 : tensor<4x4x8x4x2x4x16x4xf32>
  }
  util.return %9 : tensor<4x4x8x4x2x4x16x4xf32>
}
// CHECK-LABEL: util.func public @multi_mma(
//       CHECK:   %[[arg0:.*]]: tensor<4x16x8x4x16x2x4xf16>, %[[arg1:.*]]: tensor<4x16x4x2x4x16x2x4xf16>, %[[arg2:.*]]: tensor<4x4x8x4x2x4x16x4xf32>
//       CHECK:   %[[r0:.*]] = flow.dispatch.workgroups(%[[arg0]], %[[arg1]], %[[arg2]])
//  CHECK-SAME:       : (tensor<4x16x8x4x16x2x4xf16>, tensor<4x16x4x2x4x16x2x4xf16>, tensor<4x4x8x4x2x4x16x4xf32>)
//  CHECK-NEXT:       (%[[arg3:.*]]: !flow.dispatch.tensor<readonly:tensor<4x16x8x4x16x2x4xf16>>,
//  CHECK-SAME:        %[[arg4:.*]]: !flow.dispatch.tensor<readonly:tensor<4x16x4x2x4x16x2x4xf16>>,
//  CHECK-SAME:        %[[arg5:.*]]: !flow.dispatch.tensor<readwrite:tensor<4x4x8x4x2x4x16x4xf32>>)
//   CHECK-DAG:     %[[loadLHS:.*]] = flow.dispatch.tensor.load %[[arg3]]
//   CHECK-DAG:     %[[loadRHS:.*]] = flow.dispatch.tensor.load %[[arg4]]
//   CHECK-DAG:     %[[loadACC:.*]] = flow.dispatch.tensor.load %[[arg5]]
//       CHECK:     %[[MULTI_MMA:.*]] = iree_gpu.multi_mma %[[loadLHS]], %[[loadRHS]], %[[loadACC]]
// The result is stored back into the acc binding (%[[arg5]]), confirming the
// in-place tie rather than a separate result buffer.
//       CHECK:     flow.dispatch.tensor.store %[[MULTI_MMA]], %[[arg5]]
//       CHECK:     flow.return
//       CHECK:   }
//       CHECK:   util.return %[[r0]]

compiler/src/iree/compiler/ExternalInterfaces/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@ iree_compiler_cc_library(
2727
"UtilExternalModels.h",
2828
],
2929
deps = [
30+
"//compiler/src/iree/compiler/Codegen/Dialect/GPU/IR:IREEGPUDialect",
3031
"//compiler/src/iree/compiler/Dialect/Encoding/IR",
3132
"//compiler/src/iree/compiler/Dialect/Flow/IR",
3233
"//compiler/src/iree/compiler/Dialect/HAL/IR",

compiler/src/iree/compiler/ExternalInterfaces/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ iree_cc_library(
3232
MLIRMLProgramDialect
3333
MLIRTensorDialect
3434
MLIRValueBoundsOpInterface
35+
iree::compiler::Codegen::Dialect::GPU::IR::IREEGPUDialect
3536
iree::compiler::Dialect::Encoding::IR
3637
iree::compiler::Dialect::Flow::IR
3738
iree::compiler::Dialect::HAL::IR

compiler/src/iree/compiler/ExternalInterfaces/UtilExternalModels.cpp

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
#include "iree/compiler/ExternalInterfaces/UtilExternalModels.h"
88

9+
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUDialect.h"
10+
#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.h"
911
#include "iree/compiler/Dialect/Encoding/IR/EncodingDialect.h"
1012
#include "iree/compiler/Dialect/Encoding/IR/EncodingOps.h"
1113
#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtDialect.h"
@@ -168,6 +170,27 @@ struct LinalgOpTiedOpInterfaceHelper {
168170
}
169171
};
170172

173+
// TODO(Max191): Remove this interface once GPU data tiling stops using early
174+
// materialization. This only exists for handling multi_mma ops before dispatch
175+
// workgroups are created, which only happens with early materialization.
176+
struct MultiMmaOpTiedOpInterface
177+
: public IREE::Util::TiedOpInterface::ExternalModel<
178+
MultiMmaOpTiedOpInterface, IREE::GPU::MultiMmaOp> {
179+
Value getTiedResult(Operation *op, unsigned resultIndex) const {
180+
auto linalgOp = cast<IREE::GPU::MultiMmaOp>(op);
181+
return IREE::Util::TiedOpInterface::findTiedBaseValue(linalgOp.getAcc());
182+
}
183+
184+
::std::optional<unsigned>
185+
getTiedResultOperandIndex(Operation *op, unsigned resultIndex) const {
186+
return {2}; // acc
187+
}
188+
189+
SmallVector<int64_t> getTiedResultOperandIndices(Operation *op) const {
190+
return {2}; // acc
191+
}
192+
};
193+
171194
//===----------------------------------------------------------------------===//
172195
// HoistableOpInterface
173196
//===----------------------------------------------------------------------===//
@@ -289,6 +312,11 @@ void registerUtilExternalModels(DialectRegistry &registry) {
289312
*context);
290313
});
291314

315+
registry.addExtension(+[](MLIRContext *context,
316+
IREE::GPU::IREEGPUDialect *dialect) {
317+
IREE::GPU::MultiMmaOp::attachInterface<MultiMmaOpTiedOpInterface>(*context);
318+
});
319+
292320
registry.addExtension(
293321
+[](MLIRContext *context, linalg::LinalgDialect *dialect) {
294322
// Register all Linalg structured ops. `LinalgOp` is an interface and it

0 commit comments

Comments
 (0)