GPU ukernel lowering config for data-tiled multi_mma, and a simple ukernel. (iree-org#19504)

bjacob · web-flow · commit 72d98bcafaf9 · 2024-12-17T15:31:52.000-05:00
This PR adds the KernelConfig logic to generate a lowering_config
selecting a ukernel for multi_mma. In order to be able to test it, this
PR also adds a very simple `multi_mma` ukernel, but it isn't actually
exercised yet, other than successfully compiling to bitcode. The
compiler logic only cares about the existence of the resulting bitcode
file. The actual lowering to ukernel op will come in the next PR.

---------

Signed-off-by: Benoit Jacob <jacob.benoit.1@gmail.com>
diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel b/compiler/plugins/target/ROCM/builtins/ukernel/BUILD.bazel
@@ -46,8 +46,8 @@ argmax_types = [
 [iree_amdgpu_bitcode_library(
     name = "iree_uk_amdgpu_argmax_%s_%s" % (type, gpu_arch),
     srcs = [
-        "iree_uk_amdgpu_argmax_%s.c" % type,
         "common.h",
+        "iree_uk_amdgpu_argmax_%s.c" % type,
     ],
     out = "iree_uk_amdgpu_argmax_%s.%s.bc" % (type, gpu_arch),
     gpu_arch = gpu_arch,
@@ -59,9 +59,21 @@ argmax_bc_files = [
     for gpu_arch in gpu_archs
 ]
 
+iree_amdgpu_bitcode_library(
+    name = "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4_gfx942",
+    srcs = [
+        "common.h",
+        "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.c",
+    ],
+    out = "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc",
+    gpu_arch = "gfx942",
+)
+
 iree_c_embed_data(
     name = "iree_uk_amdgpu_bitcode",
-    srcs = argmax_bc_files,
+    srcs = argmax_bc_files + [
+        "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc",
+    ],
     c_file_output = "iree_uk_amdgpu_bitcode.c",
     flatten = True,
     h_file_output = "iree_uk_amdgpu_bitcode.h",
diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt b/compiler/plugins/target/ROCM/builtins/ukernel/CMakeLists.txt
@@ -206,6 +206,18 @@ iree_amdgpu_bitcode_library(
     "iree_uk_amdgpu_argmax_f32i64.gfx1100.bc"
 )
 
+iree_amdgpu_bitcode_library(
+  NAME
+    iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4_gfx942
+  GPU_ARCH
+    gfx942
+  SRCS
+    "common.h"
+    "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.c"
+  OUT
+    "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc"
+)
+
 iree_c_embed_data(
   NAME
     iree_uk_amdgpu_bitcode
@@ -226,6 +238,7 @@ iree_c_embed_data(
     "iree_uk_amdgpu_argmax_f32i64.gfx1100.bc"
     "iree_uk_amdgpu_argmax_f32i64.gfx90a.bc"
     "iree_uk_amdgpu_argmax_f32i64.gfx942.bc"
+    "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc"
   C_FILE_OUTPUT
     "iree_uk_amdgpu_bitcode.c"
   H_FILE_OUTPUT
diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/common.h b/compiler/plugins/target/ROCM/builtins/ukernel/common.h
@@ -57,6 +57,13 @@ typedef __UINT64_TYPE__ uint64_t;
 #define FLT_MIN __FLT_MIN__
 #define FLT_MAX __FLT_MAX__
 
+//===----------------------------------------------------------------------===//
+// Vector typedefs
+//===----------------------------------------------------------------------===//
+
+typedef __attribute__((__vector_size__(8 * 2))) int64_t int64x2_t;
+typedef __attribute__((__vector_size__(4 * 4))) int32_t int32x4_t;
+
 //===----------------------------------------------------------------------===//
 // Declarations for Clangd, which may be slightly older than actual clang.
 // Drop these as clangd versions used in practice gain these builtins.
diff --git a/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.c b/compiler/plugins/target/ROCM/builtins/ukernel/iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.c
@@ -0,0 +1,53 @@
+// Copyright 2024 The IREE Authors
+//
+// Licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "compiler/plugins/target/ROCM/builtins/ukernel/common.h"
+
+// Very naive kernel. TODO(bjacob):
+// 1. Shared memory: can't allocate it within the microkernel (which is just a
+//    helper device function, not the actual amdgpu_kernel). Need to get it
+//    passed down here as a `T [[clang::address_space(3)]] *` parameter.
+// 2. Better scheduling via either barrier intrinsics or inline assembly.
+// 3. Subgroups1x4 being asymmetric is a historical accident... should be 2x2.
+[[clang::always_inline]] void
+iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4(
+    const int8_t *a_buffer, int64_t a_offset, const int8_t *b_buffer,
+    int64_t b_offset, int32_t *c_buffer, int64_t c_offset, int64_t k_size) {
+  int tid = __builtin_amdgcn_workitem_id_x();
+
+  // Load existing accumulators.
+  int32x4_t acc[8][2] = {{0}};
+  int32x4_t *c_global = (int32x4_t *)(c_buffer + c_offset);
+  for (int i = 0; i < 8; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      acc[i][j] = c_global[256 * (2 * i + j) + tid];
+    }
+  }
+
+  // Arithmetic loop.
+  const int64x2_t *a_global =
+      (const int64x2_t *)(a_buffer + a_offset) + (tid % 64);
+  const int64x2_t *b_global = (const int64x2_t *)(b_buffer + b_offset) + tid;
+  for (int k_outer = 0; k_outer < k_size; ++k_outer) {
+    for (int i = 0; i < 8; ++i) {
+      for (int j = 0; j < 2; ++j) {
+        for (int k = 0; k < 2; ++k) {
+          acc[i][j] = __builtin_amdgcn_mfma_i32_16x16x32_i8(
+              a_global[64 * i][k], b_global[256 * j][k], acc[i][j], 0, 0, 0);
+        }
+      }
+    }
+    a_global += 512;
+    b_global += 512;
+  }
+
+  // Store accumulators.
+  for (int i = 0; i < 8; ++i) {
+    for (int j = 0; j < 2; ++j) {
+      c_global[256 * (2 * i + j) + tid] = acc[i][j];
+    }
+  }
+}
diff --git a/compiler/plugins/target/ROCM/test/BUILD.bazel b/compiler/plugins/target/ROCM/test/BUILD.bazel
@@ -17,6 +17,7 @@ iree_lit_test_suite(
     srcs = [
         "config_ukernel_argmax_gfx908.mlir",
         "config_ukernel_argmax_gfx942.mlir",
+        "config_ukernel_multi_mma_gfx942.mlir",
         "default_tuning_specs_amdgpu.mlir",
         "lowering_strategy_from_tuning_spec.mlir",
         "ukernel_pipeline_transform.mlir",
diff --git a/compiler/plugins/target/ROCM/test/CMakeLists.txt b/compiler/plugins/target/ROCM/test/CMakeLists.txt
@@ -16,6 +16,7 @@ iree_lit_test_suite(
   SRCS
     "config_ukernel_argmax_gfx908.mlir"
     "config_ukernel_argmax_gfx942.mlir"
+    "config_ukernel_multi_mma_gfx942.mlir"
     "default_tuning_specs_amdgpu.mlir"
     "lowering_strategy_from_tuning_spec.mlir"
     "ukernel_pipeline_transform.mlir"
diff --git a/compiler/plugins/target/ROCM/test/config_ukernel_multi_mma_gfx942.mlir b/compiler/plugins/target/ROCM/test/config_ukernel_multi_mma_gfx942.mlir
@@ -0,0 +1,29 @@
+// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 --pass-pipeline='builtin.module(iree-llvmgpu-select-lowering-strategy)' %s | FileCheck %s
+
+func.func @multi_mma_mfma_i32_16x16x32_i8(%a : tensor<1x2x8x4x16x2x8xi8>,
+                                %b : tensor<1x2x4x2x4x16x2x8xi8>,
+                                %c : tensor<1x1x8x4x2x4x16x4xi32>)
+    -> tensor<1x1x8x4x2x4x16x4xi32> attributes {
+  hal.executable.target = #hal.executable.target<"rocm", "rocm-hsaco-fb", {ukernels = "multi_mma"}>
+} {
+  %d = iree_gpu.multi_mma %a, %b, %c {indexing_maps = [
+      affine_map<(d0, d1, d2) -> (d0, d2)>,
+      affine_map<(d0, d1, d2) -> (d1, d2)>,
+      affine_map<(d0, d1, d2) -> (d0, d1)>
+    ], iterator_types = [
+      #iree_gpu.iterator_type<parallel>,
+      #iree_gpu.iterator_type<parallel>,
+      #iree_gpu.iterator_type<reduction>
+    ], kind = #iree_gpu.data_tiled_mma_layout<
+      intrinsic =  MFMA_I32_16x16x32_I8,
+      unroll_m = 8, unroll_n = 2, subgroups_n = 4, unroll_k = 2
+    >} : tensor<1x2x8x4x16x2x8xi8>, tensor<1x2x4x2x4x16x2x8xi8> into tensor<1x1x8x4x2x4x16x4xi32>
+  return %d : tensor<1x1x8x4x2x4x16x4xi32>
+}
+
+// CHECK-LABEL: @multi_mma_mfma_i32_16x16x32_i8
+//       CHECK: iree_gpu.multi_mma
+//  CHECK-SAME: #hal.executable.object<{path = "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4.gfx942.bc"
+//  CHECK-NOT:  promote_operands
+//  CHECK-SAME: reduction = [0, 0, 0]
+//  CHECK-SAME: #iree_gpu.ukernel_config<name = "iree_uk_amdgpu_multi_mma_mfma_i32_16x16x32_i8_unroll8x2x2_subgroups1x4"
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.cpp
@@ -33,10 +33,9 @@ namespace mlir::iree_compiler::IREE::GPU {
 
 constexpr int64_t kCacheLineSizeBits = 128 * 8;
 
-LogicalResult
-setDataTiledMultiMmaLoweringConfig(IREE::GPU::TargetAttr target,
-                                   mlir::FunctionOpInterface entryPoint,
-                                   Operation *op) {
+LogicalResult setDataTiledMultiMmaLoweringConfig(
+    IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPoint,
+    Operation *op, IREE::GPU::UKernelConfigAttr ukernelConfig) {
   auto multiMmaOp = dyn_cast<IREE::GPU::MultiMmaOp>(op);
   if (!multiMmaOp) {
     return failure();
@@ -70,7 +69,7 @@ setDataTiledMultiMmaLoweringConfig(IREE::GPU::TargetAttr target,
   SmallVector<int64_t> reductionTileSizes(iterationRank, 0);
   for (int64_t kDim : contractionDims.k) {
     workgroupTileSizes[kDim] = 0;
-    reductionTileSizes[kDim] = 1;
+    reductionTileSizes[kDim] = ukernelConfig ? 0 : 1;
   }
 
   // Set tile sizes.
@@ -81,8 +80,16 @@ setDataTiledMultiMmaLoweringConfig(IREE::GPU::TargetAttr target,
                      b.getI64ArrayAttr(workgroupTileSizes));
   attrs.emplace_back(b.getStringAttr("reduction"),
                      b.getI64ArrayAttr(reductionTileSizes));
-  // Promote operands to use shared memory for LHS and RHS.
-  GPU::setPromotedOperandList(context, attrs, {0, 1});
+  if (ukernelConfig) {
+    attrs.emplace_back(b.getStringAttr("ukernel"), ukernelConfig);
+  } else {
+    // Promote operands to use shared memory for LHS and RHS.
+    // Don't do that with ukernels: their untiled reduction dimension is too
+    // large to fit in shared memory, so they just want global memory and they
+    // will take care of moving small chunks at a time into a shared memory
+    // operand that will be created together with the ukernel op.
+    GPU::setPromotedOperandList(context, attrs, {0, 1});
+  }
   auto configDict = b.getDictionaryAttr(attrs);
   auto loweringConfig = IREE::GPU::LoweringConfigAttr::get(context, configDict);
 
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h b/compiler/src/iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h
@@ -16,10 +16,9 @@ namespace mlir::iree_compiler::IREE::GPU {
 
 /// Helper for setting up a data tiled multi_mma config based on the specified
 /// target.
-LogicalResult
-setDataTiledMultiMmaLoweringConfig(IREE::GPU::TargetAttr target,
-                                   mlir::FunctionOpInterface entryPoint,
-                                   Operation *op);
+LogicalResult setDataTiledMultiMmaLoweringConfig(
+    IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPoint,
+    Operation *op, IREE::GPU::UKernelConfigAttr ukernelConfig);
 
 /// Helper for setting up a convolution config using IGEMM based on the
 /// specified target.
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp
@@ -2099,15 +2099,9 @@ static LogicalResult setTransposeConfig(mlir::FunctionOpInterface entryPoint,
 /// Set the configuration for argmax when ukernels are enabled.
 /// Distribute all parallel dim across different workgroups, and only use single
 /// subgroup per workgroup.
-static LogicalResult
-setArgmaxUkernelConfig(IREE::GPU::TargetAttr target,
-                       mlir::FunctionOpInterface entryPoint,
-                       linalg::GenericOp op) {
-  IREE::GPU::UKernelConfigAttr ukernelConfig = selectUKernel(op);
-  if (!ukernelConfig) {
-    return failure();
-  }
-
+static LogicalResult setArgmaxUkernelConfig(
+    IREE::GPU::TargetAttr target, mlir::FunctionOpInterface entryPoint,
+    linalg::GenericOp op, IREE::GPU::UKernelConfigAttr ukernelConfig) {
   SmallVector<unsigned> parallelDims;
   SmallVector<unsigned> reductionDims;
   op.getParallelDims(parallelDims);
@@ -2170,15 +2164,6 @@ setArgmaxUkernelConfig(IREE::GPU::TargetAttr target,
   return success();
 }
 
-/// Make UKernels take the LLVMGPUDefault lowering pipeline.
-static LogicalResult
-setUKernelConfig(mlir::FunctionOpInterface entryPoint,
-                 IREE::Codegen::UKernelOpInterface ukernelOp) {
-  auto translationInfo = IREE::Codegen::TranslationInfoAttr::get(
-      entryPoint->getContext(), CodeGenPipeline::LLVMGPUDefault);
-  return setTranslationInfo(entryPoint, translationInfo);
-}
-
 /// Decides the tiling and distribution parameters for one convolution
 /// dimension. Returns true if we can succesfully deduce.
 ///
@@ -2358,13 +2343,14 @@ static LogicalResult setConvolutionConfig(
 static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
                                    mlir::FunctionOpInterface entryPointFn,
                                    Operation *computeOp) {
+  IREE::GPU::UKernelConfigAttr ukernelConfig = selectUKernel(computeOp);
   LLVM_DEBUG({
     DBGS() << "Selecting root config for: ";
     computeOp->print(llvm::dbgs(), OpPrintingFlags().skipRegions());
     llvm::dbgs() << "\n";
   });
   if (succeeded(setDataTiledMultiMmaLoweringConfig(target, entryPointFn,
-                                                   computeOp))) {
+                                                   computeOp, ukernelConfig))) {
     LDBG("Tile and fuse data tiled multi_mma config");
     return success();
   }
@@ -2410,8 +2396,9 @@ static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
     if (genericOp && succeeded(setTransposeConfig(entryPointFn, genericOp))) {
       LDBG("Transpose Config");
       return success();
-    } else if (genericOp && succeeded(setArgmaxUkernelConfig(
-                                target, entryPointFn, genericOp))) {
+    } else if (genericOp && ukernelConfig &&
+               succeeded(setArgmaxUkernelConfig(target, entryPointFn, genericOp,
+                                                ukernelConfig))) {
       LDBG("Argmax Ukernel Config");
       return success();
     }
@@ -2435,10 +2422,6 @@ static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
         LDBG("Pack Config");
         return setPackConfig(target, entryPointFn, packOp);
       })
-      .Case<IREE::Codegen::UKernelOpInterface>([&](auto ukernelOp) {
-        LDBG("Ukernel Config");
-        return setUKernelConfig(entryPointFn, ukernelOp);
-      })
       .Case<IREE::LinalgExt::CustomOp>([&](auto customOp) {
         LDBG("CustomOp Config");
         return setDefaultCustomOpLoweringConfig(entryPointFn, customOp,
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.cpp
@@ -6,6 +6,7 @@
 
 #include "iree/compiler/Codegen/LLVMGPU/ROCDLKernelConfig.h"
 
+#include "compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.h"
 #include "iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenAttrs.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUAttrs.h"
 #include "iree/compiler/Codegen/Dialect/GPU/TargetUtils/ConfigUtils.h"
@@ -272,8 +273,9 @@ setWarpReductionConfig(IREE::GPU::TargetAttr target,
 static LogicalResult setRootConfig(IREE::GPU::TargetAttr target,
                                    mlir::FunctionOpInterface entryPointFn,
                                    Operation *computeOp) {
+  IREE::GPU::UKernelConfigAttr ukernelConfig = selectUKernel(computeOp);
   if (succeeded(setDataTiledMultiMmaLoweringConfig(target, entryPointFn,
-                                                   computeOp))) {
+                                                   computeOp, ukernelConfig))) {
     return success();
   }
   if (auto linalgOp = dyn_cast<linalg::LinalgOp>(computeOp)) {
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.cpp
@@ -5,6 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
 #include "iree/compiler/Codegen/LLVMGPU/Utils/LLVMGPUSelectUKernels.h"
+#include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.h"
 #include "iree/compiler/Codegen/Utils/GPUUtils.h"
 #include "iree/compiler/Codegen/Utils/Utils.h"
 #include "iree/compiler/Utils/EmbeddedDataDirectory.h"
@@ -18,8 +19,13 @@ namespace mlir::iree_compiler {
 
 namespace {
 
+struct UKernelNameAndSuffix {
+  std::string name;
+  std::string suffix;
+};
+
 // Returns ukernel name and suffix for argmax. Empty name = no ukernel.
-static std::tuple<std::string, std::string>
+static UKernelNameAndSuffix
 getUKernelNameAndSuffixForArgmax(linalg::GenericOp op) {
   Value input = op.getDpsInputOperand(0)->get();
   auto inputType = cast<ShapedType>(input.getType());
@@ -29,13 +35,34 @@ getUKernelNameAndSuffixForArgmax(linalg::GenericOp op) {
                                   indexType.getElementType())};
 }
 
+// Returns ukernel name and suffix for multi_mma. Empty name = no ukernel.
+static UKernelNameAndSuffix
+getUKernelNameAndSuffixForMultiMma(IREE::GPU::MultiMmaOp op) {
+  auto mma = dyn_cast<IREE::GPU::DataTiledMMAAttr>(op.getKind());
+  if (!mma) {
+    return {}; // Only handling DataTiledMMAAttr for now.
+  }
+  std::string suffix{
+      stringifyMMAIntrinsic(mma.getIntrinsic().getValue()).lower()};
+  if (mma.getUnrollM() != 1 || mma.getUnrollN() != 1 || mma.getUnrollK() != 1) {
+    suffix += llvm::formatv("_unroll{}x{}x{}", mma.getUnrollM(),
+                            mma.getUnrollN(), mma.getUnrollK());
+  }
+  if (mma.getSubgroupsM() != 1 || mma.getSubgroupsN() != 1) {
+    suffix += llvm::formatv("_subgroups{}x{}", mma.getSubgroupsM(),
+                            mma.getSubgroupsN());
+  }
+  return {"multi_mma", suffix};
+}
+
 // Returns ukernel name and suffix for any op. Empty name = no ukernel.
-static std::tuple<std::string, std::string>
-getUKernelNameAndSuffix(Operation *op) {
+static UKernelNameAndSuffix getUKernelNameAndSuffix(Operation *op) {
   if (auto genericOp = dyn_cast<linalg::GenericOp>(op)) {
     if (succeeded(isArgmaxOp(genericOp))) {
       return getUKernelNameAndSuffixForArgmax(genericOp);
     }
+  } else if (auto multiMmaOp = dyn_cast<IREE::GPU::MultiMmaOp>(op)) {
+    return getUKernelNameAndSuffixForMultiMma(multiMmaOp);
   }
   return {};
 }
@@ -44,7 +71,7 @@ getUKernelNameAndSuffix(Operation *op) {
 static IREE::GPU::UKernelConfigAttr getUKernelConfig(Operation *op) {
   MLIRContext *context = op->getContext();
   auto [name, suffix] = getUKernelNameAndSuffix(op);
-  if (name.empty() || suffix.empty()) {
+  if (name.empty()) {
     return {};
   }
   auto target = IREE::HAL::ExecutableTargetAttr::lookup(op);