[ROCM][DT] Add encoding specialization infra for data-tiled ukernels (iree-org#21914)

jtuyls · web-flow · commit 3aa9c8040848 · 2025-09-12T10:24:15.000+02:00
Passes the `UKernelProvider` to the encoding resolver so that it can be
used to choose the data layouts for specialization and materialization.
The `UKernelProviderInterface` gets a new `getDataLayoutForUKernel`
method, which is responsible for returning a data layout attribute based
on the encoding and target configuration.

Signed-off-by: Jorn Tuyls <jorn.tuyls@gmail.com>
diff --git a/compiler/plugins/target/ROCM/Dialect/ROCM/IR/BUILD.bazel b/compiler/plugins/target/ROCM/Dialect/ROCM/IR/BUILD.bazel
@@ -64,6 +64,7 @@ iree_compiler_cc_library(
         "//compiler/src/iree/compiler/Codegen/Dialect/GPU/IR:IREEGPUDialect",
         "//compiler/src/iree/compiler/Codegen/Dialect/GPU/Transforms:GPUTransforms",
         "//compiler/src/iree/compiler/Codegen/Utils",
+        "//compiler/src/iree/compiler/Dialect/Encoding/IR",
         "//compiler/src/iree/compiler/Dialect/HAL/IR",
         "//compiler/src/iree/compiler/Dialect/Util/IR",
         "//compiler/src/iree/compiler/Utils",
@@ -78,6 +79,7 @@ iree_compiler_cc_library(
         "@llvm-project//mlir:GPUUtils",
         "@llvm-project//mlir:IR",
         "@llvm-project//mlir:LinalgDialect",
+        "@llvm-project//mlir:LinalgInterfaces",
         "@llvm-project//mlir:Parser",
         "@llvm-project//mlir:Support",
     ],
diff --git a/compiler/plugins/target/ROCM/Dialect/ROCM/IR/CMakeLists.txt b/compiler/plugins/target/ROCM/Dialect/ROCM/IR/CMakeLists.txt
@@ -38,13 +38,15 @@ iree_cc_library(
     MLIRGPUUtils
     MLIRIR
     MLIRLinalgDialect
+    MLIRLinalgInterfacesIncGenLib
     MLIRParser
     MLIRSupport
     iree::compiler::Codegen::Dialect::Codegen::IR::IREECodegenDialect
     iree::compiler::Codegen::Dialect::Codegen::Utils
     iree::compiler::Codegen::Dialect::GPU::IR::IREEGPUDialect
     iree::compiler::Codegen::Dialect::GPU::Transforms::GPUTransforms
     iree::compiler::Codegen::Utils
+    iree::compiler::Dialect::Encoding::IR
     iree::compiler::Dialect::HAL::IR
     iree::compiler::Dialect::Util::IR
     iree::compiler::Utils
diff --git a/compiler/plugins/target/ROCM/Dialect/ROCM/IR/ROCMAttrs.cpp b/compiler/plugins/target/ROCM/Dialect/ROCM/IR/ROCMAttrs.cpp
@@ -11,6 +11,7 @@
 #include "iree/compiler/Codegen/Dialect/GPU/IR/GPUTileSwizzleUtils.h"
 #include "iree/compiler/Codegen/Dialect/GPU/IR/IREEGPUOps.h"
 #include "iree/compiler/Codegen/Utils/GPUUtils.h"
+#include "iree/compiler/Dialect/Encoding/IR/EncodingTypes.h"
 #include "iree/compiler/Dialect/HAL/IR/HALTypes.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/ExecutionEngine/ExecutionEngine.h"
@@ -20,6 +21,7 @@
 #include "mlir/Dialect/Bufferization/IR/Bufferization.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/Linalg/IR/LinalgInterfaces.h"
 #include "mlir/Dialect/Utils/IndexingUtils.h"
 #include "mlir/IR/AsmState.h"
 #include "mlir/IR/Attributes.h"
@@ -463,6 +465,48 @@ std::optional<LogicalResult> UKernelProviderAttr::createAndReplaceWithUkernelOp(
   return std::nullopt;
 }
 
+//===---------------------------------------------------------------------===//
+// rocm.tensor_ukernel_provider
+//===---------------------------------------------------------------------===//
+
+FailureOr<Operation *> // Looks up the op named |name| in the symbol table nearest to |annotationSite|.
+TensorUKernelProviderAttr::getMLIRUKernel(StringRef name, DictionaryAttr, // The DictionaryAttr parameter is unnamed and unused here.
+                                          Operation *annotationSite) const {
+  auto *symbolTableOp = SymbolTable::getNearestSymbolTable(annotationSite); // NOTE(review): no null check on the result; presumably callers guarantee an enclosing symbol table — confirm.
+  SymbolTable symbolTable(symbolTableOp);
+  return symbolTable.lookup(name); // Lookup result is returned unchecked. NOTE(review): a missing symbol presumably yields a non-failed FailureOr holding nullptr — confirm callers test for that.
+}
+
+Attribute TensorUKernelProviderAttr::getDataLayoutForUKernel( // Returns a data layout attribute for |encoding|, or a null Attribute when any check below fails.
+    Attribute encoding, DictionaryAttr targetConfiguration) const {
+  auto encodingAttr =
+      dyn_cast_if_present<IREE::Encoding::EncodingAttr>(encoding); // Only IREE::Encoding::EncodingAttr is handled; a null or different attribute yields a null result.
+  if (!encodingAttr) {
+    return {};
+  }
+  IREE::GPU::TargetAttr targetAttr = getGPUTargetAttr(targetConfiguration);
+  if (!targetAttr || targetAttr.getArch() != "gfx942") { // Bail out unless a GPU target is present and its arch string is exactly "gfx942".
+    return {};
+  }
+  ArrayAttr indexingMapsAttr = encodingAttr.getUserIndexingMaps();
+  if (!indexingMapsAttr) { // Encodings without user indexing maps are rejected; the maps are not otherwise used in this function.
+    return {};
+  }
+  if (failed(linalg::inferContractionDims(encodingAttr.getRootMaps()))) { // The encoding's root maps must be recognizable as a contraction.
+    return {};
+  }
+  SmallVector<Type> types = encodingAttr.getElementTypesArray();
+  Type f16 = Float16Type::get(encoding.getContext());
+  Type f32 = Float32Type::get(encoding.getContext());
+  if (types.size() != 3 || types[0] != f16 || types[1] != f16 || // Require exactly three element types, in order: f16, f16, f32.
+      types[2] != f32) {
+    return {};
+  }
+  return IREE::GPU::DataTiledMMAAttr::get( // Hard-coded layout built on the MFMA_F32_16x16x16_F16 intrinsic.
+      encoding.getContext(), IREE::GPU::MMAIntrinsic::MFMA_F32_16x16x16_F16, 8,
+      2, 4, 4, 1); // Presumably (intrinsics_m, subgroups_m, intrinsics_n, subgroups_n, intrinsics_k) — TODO confirm against DataTiledMMAAttr::get.
+}
+
 //===----------------------------------------------------------------------===//
 // Attribute Registration
 //===----------------------------------------------------------------------===//
diff --git a/compiler/plugins/target/ROCM/Dialect/ROCM/IR/ROCMAttrs.td b/compiler/plugins/target/ROCM/Dialect/ROCM/IR/ROCMAttrs.td
@@ -53,4 +53,23 @@ def ROCM_UKernelProviderAttr  :
   let assemblyFormat = [{}];
 }
 
+//===---------------------------------------------------------------------===//
+// rocm.tensor_ukernel_provider
+//===---------------------------------------------------------------------===//
+
+def ROCM_TensorUKernelProviderAttr  :
+    AttrDef<ROCM_Dialect, "TensorUKernelProvider", [
+    DeclareAttrInterfaceMethods<IREECodegen_UKernelProviderInterface, [
+      "getDataLayoutForUKernel",
+      "getMLIRUKernel",
+      ]>
+    ]> {
+  let mnemonic = "tensor_ukernel_provider";
+  let summary = [{
+    An attribute that provides context specific tensor ukernel implementations for ROCM.
+  }];
+  let parameters = (ins);
+  let assemblyFormat = [{}];
+}
+
 #endif // IREE_PLUGINS_TARGET_ROCM_DIALECT_ROCMATTRS
diff --git a/compiler/plugins/target/ROCM/ROCMTarget.cpp b/compiler/plugins/target/ROCM/ROCMTarget.cpp
@@ -354,7 +354,7 @@ class ROCMTargetBackend final : public TargetBackend {
 
     if (options.enableTensorUKernels) {
       addConfig(kUKernelProviderName,
-                IREE::Codegen::SymbolicUKernelProviderAttr::get(context));
+                IREE::ROCM::TensorUKernelProviderAttr::get(context));
     }
 
     return b.getAttr<IREE::HAL::ExecutableTargetAttr>(
diff --git a/compiler/plugins/target/ROCM/test/BUILD.bazel b/compiler/plugins/target/ROCM/test/BUILD.bazel
@@ -20,8 +20,10 @@ iree_lit_test_suite(
         "default_tuning_specs_amdgpu.mlir",
         "enable_tensor_ukernels.mlir",
         "gpu_encoding_attrs.mlir",
+        "lower_rocm_tensor_ukernel_descriptor.mlir",
         "lower_rocm_ukernel_descriptor.mlir",
         "lowering_strategy_from_tuning_spec.mlir",
+        "materialize_encoding_ukernel_gfx942.mlir",
         "ukernel_pipeline_transform.mlir",
     ],
     cfg = "//compiler:lit.cfg.py",
diff --git a/compiler/plugins/target/ROCM/test/CMakeLists.txt b/compiler/plugins/target/ROCM/test/CMakeLists.txt
@@ -20,8 +20,10 @@ iree_lit_test_suite(
     "default_tuning_specs_amdgpu.mlir"
     "enable_tensor_ukernels.mlir"
     "gpu_encoding_attrs.mlir"
+    "lower_rocm_tensor_ukernel_descriptor.mlir"
     "lower_rocm_ukernel_descriptor.mlir"
     "lowering_strategy_from_tuning_spec.mlir"
+    "materialize_encoding_ukernel_gfx942.mlir"
     "ukernel_pipeline_transform.mlir"
   TOOLS
     FileCheck
diff --git a/compiler/plugins/target/ROCM/test/lower_rocm_tensor_ukernel_descriptor.mlir b/compiler/plugins/target/ROCM/test/lower_rocm_tensor_ukernel_descriptor.mlir
@@ -0,0 +1,33 @@
+// RUN: iree-opt --iree-codegen-lower-tensor-ukernels --split-input-file --verify-diagnostics %s | FileCheck %s
+
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree_codegen.ukernel_provider = #rocm.tensor_ukernel_provider}>
+#map = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+module attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+  func.func private @ukernel_impl(tensor<16x32xf32>, tensor<16x32xf32>, tensor<16x16xf32>) -> tensor<16x16xf32>
+  func.func @test(%arg0: tensor<16x32xf32>, %arg1: tensor<16x32xf32>, %arg2: tensor<16x16xf32>) -> tensor<16x16xf32> {
+    %0 = call @ukernel_impl(%arg0, %arg1, %arg2) : (tensor<16x32xf32>, tensor<16x32xf32>, tensor<16x16xf32>) -> tensor<16x16xf32>
+    return %0 : tensor<16x16xf32>
+  }
+  func.func @replace_generic_with_ukernel_impl(%arg0: tensor<16x32xf32>, %arg1: tensor<16x32xf32>) -> tensor<16x16xf32> {
+    %cst = arith.constant 0.000000e+00 : f32
+    %0 = tensor.empty() : tensor<16x16xf32>
+    %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<16x16xf32>) -> tensor<16x16xf32>
+    %2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<16x32xf32>, tensor<16x32xf32>) outs(%1 : tensor<16x16xf32>) attrs =  {iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"test", tensor>} {
+    ^bb0(%in: f32, %in_0: f32, %out: f32):
+      %3 = arith.mulf %in, %in_0 : f32
+      %4 = arith.addf %out, %3 : f32
+      linalg.yield %4 : f32
+    } -> tensor<16x16xf32>
+    return %2 : tensor<16x16xf32>
+  }
+}
+// CHECK-LABEL: @ukernel_impl
+// CHECK-LABEL: @replace_generic_with_ukernel_impl
+// CHECK-SAME:    %[[LHS:[a-zA-Z0-9]+]]: tensor<16x32xf32>
+// CHECK-SAME:    %[[RHS:[a-zA-Z0-9]+]]: tensor<16x32xf32>
+// CHECK-NOT:     linalg.generic
+// CHECK:         %[[OUT:.+]] = linalg.fill
+// CHECK:         %[[CALL:.+]] = call @ukernel_impl(%[[LHS]], %[[RHS]], %[[OUT]]) : (tensor<16x32xf32>, tensor<16x32xf32>, tensor<16x16xf32>) -> tensor<16x16xf32>
+// CHECK:         return %[[CALL]]
diff --git a/compiler/plugins/target/ROCM/test/materialize_encoding_ukernel_gfx942.mlir b/compiler/plugins/target/ROCM/test/materialize_encoding_ukernel_gfx942.mlir
@@ -0,0 +1,81 @@
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-materialize-device-encoding))" --split-input-file %s | FileCheck %s
+
+// Note the ukernel provider being specified in the executable target. This should be used to determine the data tiling.
+
+#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {
+  abi = "hip",
+  iree.encoding.resolver = #iree_gpu.gpu_encoding_resolver<>,
+  iree_codegen.target_info = #iree_gpu.target<
+    arch = "gfx942",
+    features = "",
+    wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8,
+    storage =  b64|b32|b16|b8,
+    subgroup =  shuffle|arithmetic,
+    dot =  dp4xi8toi32,
+    mma = [<MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>,
+           <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>,
+           <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>,
+           <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>,
+           <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>,
+           <MFMA_I32_32x32x16_I8>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x4_F32>,
+           <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>
+          ],
+    subgroup_size_choices = [64],
+    max_workgroup_sizes = [1024, 1024, 1024],
+    max_thread_count_per_workgroup = 1024,
+    max_workgroup_memory_bytes = 65536,
+    max_workgroup_counts = [2147483647, 2147483647, 2147483647],
+    max_load_instruction_bits = 128,
+    simds_per_wgp = 4,
+    vgpr_space_bits = 16384>
+  >,
+  iree_codegen.ukernel_provider = #rocm.tensor_ukernel_provider,
+  ukernels = "none"
+}>
+
+#map = affine_map<(d0, d1, d2) -> (d0, d2)>
+#map1 = affine_map<(d0, d1, d2) -> (d2, d1)>
+#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
+#encoding_lhs = #iree_encoding.encoding<operand_index = 0, op_type = matmul, element_types = [f16, f16, f32], user_indexing_maps = [#map, #map1, #map2], iteration_sizes = [?, ?, ?]>
+#encoding_rhs = #iree_encoding.encoding<operand_index = 1, op_type = matmul, element_types = [f16, f16, f32], user_indexing_maps = [#map, #map1, #map2], iteration_sizes = [?, ?, ?]>
+#encoding_result = #iree_encoding.encoding<operand_index = 2, op_type = matmul, element_types = [f16, f16, f32], user_indexing_maps = [#map, #map1, #map2], iteration_sizes = [?, ?, ?]>
+#pipeline_layout_3 = #hal.pipeline.layout<constants = 3, bindings = [
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>,
+  #hal.pipeline.binding<storage_buffer>
+]>
+
+func.func @matmul_lowering_ukernel_provider() attributes {hal.executable.target = #executable_target_rocm_hsaco_fb} {
+  %c0 = arith.constant 0 : index
+  %M = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(0) : index
+  %N = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(1) : index
+  %K = hal.interface.constant.load layout(#pipeline_layout_3) ordinal(2) : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(0) alignment(64) offset(%c0)
+      : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?xf16, #encoding_lhs>>{%M, %K}
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(1) alignment(64) offset(%c0)
+      : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?xf16, #encoding_rhs>>{%K, %N}
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout_3) binding(2) alignment(64) offset(%c0)
+      : !iree_tensor_ext.dispatch.tensor<readwrite:tensor<?x?xf32, #encoding_result>>{%M, %N}
+  %3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0], sizes = [%M, %K], strides = [1, 1]
+      : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?xf16, #encoding_lhs>>{%M, %K}
+      -> tensor<?x?xf16, #encoding_lhs>
+  %4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0], sizes = [%K, %N], strides = [1, 1]
+      : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?xf16, #encoding_rhs>>{%K, %N}
+      -> tensor<?x?xf16, #encoding_rhs>
+  %5 = iree_tensor_ext.dispatch.tensor.load %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1]
+      : !iree_tensor_ext.dispatch.tensor<readwrite:tensor<?x?xf32, #encoding_result>>{%M, %N}
+      -> tensor<?x?xf32, #encoding_result>
+  %6 = linalg.matmul
+      ins(%3, %4 : tensor<?x?xf16, #encoding_lhs>,
+                   tensor<?x?xf16, #encoding_rhs>)
+      outs(%5 : tensor<?x?xf32, #encoding_result>)
+      -> tensor<?x?xf32, #encoding_result>
+  iree_tensor_ext.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [%M, %N], strides = [1, 1]
+      : tensor<?x?xf32, #encoding_result>
+      -> !iree_tensor_ext.dispatch.tensor<readwrite:tensor<?x?xf32, #encoding_result>>{%M, %N}
+  return
+}
+// CHECK-LABEL: matmul_lowering_ukernel_provider
+// CHECK:      iree_codegen.inner_tiled
+// CHECK-SAME:     iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>, #linalg.iterator_type<reduction>]
+// CHECK-SAME:     kind = #iree_gpu.data_tiled_mma_layout<intrinsic = MFMA_F32_16x16x16_F16, intrinsics_m = 8, subgroups_m = 2, intrinsics_n = 4, subgroups_n = 4>
diff --git a/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenInterfaces.td b/compiler/src/iree/compiler/Codegen/Dialect/Codegen/IR/IREECodegenInterfaces.td
@@ -575,6 +575,25 @@ def IREECodegen_UKernelProviderInterface :
         return failure();
       }]
     >,
+    InterfaceMethod<
+      /*desc=*/[{
+        Returns a data layout attribute for the provided |encoding| and
+        |target_configuration|.
+
+        The return type is deliberately '::mlir::Attribute' to accommodate
+        various data layout specifications. Callers, such as encoding
+        resolvers, are expected to handle a range of possible attributes
+        and gracefully manage cases where an unsupported attribute is returned.
+      }],
+      /*retTy=*/"::mlir::Attribute",
+      /*methodName=*/"getDataLayoutForUKernel",
+      /*args=*/(ins "::mlir::Attribute":$encoding,
+                    "::mlir::DictionaryAttr":$target_configuration),
+      /*methodBody=*/"",
+      /*defaultImplementation=*/[{
+        return {};
+      }]
+    >,
   ];
 }
 
diff --git a/compiler/src/iree/compiler/Codegen/ExternalInterfaces/GPUEncodingExternalModels.cpp b/compiler/src/iree/compiler/Codegen/ExternalInterfaces/GPUEncodingExternalModels.cpp

Original file line number	Diff line number	Diff line change
`@@ -354,7 +354,7 @@ class ROCMTargetBackend final : public TargetBackend {`
`354`	`354`
`355`	`355`	`if (options.enableTensorUKernels) {`
`356`	`356`	`addConfig(kUKernelProviderName,`
`357`		`- IREE::Codegen::SymbolicUKernelProviderAttr::get(context));`
	`357`	`+ IREE::ROCM::TensorUKernelProviderAttr::get(context));`
`358`	`358`	`}`
`359`	`359`
`360`	`360`	`return b.getAttr<IREE::HAL::ExecutableTargetAttr>(`