|
| 1 | +// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx942 \ |
| 2 | +// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(iree-hal-configure-target-executable-variants{target=rocm})))" \ |
| 3 | +// RUN: --iree-hip-enable-tensor-ukernels \ |
| 4 | +// RUN: --verify-diagnostics %s | FileCheck %s |
| 5 | + |
| 6 | +// Checks that, with tensor ukernels enabled on gfx942, the f8 matmul-like |
| 7 | +// linalg.generic below is matched and annotated with the |
| 8 | +// "pingpong_medium_f8_expanded" tensor ukernel descriptor, that the ukernel |
| 9 | +// body is materialized as a private util.func using iree_codegen.inner_tiled, |
| 10 | +// and that the dispatch gets the TileAndFuse pipeline translation_info. |
| 11 | + |
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> |
| 9 | +#map2 = affine_map<(d0, d1, d2, d3) -> (d2, d3)> |
| 10 | +#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> |
| 11 | +#pipeline_layout = #hal.pipeline.layout<bindings = [ |
| 12 | + #hal.pipeline.binding<storage_buffer>, |
| 13 | + #hal.pipeline.binding<storage_buffer>, |
| 14 | + #hal.pipeline.binding<storage_buffer> |
| 15 | +]> |
| 16 | +hal.executable public @main { |
| 17 | + hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) { |
| 18 | + hal.executable.export public @matmul_f8 ordinal(0) layout(#pipeline_layout) count(%arg0: !hal.device) -> (index, index, index) { |
| 19 | + %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice |
| 20 | + hal.return %x, %y, %z : index, index, index |
| 21 | + } |
| 22 | + builtin.module { |
| 23 | + func.func @matmul_f8() { |
| 24 | + %cst = arith.constant 0.000000e+00 : f32 |
| 25 | + %c0 = arith.constant 0 : index |
| 26 | + %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1x128x4096xf8E4M3FNUZ>> |
| 27 | + %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1024x4096xf8E4M3FNUZ>> |
| 28 | + %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<1x128x1024xf32>> |
| 29 | + %3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [1, 128, 4096], strides = [1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1x128x4096xf8E4M3FNUZ>> -> tensor<1x128x4096xf8E4M3FNUZ> |
| 30 | + %4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 4096], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<1024x4096xf8E4M3FNUZ>> -> tensor<1024x4096xf8E4M3FNUZ> |
| 31 | + %5 = tensor.empty() : tensor<1x128x1024xf32> |
| 32 | + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1x128x1024xf32>) -> tensor<1x128x1024xf32> |
| 33 | + %7 = linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<1x128x4096xf8E4M3FNUZ>, tensor<1024x4096xf8E4M3FNUZ>) outs(%6 : tensor<1x128x1024xf32>) { |
| 34 | + ^bb0(%in: f8E4M3FNUZ, %in_4: f8E4M3FNUZ, %out: f32): |
| 35 | + %12 = arith.extf %in : f8E4M3FNUZ to f32 |
| 36 | + %13 = arith.extf %in_4 : f8E4M3FNUZ to f32 |
| 37 | + %14 = arith.mulf %12, %13 : f32 |
| 38 | + %15 = arith.addf %out, %14 : f32 |
| 39 | + linalg.yield %15 : f32 |
| 40 | + } -> tensor<1x128x1024xf32> |
| 41 | + iree_tensor_ext.dispatch.tensor.store %7, %2, offsets = [0, 0, 0], sizes = [1, 128, 1024], strides = [1, 1, 1] : tensor<1x128x1024xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<1x128x1024xf32>> |
| 42 | + return |
| 43 | + } |
| 44 | + } |
| 45 | + } |
| 46 | +} |
| 47 | +// CHECK: #[[TRANSLATION:.+]] = #iree_codegen.translation_info<pipeline = LLVMGPUTileAndFuse workgroup_size = [512, 1, 1] subgroup_size = 64 |
| 48 | +// CHECK: func.func @matmul_f8 |
| 49 | +// CHECK-SAME: translation_info = #[[TRANSLATION]] |
| 50 | +// CHECK: linalg.generic |
| 51 | +// CHECK-SAME: iree_codegen.ukernel = #iree_codegen.ukernel_descriptor<"pingpong_medium_f8_expanded", tensor> |
| 52 | +// CHECK-SAME: lowering_config = #iree_gpu.lowering_config |
| 53 | +// CHECK: util.func private @pingpong_medium_f8_expanded |
| 54 | +// CHECK: iree_codegen.inner_tiled |
0 commit comments