Skip to content

Commit d2e0fdd

Browse files
authored
[LLVMGPU][DT] Add MaterializeDeviceEncodingPass to LLVMGPU passes behind flag (#19849)
This PR adds the `MaterializeDeviceEncodingPass` to LLVMGPU/Passes.cpp behind a new flag `iree-llvmgpu-experimental-data-tiling`. The flag's default value is false, because the codegen for data tiling ops on GPU is not yet functional or performant for all cases. Some of the work is in flight, but it will likely take some time before data tiling codegen is ready to be flipped on by default. For now, the flag allows developers to enable the late materialization codegen path on LLVMGPU. To effectively use the late materialization path for data-tiling fusion, some additional non-default flags need to be set: - `--iree-opt-data-tiling=false` (to turn off the early materialization data tiling path) - `--iree-dispatch-creation-experimental-data-tiling=true` (to turn on the late materialization data tiling path) - `--iree-dispatch-creation-pad-factor=128` (the current default is based on CPU materialization) This PR also includes a small fix to the ROCDLKernelConfig logic for selecting the root op when there are pack/unpack ops in the dispatch. The fix avoids selecting pack and unpack ops as root ops when possible. --------- Signed-off-by: Max Dawkins <[email protected]>
1 parent 5492301 commit d2e0fdd

File tree

4 files changed

+52
-1
lines changed

4 files changed

+52
-1
lines changed

compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,12 @@ static llvm::cl::opt<bool> clLLVMGPUEnableSharedMemoryReuse(
7474
"Enable shared memory reuse in the vector distribute pipeline"),
7575
llvm::cl::init(false));
7676

77+
static llvm::cl::opt<bool> clLLVMGPUEnableExperimentalDataTiling(
78+
"iree-llvmgpu-experimental-data-tiling",
79+
llvm::cl::desc("Enables late data-tiling materialization for LLVMGPU "
80+
"(experimental)."),
81+
llvm::cl::init(false));
82+
7783
static llvm::cl::opt<bool> clDistributeToWorkgroupsUsingForall(
7884
"iree-llvmgpu-test-distribute-to-workgroups-using-forall",
7985
llvm::cl::desc("Use scf.forall for distribution to workgroups"),
@@ -1165,11 +1171,15 @@ static void buildLLVMGPUCodegenConfigurationPassPipelineImpl(
11651171
OpPassManager &modulePassManager) {
11661172
{
11671173
FunctionLikeNest funcPassManager(modulePassManager);
1174+
if (clLLVMGPUEnableExperimentalDataTiling) {
1175+
funcPassManager.addPass(createMaterializeDeviceEncodingPass);
1176+
} else {
1177+
addEncodingToPaddingPasses(funcPassManager);
1178+
}
11681179
funcPassManager.addPass(createGPUGeneralizeNamedOpsPass);
11691180
addCommonTargetExecutablePreprocessingPasses(funcPassManager);
11701181
// This materializes into 'nop' in the absence of pad encoding layout
11711182
// attributes.
1172-
addEncodingToPaddingPasses(funcPassManager);
11731183
funcPassManager.addPass(createBlockDynamicDimensionsPass);
11741184
funcPassManager.addPass(createConfigTrackingCanonicalizerPass);
11751185
funcPassManager.addPass(createCSEPass);

compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ iree_lit_test_suite(
3535
"config_winograd.mlir",
3636
"extract_address_computation_gpu.mlir",
3737
"gpu_set_num_workgroups.mlir",
38+
"gpu_pipeline_data_tiling.mlir",
3839
"gpu_pipeline_generalize_named_ops.mlir",
3940
"horizontal_fusion_pipeline.mlir",
4041
"link_executables.mlir",

compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ iree_lit_test_suite(
3131
"distribute_to_thread.mlir"
3232
"elementwise_pipeline.mlir"
3333
"extract_address_computation_gpu.mlir"
34+
"gpu_pipeline_data_tiling.mlir"
3435
"gpu_pipeline_generalize_named_ops.mlir"
3536
"gpu_set_num_workgroups.mlir"
3637
"horizontal_fusion_pipeline.mlir"
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
// RUN: iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-llvmgpu-configuration-pipeline))))" \
2+
// RUN: --iree-gpu-test-target=gfx942 --iree-llvmgpu-experimental-data-tiling \
3+
// RUN: --split-input-file %s | FileCheck %s
4+
5+
// Make sure that the GPU configuration pipelines materialize encoding ops.
6+
7+
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = int8, storage = b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_I32_16x16x32_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
8+
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
9+
#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
10+
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
11+
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
12+
#encoding = #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [i8, i8, i32], user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 128, 128, 128>>
13+
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
14+
hal.executable private @executable {
15+
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
16+
hal.executable.export public @export ordinal(0) layout(#pipeline_layout) {
17+
^bb0(%arg0: !hal.device):
18+
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
19+
hal.return %x, %y, %z : index, index, index
20+
}
21+
builtin.module {
22+
func.func @set_encoding() {
23+
%c0 = arith.constant 0 : index
24+
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32768x1280xi8>>
25+
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<32768x1280xi8, #encoding>>
26+
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32768, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32768x1280xi8>> -> tensor<32768x1280xi8>
27+
%3 = iree_encoding.set_encoding %2 : tensor<32768x1280xi8> -> tensor<32768x1280xi8, #encoding>
28+
flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [32768, 1280], strides = [1, 1] : tensor<32768x1280xi8, #encoding> -> !flow.dispatch.tensor<writeonly:tensor<32768x1280xi8, #encoding>>
29+
return
30+
}
31+
}
32+
}
33+
}
34+
}
35+
36+
// CHECK: @set_encoding()
37+
// CHECK: linalg.pack
38+
// CHECK: tensor.expand_shape
39+
// CHECK: linalg.generic

0 commit comments

Comments
 (0)