Skip to content

Commit d2e0fdd

Browse files
authored
[LLVMGPU][DT] Add MaterializeDeviceEncodingPass to LLVMGPU passes behind flag (#19849)
This PR adds the `MaterializeDeviceEncodingPass` to LLVMGPU/Passes.cpp behind a new flag `iree-llvmgpu-experimental-data-tiling`. The flag's default value is false, because the codegen for data tiling ops on GPU is not yet functional or performant for all cases. Some of the work is in flight, but it will likely take some time before data tiling codegen is ready to be flipped on by default. For now, the flag allows developers to enable the late materialization codegen path on LLVMGPU. To effectively use the late materialization path for data-tiling fusion, some additional non-default flags need to be set: - `--iree-opt-data-tiling=false` (to turn off the early materialization data tiling path) - `--iree-dispatch-creation-experimental-data-tiling=true` (to turn on the late materialization data tiling path) - `--iree-dispatch-creation-pad-factor=128` (the current default is based on CPU materialization) This PR also includes a small fix to the ROCDLKernelConfig logic for selecting the root op when there are pack/unpack ops in the dispatch. The fix avoids selecting pack and unpack ops as root ops when possible. --------- Signed-off-by: Max Dawkins <[email protected]>
1 parent 5492301 commit d2e0fdd

File tree

4 files changed

+52
-1
lines changed

4 files changed

+52
-1
lines changed

compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,12 @@ static llvm::cl::opt<bool> clLLVMGPUEnableSharedMemoryReuse(
7474
"Enable shared memory reuse in the vector distribute pipeline"),
7575
llvm::cl::init(false));
7676

77+
static llvm::cl::opt<bool> clLLVMGPUEnableExperimentalDataTiling(
78+
"iree-llvmgpu-experimental-data-tiling",
79+
llvm::cl::desc("Enables late data-tiling materialization for LLVMGPU "
80+
"(experimental)."),
81+
llvm::cl::init(false));
82+
7783
static llvm::cl::opt<bool> clDistributeToWorkgroupsUsingForall(
7884
"iree-llvmgpu-test-distribute-to-workgroups-using-forall",
7985
llvm::cl::desc("Use scf.forall for distribution to workgroups"),
@@ -1165,11 +1171,15 @@ static void buildLLVMGPUCodegenConfigurationPassPipelineImpl(
11651171
OpPassManager &modulePassManager) {
11661172
{
11671173
FunctionLikeNest funcPassManager(modulePassManager);
1174+
if (clLLVMGPUEnableExperimentalDataTiling) {
1175+
funcPassManager.addPass(createMaterializeDeviceEncodingPass);
1176+
} else {
1177+
addEncodingToPaddingPasses(funcPassManager);
1178+
}
11681179
funcPassManager.addPass(createGPUGeneralizeNamedOpsPass);
11691180
addCommonTargetExecutablePreprocessingPasses(funcPassManager);
11701181
// This materializes into 'nop' in the absence of pad encoding layout
11711182
// attributes.
1172-
addEncodingToPaddingPasses(funcPassManager);
11731183
funcPassManager.addPass(createBlockDynamicDimensionsPass);
11741184
funcPassManager.addPass(createConfigTrackingCanonicalizerPass);
11751185
funcPassManager.addPass(createCSEPass);

compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ iree_lit_test_suite(
3535
"config_winograd.mlir",
3636
"extract_address_computation_gpu.mlir",
3737
"gpu_set_num_workgroups.mlir",
38+
"gpu_pipeline_data_tiling.mlir",
3839
"gpu_pipeline_generalize_named_ops.mlir",
3940
"horizontal_fusion_pipeline.mlir",
4041
"link_executables.mlir",

compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ iree_lit_test_suite(
3131
"distribute_to_thread.mlir"
3232
"elementwise_pipeline.mlir"
3333
"extract_address_computation_gpu.mlir"
34+
"gpu_pipeline_data_tiling.mlir"
3435
"gpu_pipeline_generalize_named_ops.mlir"
3536
"gpu_set_num_workgroups.mlir"
3637
"horizontal_fusion_pipeline.mlir"
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
// RUN: iree-opt --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(iree-codegen-llvmgpu-configuration-pipeline))))" \
2+
// RUN: --iree-gpu-test-target=gfx942 --iree-llvmgpu-experimental-data-tiling \
3+
// RUN: --split-input-file %s | FileCheck %s
4+
5+
// Make sure that the GPU configuration pipelines materialize encoding ops.
6+
7+
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = int8, storage = b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_I32_16x16x32_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none"}>
8+
#map = affine_map<(d0, d1, d2) -> (d0, d2)>
9+
#map1 = affine_map<(d0, d1, d2) -> (d1, d2)>
10+
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
11+
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
12+
#encoding = #iree_encoding.encoding<operand_index = 0 : index, op_type = matmul, element_types = [i8, i8, i32], user_indexing_maps = [#map, #map1, #map2], round_dims_to = array<i64: 128, 128, 128>>
13+
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
14+
hal.executable private @executable {
15+
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
16+
hal.executable.export public @export ordinal(0) layout(#pipeline_layout) {
17+
^bb0(%arg0: !hal.device):
18+
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
19+
hal.return %x, %y, %z : index, index, index
20+
}
21+
builtin.module {
22+
func.func @set_encoding() {
23+
%c0 = arith.constant 0 : index
24+
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32768x1280xi8>>
25+
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<32768x1280xi8, #encoding>>
26+
%2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [32768, 1280], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<32768x1280xi8>> -> tensor<32768x1280xi8>
27+
%3 = iree_encoding.set_encoding %2 : tensor<32768x1280xi8> -> tensor<32768x1280xi8, #encoding>
28+
flow.dispatch.tensor.store %3, %1, offsets = [0, 0], sizes = [32768, 1280], strides = [1, 1] : tensor<32768x1280xi8, #encoding> -> !flow.dispatch.tensor<writeonly:tensor<32768x1280xi8, #encoding>>
29+
return
30+
}
31+
}
32+
}
33+
}
34+
}
35+
36+
// CHECK: @set_encoding()
37+
// CHECK: linalg.pack
38+
// CHECK: tensor.expand_shape
39+
// CHECK: linalg.generic

0 commit comments

Comments
 (0)