
Commit fa98bda

[LLVMGPU] Add ROCDLLoadToTransposeLoadPass to TileAndFuse pipeline

Signed-off-by: Max Dawkins <[email protected]>
1 parent a08d24b

5 files changed: +486 −1 lines changed

compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp

Lines changed: 8 additions & 0 deletions
@@ -88,6 +88,11 @@ static llvm::cl::opt<bool> clCombineLayoutTransformation(
     llvm::cl::desc("Combine relayout ops during dispatch configuration"),
     llvm::cl::init(true), llvm::cl::Hidden);
 
+static llvm::cl::opt<bool> clROCDLLoadToTransposeLoad(
+    "iree-llvmgpu-test-load-to-transpose-load",
+    llvm::cl::desc("Enable amdgpu.transpose_load targeting for ROCDL"),
+    llvm::cl::init(true), llvm::cl::Hidden);
+
 static llvm::cl::opt<IREE::Codegen::WorkgroupId>
     clSetWorkgroupDistributionAlong(
         "iree-llvmgpu-set-workgroup-distribution-along",

@@ -580,6 +585,9 @@ void addGPUTileAndFusePassPipeline(OpPassManager &funcPassManager,
   funcPassManager.addPass(IREE::GPU::createUnrollToIntrinsicsPass());
   funcPassManager.addPass(createCanonicalizerPass());
   funcPassManager.addPass(createCSEPass());
+  if (forROCDL && clROCDLLoadToTransposeLoad) {
+    funcPassManager.addPass(createROCDLLoadToTransposeLoadPass());
+  }
 
   // Step 9. Remaining post-bufferization optimizations/lowerings.
   funcPassManager.addPass(createFlattenSwizzleHintAllocsPass());
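
The new step is guarded by forROCDL and by clROCDLLoadToTransposeLoad, an ordinary (hidden) llvm::cl::opt, so it can be toggled from the tool command line when triaging this change. A minimal sketch of a lit RUN line that disables it, reusing the pass pipeline from the test added below (the =false setting is illustrative; the default is true):

// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx950 \
// RUN:   --iree-llvmgpu-test-load-to-transpose-load=false \
// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-llvmgpu-lower-executable-target{for-rocdl=true})))))" %s

With the flag off (or when not targeting ROCDL), createROCDLLoadToTransposeLoadPass() is simply skipped and the shared-memory loads remain plain vector.transfer_read ops.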

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/BUILD.bazel

Lines changed: 1 addition & 0 deletions
@@ -36,6 +36,7 @@ iree_lit_test_suite(
         "pipeline_elementwise_f8fnuz.mlir",
         "pipeline_elementwise_f8ocp.mlir",
         "pipeline_igemm_tile_and_fuse.mlir",
+        "pipeline_igemm_tile_and_fuse_gfx950.mlir",
         "pipeline_lower_to_llvmgpu.mlir",
         "pipeline_scaled_truncation_gfx950.mlir",
         "pipeline_tile_and_fuse.mlir",

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -31,6 +31,7 @@ iree_lit_test_suite(
    "pipeline_elementwise_f8fnuz.mlir"
    "pipeline_elementwise_f8ocp.mlir"
    "pipeline_igemm_tile_and_fuse.mlir"
+    "pipeline_igemm_tile_and_fuse_gfx950.mlir"
    "pipeline_lower_to_llvmgpu.mlir"
    "pipeline_scaled_truncation_gfx950.mlir"
    "pipeline_tile_and_fuse.mlir"

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_igemm_tile_and_fuse_gfx950.mlir

Lines changed: 205 additions & 0 deletions
@@ -0,0 +1,205 @@
+// RUN: iree-opt --split-input-file --iree-gpu-test-target=gfx950 \
+// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(builtin.module(func.func(iree-llvmgpu-lower-executable-target{for-rocdl=true})))))" %s | FileCheck %s
+
+#pipeline_layout = #hal.pipeline.layout<bindings = [
+  #hal.pipeline.binding<storage_buffer, ReadOnly>,
+  #hal.pipeline.binding<storage_buffer, ReadOnly>,
+  #hal.pipeline.binding<storage_buffer>
+]>
+#translation = #iree_codegen.translation_info<pipeline =
+  LLVMGPUTileAndFuse
+  workgroup_size = [256, 1, 1]
+  subgroup_size = 64,
+  {
+    gpu_pipeline_options = #iree_gpu.pipeline_options<
+      prefetch_num_stages = 0,
+      no_reduce_shared_memory_bank_conflicts = false,
+      use_igemm_convolution = true>
+  }>
+#config = #iree_gpu.lowering_config<{
+  workgroup = [1, 4, 16, 256, 0],
+  reduction = [0, 0, 0, 0, 2],
+  subgroup = [1, 4, 1, 4, 0],
+  mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F16>,
+  promote_operands = [0, 1]
+}>
+hal.executable private @conv_nhwc_f16 {
+  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
+    hal.executable.export public @conv_nhwc_f16 ordinal(0) layout(#pipeline_layout) count(%arg0: !hal.device) -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      hal.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @conv_nhwc_f16() attributes {translation_info = #translation} {
+        %cst = arith.constant 0.000000e+00 : f32
+        %c0 = arith.constant 0 : index
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2x34x34x1280xf16>>
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<3x3x1280x1280xf16>>
+        %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2x32x32x1280xf32>>
+        %3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 34, 34, 1280], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2x34x34x1280xf16>> -> tensor<2x34x34x1280xf16>
+        %4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 1280, 1280], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<3x3x1280x1280xf16>> -> tensor<3x3x1280x1280xf16>
+        %5 = tensor.empty() : tensor<2x32x32x1280xf32>
+        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32x32x1280xf32>) -> tensor<2x32x32x1280xf32>
+        %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>, lowering_config = #config} ins(%3, %4 : tensor<2x34x34x1280xf16>, tensor<3x3x1280x1280xf16>) outs(%6 : tensor<2x32x32x1280xf32>) -> tensor<2x32x32x1280xf32>
+        iree_tensor_ext.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 32, 1280], strides = [1, 1, 1, 1] : tensor<2x32x32x1280xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2x32x32x1280xf32>>
+        return
+      }
+    }
+  }
+}
+
+// CHECK-LABEL: func @conv_nhwc_f16
+// CHECK: scf.forall
+// CHECK: scf.for {{.*}} iter_args
+// CHECK-DAG: vector.transfer_read {{.*}}memref<2x34x34x1280xf16, #amdgpu.address_space<fat_raw_buffer>>{{.*}}vector<8xf16>
+// CHECK-DAG: vector.transfer_write {{.*}}memref<1x4x16x{{.*}}xf16, {{.*}}#gpu.address_space<workgroup>>
+// CHECK-DAG: vector.transfer_read {{.*}}memref<11520x1280xf16, #amdgpu.address_space<fat_raw_buffer>>{{.*}}vector<8xf16>
+// CHECK-DAG: vector.transfer_write {{.*}}memref<64x{{.*}}xf16, {{.*}}#gpu.address_space<workgroup>>
+// CHECK: gpu.barrier
+// CHECK: vector.transfer_read {{.*}}#gpu.address_space<workgroup>
+// CHECK: amdgpu.transpose_load {{.*}}#gpu.address_space<workgroup>{{.*}}vector<4xf16>
+// CHECK: amdgpu.mfma 16x16x32 {{.*}} vector<8xf16>, vector<8xf16>, vector<4xf32>
+// CHECK: scf.yield
+
+// -----
+
+#pipeline_layout_unaligned = #hal.pipeline.layout<bindings = [
+  #hal.pipeline.binding<storage_buffer, ReadOnly>,
+  #hal.pipeline.binding<storage_buffer, ReadOnly>,
+  #hal.pipeline.binding<storage_buffer>
+]>
+#translation_unaligned = #iree_codegen.translation_info<pipeline =
+  LLVMGPUTileAndFuse
+  workgroup_size = [256, 1, 1]
+  subgroup_size = 64,
+  {
+    gpu_pipeline_options = #iree_gpu.pipeline_options<
+      prefetch_num_stages = 0,
+      no_reduce_shared_memory_bank_conflicts = false,
+      use_igemm_convolution = true>
+  }>
+#config_unaligned = #iree_gpu.lowering_config<{
+  padding = [2, 1, 32, 16, 32],
+  workgroup = [2, 1, 32, 16, 0],
+  reduction = [0, 0, 0, 0, 1],
+  subgroup = [1, 1, 1, 1, 0],
+  mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_F16>,
+  promote_operands = [0, 1]
+}>
+hal.executable private @conv_nhwc_unaligned_f16 {
+  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
+    hal.executable.export public @conv_nhwc_unaligned_f16 ordinal(0) layout(#pipeline_layout_unaligned) count(%arg0: !hal.device) -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      hal.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @conv_nhwc_unaligned_f16() attributes {translation_info = #translation_unaligned} {
+        %cst = arith.constant 0.000000e+00 : f32
+        %c0 = arith.constant 0 : index
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout_unaligned) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>>
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout_unaligned) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<3x3x1281x1281xf16>>
+        %2 = hal.interface.binding.subspan layout(#pipeline_layout_unaligned) binding(2) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
+        %3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 35, 35, 1281], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2x35x35x1281xf16>> -> tensor<2x35x35x1281xf16>
+        %4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 1281, 1281], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<3x3x1281x1281xf16>> -> tensor<3x3x1281x1281xf16>
+        %5 = tensor.empty() : tensor<2x17x17x1281xf32>
+        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x17x17x1281xf32>) -> tensor<2x17x17x1281xf32>
+        %7 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>, lowering_config = #config_unaligned} ins(%3, %4 : tensor<2x35x35x1281xf16>, tensor<3x3x1281x1281xf16>) outs(%6 : tensor<2x17x17x1281xf32>) -> tensor<2x17x17x1281xf32>
+        iree_tensor_ext.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [2, 17, 17, 1281], strides = [1, 1, 1, 1] : tensor<2x17x17x1281xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2x17x17x1281xf32>>
+        return
+      }
+    }
+  }
+}
+
+// CHECK-LABEL: func @conv_nhwc_unaligned_f16
+// CHECK: scf.forall
+// CHECK: scf.for {{.*}} iter_args
+// CHECK-DAG: vector.transfer_read {{.*}}memref<2x35x35x1281xf16, #amdgpu.address_space<fat_raw_buffer>>
+// CHECK-DAG: vector.transfer_write {{.*}}memref<2x1x32x{{.*}}xf16, {{.*}}#gpu.address_space<workgroup>>
+// CHECK-DAG: vector.transfer_read {{.*}}memref<11529x1281xf16, #amdgpu.address_space<fat_raw_buffer>>
+// CHECK-DAG: vector.transfer_write {{.*}}memref<32x{{.*}}xf16, {{.*}}#gpu.address_space<workgroup>>
+// CHECK: gpu.barrier
+// CHECK: vector.transfer_read {{.*}}#gpu.address_space<workgroup>
+// CHECK: amdgpu.transpose_load {{.*}}#gpu.address_space<workgroup>{{.*}}vector<4xf16>
+// CHECK: amdgpu.mfma 16x16x32 {{.*}} vector<8xf16>, vector<8xf16>, vector<4xf32>
+// CHECK: scf.yield
+
+// -----
+
+#pipeline_layout_backward = #hal.pipeline.layout<bindings = [
+  #hal.pipeline.binding<storage_buffer, "ReadOnly">,
+  #hal.pipeline.binding<storage_buffer, "ReadOnly">,
+  #hal.pipeline.binding<storage_buffer>
+]>
+#translation_backward = #iree_codegen.translation_info<pipeline =
+  LLVMGPUTileAndFuse
+  workgroup_size = [256, 1, 1]
+  subgroup_size = 64,
+  {
+    gpu_pipeline_options = #iree_gpu.pipeline_options<
+      prefetch_num_stages = 0,
+      no_reduce_shared_memory_bank_conflicts = false,
+      use_igemm_convolution = true>
+  }>
+#config_backward = #iree_gpu.lowering_config<{
+  padding = [2, 32, 64, 64],
+  workgroup = [2, 32, 64, 0],
+  reduction = [0, 0, 0, 2],
+  subgroup = [2, 2, 1, 0],
+  mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x32_BF16>,
+  promote_operands = [0, 1]
+}>
+#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)>
+#map1 = affine_map<(d0, d1, d2, d3) -> (d3, d2)>
+#map2 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>
+#map3 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
+hal.executable private @conv_input_backward_bf16 {
+  hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb">) {
+    hal.executable.export public @conv_input_backward_bf16 ordinal(0) layout(#pipeline_layout_backward) count(%arg0: !hal.device) -> (index, index, index) {
+      %x, %y, %z = iree_tensor_ext.dispatch.workgroup_count_from_slice()
+      hal.return %x, %y, %z : index, index, index
+    }
+    builtin.module {
+      func.func @conv_input_backward_bf16() attributes {translation_info = #translation_backward} {
+        %cst = arith.constant 0.000000e+00 : f32
+        %c0 = arith.constant 0 : index
+        %0 = hal.interface.binding.subspan layout(#pipeline_layout_backward) binding(0) alignment(64) offset(%c0) flags("ReadOnly") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<16x21x384xbf16>>
+        %1 = hal.interface.binding.subspan layout(#pipeline_layout_backward) binding(1) alignment(64) offset(%c0) flags("ReadOnly") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<384x192xbf16>>
+        %2 = hal.interface.binding.subspan layout(#pipeline_layout_backward) binding(2) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<16x21x192xbf16>>
+        %3 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [16, 21, 384], strides = [1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<16x21x384xbf16>> -> tensor<16x21x384xbf16>
+        %4 = iree_tensor_ext.dispatch.tensor.load %1, offsets = [0, 0], sizes = [384, 192], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<384x192xbf16>> -> tensor<384x192xbf16>
+        %5 = tensor.empty() : tensor<16x21x192xf32>
+        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<16x21x192xf32>) -> tensor<16x21x192xf32>
+        %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%3, %4 : tensor<16x21x384xbf16>, tensor<384x192xbf16>) outs(%6 : tensor<16x21x192xf32>) attrs = {lowering_config = #config_backward} {
+        ^bb0(%in: bf16, %in_0: bf16, %out: f32):
+          %10 = arith.extf %in : bf16 to f32
+          %11 = arith.extf %in_0 : bf16 to f32
+          %12 = arith.mulf %10, %11 : f32
+          %13 = arith.addf %out, %12 : f32
+          linalg.yield %13 : f32
+        } -> tensor<16x21x192xf32>
+        %8 = tensor.empty() : tensor<16x21x192xbf16>
+        %9 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel", "parallel"]} ins(%7 : tensor<16x21x192xf32>) outs(%8 : tensor<16x21x192xbf16>) {
+        ^bb0(%in: f32, %out: bf16):
+          %10 = arith.truncf %in : f32 to bf16
+          linalg.yield %10 : bf16
+        } -> tensor<16x21x192xbf16>
+        iree_tensor_ext.dispatch.tensor.store %9, %2, offsets = [0, 0, 0], sizes = [16, 21, 192], strides = [1, 1, 1] : tensor<16x21x192xbf16> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<16x21x192xbf16>>
+        return
+      }
+    }
+  }
+}
+
+// CHECK-LABEL: func @conv_input_backward_bf16
+// CHECK: scf.forall
+// CHECK: scf.for {{.*}} iter_args
+// CHECK-DAG: vector.transfer_read {{.*}}memref<16x21x384xbf16, #amdgpu.address_space<fat_raw_buffer>>{{.*}}vector<8xbf16>
+// CHECK-DAG: vector.transfer_write {{.*}}memref<2x32x{{.*}}xbf16, {{.*}}#gpu.address_space<workgroup>>
+// CHECK-DAG: vector.transfer_read {{.*}}memref<384x192xbf16, #amdgpu.address_space<fat_raw_buffer>>{{.*}}vector<8xbf16>
+// CHECK-DAG: vector.transfer_write {{.*}}memref<64x{{.*}}xbf16, {{.*}}#gpu.address_space<workgroup>>
+// CHECK: gpu.barrier
+// CHECK: vector.transfer_read {{.*}}#gpu.address_space<workgroup>
+// CHECK: amdgpu.transpose_load {{.*}}#gpu.address_space<workgroup>{{.*}}vector<4xbf16>
+// CHECK: amdgpu.mfma 16x16x32 {{.*}} vector<8xbf16>, vector<8xbf16>, vector<4xf32>
+// CHECK: scf.yield
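
For reference, the amdgpu.transpose_load lines in the CHECKs above are the key output of the new pass: MFMA operands are fetched from LDS with a load that performs the transpose itself rather than with a plain vector.transfer_read. A minimal sketch of the op's shape, with hypothetical SSA names and an illustrative LDS memref, assuming the upstream amdgpu dialect assembly format:

  // Hypothetical lowered load of one 4-element MFMA operand fragment from LDS;
  // the indices and the 64x36 shape are illustrative only.
  %frag = amdgpu.transpose_load %lds[%i, %j]
            : memref<64x36xf16, #gpu.address_space<workgroup>> -> vector<4xf16>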
