
Commit ea599e2

Add SDXL regression test on mi308 (iree-org#19747)

- Adds mi308 runner for regression testing and specifies SKU for tuned specs
- Adds new unet IR for SDXL with 960x1024 image shape
- Adds new tuning spec for the new unet IR on the mi308 SKU

Signed-off-by: Ian <[email protected]>
Signed-off-by: ianNod <[email protected]>
1 parent ec6e00b commit ea599e2

File tree: 6 files changed, +467 -1 lines

.github/workflows/pkgci_regression_test.yml

Lines changed: 37 additions & 0 deletions
@@ -42,11 +42,18 @@ jobs:
           - name: amdgpu_rocm_mi250_gfx90a
             rocm-chip: gfx90a
             backend: rocm
+            sku: mi250
             runs-on: nodai-amdgpu-mi250-x86-64
           - name: amdgpu_rocm_mi300_gfx942
             rocm-chip: gfx942
             backend: rocm
+            sku: mi300
             runs-on: nodai-amdgpu-mi300-x86-64
+          - name: amdgpu_rocm_mi308_gfx942
+            rocm-chip: gfx942
+            backend: rocm
+            sku: mi308
+            runs-on: nodai-amdgpu-mi308-x86-64
     env:
       PACKAGE_DOWNLOAD_DIR: ${{ github.workspace }}/.packages
       IREE_TEST_PATH_EXTENSION: ${{ github.workspace }}/build_tools/pkgci/external_test_suite
@@ -95,6 +102,7 @@ jobs:
           --durations=0
         env:
           ROCM_CHIP: ${{ matrix.rocm-chip }}
+          SKU: ${{ matrix.sku }}

       - name: "Running SD3 special model tests"
         if: "!cancelled()"
@@ -165,3 +173,32 @@ jobs:
             --timeout=600 \
             --retries 7
           echo "$(<job_summary.md )" >> $GITHUB_STEP_SUMMARY
+      # Note: allowing 10% deviation from observed averages here to account for
+      # different runner conditions.
+      - name: "Running SDXL rocm pipeline benchmark (mi308)"
+        if: contains(matrix.name, 'rocm_mi308_gfx942')
+        run: |
+          source ${VENV_DIR}/bin/activate
+          pytest ./experimental/benchmarks/sdxl/benchmark_sdxl_rocm.py \
+            --goldentime-tolerance-multiplier 1.1 \
+            --goldentime-rocm-e2e-ms 800.0 \
+            --goldentime-rocm-unet-ms 195.0 \
+            --goldentime-rocm-clip-ms 15.0 \
+            --goldentime-rocm-vae-ms 190.0 \
+            --goldendispatch-rocm-unet 1602 \
+            --goldendispatch-rocm-clip 1139 \
+            --goldendispatch-rocm-vae 246 \
+            --goldensize-rocm-unet-bytes 2270000 \
+            --goldensize-rocm-clip-bytes 860000 \
+            --goldensize-rocm-vae-bytes 840000 \
+            --goldentime-rocm-punet-int8-fp16-ms 140.0 \
+            --goldentime-rocm-punet-int8-fp8-ms 150 \
+            --goldendispatch-rocm-punet-int8-fp16 1424 \
+            --goldendispatch-rocm-punet-int8-fp8 1704 \
+            --goldensize-rocm-punet-int8-fp8-bytes 2800000 \
+            --goldensize-rocm-punet-int8-fp16-bytes 2560000 \
+            --rocm-chip gfx942 \
+            --log-cli-level=info \
+            --timeout=600 \
+            --retries 7
+          echo "$(<job_summary.md )" >> $GITHUB_STEP_SUMMARY
New file (MLIR attention/matmul tuning spec for the mi308 SKU)
Lines changed: 255 additions & 0 deletions
@@ -0,0 +1,255 @@
module attributes { transform.with_named_sequence } {
//===----------------------------------------------------------------------===//
// Tuning infra
//===----------------------------------------------------------------------===//

  transform.named_sequence @apply_op_config(%op: !transform.any_op {transform.readonly},
                                            %config: !transform.any_param {transform.readonly}) {
    transform.annotate %op "compilation_info" = %config : !transform.any_op, !transform.any_param
    // transform.print %op {name = "Applied"} : !transform.any_op
    transform.yield
  }

  transform.named_sequence @apply_attn_op_config(%attention: !transform.any_op {transform.readonly},
                                                 %config: !transform.any_param {transform.readonly},
                                                 %decomposition_config: !transform.any_param {transform.readonly}) {
    transform.annotate %attention "compilation_info" = %config : !transform.any_op, !transform.any_param
    transform.annotate %attention "decomposition_config" = %decomposition_config : !transform.any_op, !transform.any_param
    // transform.print %attention {name = "Applied attention config"} : !transform.any_op
    transform.yield
  }

  transform.named_sequence @match_attention_f16(%attention: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param, !transform.any_param) {
    transform.match.operation_name %attention ["iree_linalg_ext.attention"] : !transform.any_op
    %in0 = transform.get_operand %attention[0] : (!transform.any_op) -> !transform.any_value
    transform.iree.match.cast_compatible_type %in0 = tensor<?x?x?x?xf16> : !transform.any_value

    %config = transform.param.constant #iree_codegen.compilation_info<
            lowering_config = #iree_gpu.lowering_config<{workgroup = [1, 1, 64, 0, 0, 0], reduction=[0, 0, 0, 0, 0, 64], promote_operands = [1, 2]}>,
            translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
            workgroup_size = [64, 4]
            subgroup_size = 64 ,
            {llvm_func_attrs = { "amdgpu-waves-per-eu" = "2", "denormal-fp-math-f32" = "preserve-sign" }}>>
        -> !transform.any_param

    %decomposition_config = transform.param.constant {
      qk_attrs = {attention_qk_matmul,
                  lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.virtual_mma_layout<intrinsic = VMFMA_F32_32x32x16_F16>,
                                                               subgroup_m_count = 4, subgroup_n_count = 1, promote_operands = [1] }>},
      pv_attrs = {attention_pv_matmul,
                  lowering_config = #iree_gpu.lowering_config<{mma_kind = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16>,
                                                               subgroup_m_count = 4, subgroup_n_count = 1, promote_operands = [1] }>}
    } -> !transform.any_param

    transform.yield %attention, %config, %decomposition_config : !transform.any_op, !transform.any_param, !transform.any_param
  }

  transform.named_sequence @match_mmt_f16_f16_f32(%root: !transform.any_op {transform.readonly}) -> (!transform.any_op) {
    transform.match.operation_name %root ["linalg.generic"] : !transform.any_op
    // transform.print %root {name = "Generic"} : !transform.any_op
    %ins, %outs = transform.iree.match.cast_compatible_dag_from_root %root {
      ^bb0(%lhs: tensor<?x?xf16>, %rhs: tensor<?x?xf16>, %out: tensor<?x?xf32>):
      %7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>,
                                            affine_map<(d0, d1, d2) -> (d1, d2)>,
                                            affine_map<(d0, d1, d2) -> (d0, d1)>],
                           iterator_types = ["parallel", "parallel", "reduction"]}
          ins(%lhs, %rhs : tensor<?x?xf16>, tensor<?x?xf16>) outs(%out : tensor<?x?xf32>) {
        ^bb0(%in: f16, %in_0: f16, %acc: f32):
          %18 = arith.extf %in : f16 to f32
          %19 = arith.extf %in_0 : f16 to f32
          %20 = arith.mulf %18, %19 : f32
          %21 = arith.addf %acc, %20 : f32
          linalg.yield %21 : f32
        } -> tensor<?x?xf32>
    } : (!transform.any_op) -> (!transform.any_value, !transform.any_value)
    transform.yield %root : !transform.any_op
  }

// TUNING_SPEC_BEGIN DO NOT REMOVE

//===----------------------------------------------------------------------===//
// Matmul tuning
//===----------------------------------------------------------------------===//

  transform.named_sequence @match_mmt_1920x10240x1280(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
    %mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op
    %lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value
    %rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value
    transform.iree.match.cast_compatible_type %lhs = tensor<1920x1280xf16> : !transform.any_value
    transform.iree.match.cast_compatible_type %rhs = tensor<10240x1280xf16> : !transform.any_value
    %config = transform.param.constant #iree_codegen.compilation_info<
      lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
                                                   mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
                                                   subgroup_m_count = 4, subgroup_n_count = 2,
                                                   reduction = [0, 0, 32],
                                                   workgroup = [128, 128, 0]}>,
      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
                                                        workgroup_size = [128, 4, 1] subgroup_size = 64,
        {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>,
         llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}
        }>> -> !transform.any_param
    transform.yield %matmul, %config : !transform.any_op, !transform.any_param
  }

  transform.named_sequence @match_mmt_1920x1280x1280(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
    %mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op
    %lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value
    %rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value
    transform.iree.match.cast_compatible_type %lhs = tensor<1920x1280xf16> : !transform.any_value
    transform.iree.match.cast_compatible_type %rhs = tensor<1280x1280xf16> : !transform.any_value
    %config = transform.param.constant #iree_codegen.compilation_info<
      lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
                                                   mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
                                                   subgroup_m_count = 4, subgroup_n_count = 2,
                                                   reduction = [0, 0, 32],
                                                   workgroup = [128, 128, 0]}>,
      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
                                                        workgroup_size = [128, 4, 1] subgroup_size = 64,
        {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>,
         llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}
        }>> -> !transform.any_param
    transform.yield %matmul, %config : !transform.any_op, !transform.any_param
  }

  transform.named_sequence @match_mmt_1920x1280x5120(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
    %mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op
    %lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value
    %rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value
    transform.iree.match.cast_compatible_type %lhs = tensor<1920x5120xf16> : !transform.any_value
    transform.iree.match.cast_compatible_type %rhs = tensor<1280x5120xf16> : !transform.any_value
    %config = transform.param.constant #iree_codegen.compilation_info<
      lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
                                                   mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
                                                   subgroup_m_count = 4, subgroup_n_count = 2,
                                                   reduction = [0, 0, 32],
                                                   workgroup = [128, 128, 0]}>,
      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
                                                        workgroup_size = [128, 4, 1] subgroup_size = 64,
        {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>,
         llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}
        }>> -> !transform.any_param
    transform.yield %matmul, %config : !transform.any_op, !transform.any_param
  }

  transform.named_sequence @match_mmt_7680x5120x640(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
    %mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op
    %lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value
    %rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value
    transform.iree.match.cast_compatible_type %lhs = tensor<7680x640xf16> : !transform.any_value
    transform.iree.match.cast_compatible_type %rhs = tensor<5120x640xf16> : !transform.any_value
    %config = transform.param.constant #iree_codegen.compilation_info<
      lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
                                                   mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
                                                   subgroup_m_count = 2, subgroup_n_count = 4,
                                                   reduction = [0, 0, 32],
                                                   workgroup = [128, 256, 0]}>,
      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
                                                        workgroup_size = [256, 2, 1] subgroup_size = 64,
        {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>,
         llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}
        }>> -> !transform.any_param
    transform.yield %matmul, %config : !transform.any_op, !transform.any_param
  }

  transform.named_sequence @match_mmt_128x1280x2048(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
    %mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op
    %lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value
    %rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value
    transform.iree.match.cast_compatible_type %lhs = tensor<1280x2048xf16> : !transform.any_value
    transform.iree.match.cast_compatible_type %rhs = tensor<1280x2048xf16> : !transform.any_value
    %config = transform.param.constant #iree_codegen.compilation_info<
      lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
                                                   mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>,
                                                   subgroup_m_count = 2, subgroup_n_count = 1,
                                                   reduction = [0, 0, 128],
                                                   workgroup = [64, 16, 0]}>,
      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
                                                        workgroup_size = [64, 2, 1] subgroup_size = 64,
        {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>,
         llvm_func_attrs = {"amdgpu-waves-per-eu" = "2"}
        }>> -> !transform.any_param
    transform.yield %matmul, %config : !transform.any_op, !transform.any_param
  }

  transform.named_sequence @match_mmt_7680x640x640(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
    %mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op
    %lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value
    %rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value
    transform.iree.match.cast_compatible_type %lhs = tensor<7680x640xf16> : !transform.any_value
    transform.iree.match.cast_compatible_type %rhs = tensor<640x640xf16> : !transform.any_value
    %config = transform.param.constant #iree_codegen.compilation_info<
      lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
                                                   mma_kind = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16>,
                                                   subgroup_m_count = 1, subgroup_n_count = 4,
                                                   reduction = [0, 0, 32],
                                                   workgroup = [256, 128, 0]}>,
      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
                                                        workgroup_size = [256, 1, 1] subgroup_size = 64,
        {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>,
         llvm_func_attrs = {"amdgpu-waves-per-eu" = "1"}
        }>> -> !transform.any_param
    transform.yield %matmul, %config : !transform.any_op, !transform.any_param
  }

  transform.named_sequence @match_mmt_7680x640x2560(%matmul: !transform.any_op {transform.readonly}) -> (!transform.any_op, !transform.any_param) {
    %mmt = transform.include @match_mmt_f16_f16_f32 failures(propagate) (%matmul) : (!transform.any_op) -> !transform.any_op
    %lhs = transform.get_operand %matmul[0] : (!transform.any_op) -> !transform.any_value
    %rhs = transform.get_operand %matmul[1] : (!transform.any_op) -> !transform.any_value
    transform.iree.match.cast_compatible_type %lhs = tensor<7680x2560xf16> : !transform.any_value
    transform.iree.match.cast_compatible_type %rhs = tensor<640x2560xf16> : !transform.any_value
    %config = transform.param.constant #iree_codegen.compilation_info<
      lowering_config = #iree_gpu.lowering_config<{promote_operands = [0, 1],
                                                   mma_kind = #iree_gpu.mma_layout<MFMA_F32_32x32x8_F16>,
                                                   subgroup_m_count = 4, subgroup_n_count = 2,
                                                   reduction = [0, 0, 32],
                                                   workgroup = [256, 128, 0]}>,
      translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute
                                                        workgroup_size = [128, 4, 1] subgroup_size = 64,
        {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = true>,
         llvm_func_attrs = {"amdgpu-waves-per-eu" = "4"}
        }>> -> !transform.any_param
    transform.yield %matmul, %config : !transform.any_op, !transform.any_param
  }

//===----------------------------------------------------------------------===//
// Convolution tuning
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// Batch matmul tuning
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// Broadcast rhs mmt tuning
//===----------------------------------------------------------------------===//

//===----------------------------------------------------------------------===//
// Contraction tuning
//===----------------------------------------------------------------------===//

// TUNING_SPEC_END DO NOT REMOVE

//===----------------------------------------------------------------------===//
// Entry point
//===----------------------------------------------------------------------===//

  transform.named_sequence @__kernel_config(%variant_op: !transform.any_op {transform.consumed}) {
    transform.foreach_match in %variant_op
        @match_attention_f16 -> @apply_attn_op_config

        // TUNING_MATCH_BEGIN DO NOT REMOVE

        // MMT.
        , @match_mmt_1920x10240x1280 -> @apply_op_config
        , @match_mmt_1920x1280x1280 -> @apply_op_config
        , @match_mmt_1920x1280x5120 -> @apply_op_config
        , @match_mmt_7680x5120x640 -> @apply_op_config
        , @match_mmt_128x1280x2048 -> @apply_op_config
        , @match_mmt_7680x640x640 -> @apply_op_config
        , @match_mmt_7680x640x2560 -> @apply_op_config

        // TUNING_MATCH_END DO NOT REMOVE
      : (!transform.any_op) -> (!transform.any_op)
    transform.yield
  }
} //// module
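
The @__kernel_config entry point above is what the compiler invokes when this file is supplied as a transform dialect library: each matcher that fires annotates the matched op with the compilation_info it carries. How the regression suite selects a per-SKU spec at compile time is not shown in this excerpt; the sketch below is only a guess at that glue, with a hypothetical file name and helper, and it assumes IREE's --iree-codegen-transform-dialect-library flag as the mechanism for loading such specs.

# Hypothetical glue, not taken from this commit: pick a tuning spec by SKU and
# turn it into an extra iree-compile flag. The file name and helper are illustrative.
import os

def tuning_spec_flags(sku: str) -> list[str]:
    # Only mi308 gets the new spec in this commit; other SKUs would compile untuned here.
    specs = {"mi308": "attention_and_matmul_spec_mi308.mlir"}  # assumed file name
    spec = specs.get(sku)
    if spec is None:
        return []
    # The workflow exports IREE_TEST_PATH_EXTENSION pointing at
    # build_tools/pkgci/external_test_suite, where such specs live.
    root = os.environ.get("IREE_TEST_PATH_EXTENSION", ".")
    return [f"--iree-codegen-transform-dialect-library={os.path.join(root, spec)}"]

# Example: extra flags for the mi308 runner, appended to an iree-compile invocation.
extra_flags = tuning_spec_flags(os.environ.get("SKU", ""))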
