Commit 2e40437

[Codegen] Test Cleanup 6/8: LLVMGPU tests (#22749)
Result of a scan over all tests in Codegen to clean up common issues. A summary of the results and a preamble approximating the issues to look for can be found here: https://gist.github.com/qedawkins/40f9e604fd83745bf1ac20fd63a7a61f
1 parent: 222940b

43 files changed: +176, -275 lines
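
The diffs below follow a few recurring cleanup patterns: FileCheck captures that are never re-referenced (e.g. %[[ARG1:.+]]) are replaced with anonymous %{{.+}} matchers, arith.constant values that the test functions never use are deleted, and directive spellings are corrected. A minimal sketch of the capture cleanup, using a hypothetical test not taken from this commit:

// Before: %[[B:.+]] is captured but never used again, so the name only adds noise.
// CHECK-SAME: (%[[A:.+]]: memref<8xf16>, %[[B:.+]]: index)
// After: keep a capture only for the value a later CHECK line re-reads.
// CHECK-SAME: (%[[A:.+]]: memref<8xf16>, %{{.+}}: index)
// CHECK: vector.transfer_read %[[A]]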

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/buffer_instructions_optimization.mlir

Lines changed: 5 additions & 5 deletions
@@ -134,7 +134,7 @@ func.func @no_simplify_mask_no_fat_raw_buffer(%1 : memref<1x?x?x8xbf16>, %index1
 }

 // CHECK-LABEL: @no_simplify_mask_no_fat_raw_buffer
-// CHECK-SAME: (%[[ARG0:.+]]: memref<1x?x?x8xbf16>, %[[ARG1:.+]]: index, %[[ARG2:.+]]: index)
+// CHECK-SAME: (%[[ARG0:.+]]: memref<1x?x?x8xbf16>, %{{.+}}: index, %{{.+}}: index)
 // CHECK-DAG: %[[MASK:.+]] = vector.create_mask
 // CHECK: %[[READ:.+]] = vector.transfer_read %[[ARG0]]
 // CHECK-SAME: %[[MASK]]
@@ -153,7 +153,7 @@ func.func @no_simplify_mask_tensor(%1 : tensor<1x?x?x8xbf16>, %index1 : index, %
 }

 // CHECK-LABEL: @no_simplify_mask_tensor
-// CHECK-SAME: (%[[ARG0:.+]]: tensor<1x?x?x8xbf16>, %[[ARG1:.+]]: index, %[[ARG2:.+]]: index)
+// CHECK-SAME: (%[[ARG0:.+]]: tensor<1x?x?x8xbf16>, %{{.+}}: index, %{{.+}}: index)
 // CHECK-DAG: %[[MASK:.+]] = vector.create_mask
 // CHECK: %[[READ:.+]] = vector.transfer_read %[[ARG0]]
 // CHECK-SAME: %[[MASK]]
@@ -172,7 +172,7 @@ func.func @no_simplify_mask_outofbounds(%1 : memref<1x?x?x6xbf16, #amdgpu.addres
 }

 // CHECK-LABEL: @no_simplify_mask_outofbounds
-// CHECK-SAME: (%[[ARG0:.+]]: memref<1x?x?x6xbf16, #amdgpu.address_space<fat_raw_buffer>>, %[[ARG1:.+]]: index, %[[ARG2:.+]]: index)
+// CHECK-SAME: (%[[ARG0:.+]]: memref<1x?x?x6xbf16, #amdgpu.address_space<fat_raw_buffer>>, %{{.+}}: index, %{{.+}}: index)
 // CHECK-DAG: %[[MASK:.+]] = vector.create_mask
 // CHECK: %[[READ:.+]] = vector.transfer_read %[[ARG0]]
 // CHECK-SAME: %[[MASK]]
@@ -191,7 +191,7 @@ func.func @no_simplify_partial_mask(%1 : memref<1x?x?x8xbf16, #amdgpu.address_sp
 }

 // CHECK-LABEL: @no_simplify_partial_mask
-// CHECK-SAME: (%[[ARG0:.+]]: memref<1x?x?x8xbf16, #amdgpu.address_space<fat_raw_buffer>>, %[[ARG1:.+]]: index, %[[ARG2:.+]]: index)
+// CHECK-SAME: (%[[ARG0:.+]]: memref<1x?x?x8xbf16, #amdgpu.address_space<fat_raw_buffer>>, %{{.+}}: index, %{{.+}}: index)
 // CHECK-DAG: %[[MASK:.+]] = vector.create_mask
 // CHECK: %[[READ:.+]] = vector.transfer_read %[[ARG0]]
 // CHECK-SAME: %[[MASK]]
@@ -210,7 +210,7 @@ func.func @no_simplify_mask_nonunit(%1 : memref<1x?x?x8xbf16, #amdgpu.address_sp
 }

 // CHECK-LABEL: @no_simplify_mask_nonunit
-// CHECK-SAME: (%[[ARG0:.+]]: memref<1x?x?x8xbf16, #amdgpu.address_space<fat_raw_buffer>>, %[[ARG1:.+]]: index, %[[ARG2:.+]]: index)
+// CHECK-SAME: (%[[ARG0:.+]]: memref<1x?x?x8xbf16, #amdgpu.address_space<fat_raw_buffer>>, %{{.+}}: index, %{{.+}}: index)
 // CHECK-DAG: %[[MASK:.+]] = vector.create_mask
 // CHECK: %[[READ:.+]] = vector.transfer_read %[[ARG0]]
 // CHECK-SAME: %[[MASK]]

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_tile_and_fuse.mlir

Lines changed: 4 additions & 14 deletions
@@ -11,7 +11,6 @@
 #map1 = affine_map<(d0, d1, d2, d3, d4) -> (d1, d3, d4)>
 #map2 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3)>
 func.func @expanded_matmul_transpose_b(%lhs: tensor<2x64x2048xf16>, %rhs: tensor<10x64x2048xf16>) -> tensor<2x10x64x64xf32> {
-%c0 = arith.constant 0 : index
 %cst = arith.constant 0.000000e+00 : f32
 %5 = tensor.empty() : tensor<2x10x64x64xf32>
 %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x10x64x64xf32>) -> tensor<2x10x64x64xf32>
@@ -49,7 +48,6 @@ func.func @expanded_matmul_transpose_b(%lhs: tensor<2x64x2048xf16>, %rhs: tensor
 #map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d3, d4, d5)>
 #map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d2, d3)>
 func.func @multi_dim_mma_schedule(%lhs: tensor<10x32x128x16xf16>, %rhs: tensor<4x32x128x16xf16>) -> tensor<10x4x32x32xf32> {
-%c0 = arith.constant 0 : index
 %cst = arith.constant 0.000000e+00 : f32
 %5 = tensor.empty() : tensor<10x4x32x32xf32>
 %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<10x4x32x32xf32>) -> tensor<10x4x32x32xf32>
@@ -119,7 +117,6 @@ func.func @dynamic_multi_dim_mma_schedule(%lhs: tensor<?x6x16x?x16xf16>, %rhs: t

 func.func @mfma_matmul_1024x1024x1024(%lhs: tensor<1024x1024xf16>, %rhs: tensor<1024x1024xf16>) -> tensor<1024x1024xf32> {
 %cst = arith.constant 0.000000e+00 : f32
-%c0 = arith.constant 0 : index
 %5 = tensor.empty() : tensor<1024x1024xf32>
 %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
 %7 = linalg.matmul ins(%lhs, %rhs : tensor<1024x1024xf16>, tensor<1024x1024xf16>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
@@ -212,7 +209,6 @@ func.func @mfma_matmul_m_aligned_intrinsic(%lhs: tensor<176x1024xi8>, %rhs: tens

 module {
 func.func @conv_nhwc(%3: tensor<2x258x514x768xf16>, %4: tensor<3x3x768x256xf16>) -> tensor<2x256x512x256xf32> {
-%c0 = arith.constant 0 : index
 %cst = arith.constant 0.000000e+00 : f32
 %5 = tensor.empty() : tensor<2x256x512x256xf32>
 %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x256x512x256xf32>) -> tensor<2x256x512x256xf32>
@@ -249,7 +245,6 @@ func.func @matmul_dynamic_M(%arg0: tensor<?x256xf32>, %arg1: tensor<256x256xf32>
 module {
 func.func @elementwise_dynamic_dim(%11: tensor<?x256xf16>, %12: tensor<?x256xf16>) -> tensor<?x256xf16> {
 %c0 = arith.constant 0 : index
-%cst = arith.constant 0.000000e+00 : f32
 %8 = tensor.dim %11, %c0 : tensor<?x256xf16>
 %13 = tensor.empty(%8) : tensor<?x256xf16>
 %15 = linalg.add ins(%11, %12 : tensor<?x256xf16>, tensor<?x256xf16>) outs(%13 : tensor<?x256xf16>) -> tensor<?x256xf16>
@@ -266,7 +261,6 @@ module {
 // -----

 func.func @elementwise_unaligned(%11: tensor<180x180xf16>, %12: tensor<180x180xf16>) -> tensor<180x180xf16> {
-%cst = arith.constant 0.000000e+00 : f32
 %13 = tensor.empty() : tensor<180x180xf16>
 %15 = linalg.add ins(%11, %12 : tensor<180x180xf16>, tensor<180x180xf16>) outs(%13 : tensor<180x180xf16>) -> tensor<180x180xf16>
 return %15 : tensor<180x180xf16>
@@ -278,7 +272,6 @@ func.func @elementwise_unaligned(%11: tensor<180x180xf16>, %12: tensor<180x180xf
 // -----

 func.func @elementwise_large_rank(%11: tensor<3x5x7x11x13x17x19x23xf16>, %12: tensor<3x5x7x11x13x17x19x23xf16>) -> tensor<3x5x7x11x13x17x19x23xf16> {
-%cst = arith.constant 0.000000e+00 : f32
 %13 = tensor.empty() : tensor<3x5x7x11x13x17x19x23xf16>
 %15 = linalg.add ins(%11, %12 : tensor<3x5x7x11x13x17x19x23xf16>, tensor<3x5x7x11x13x17x19x23xf16>) outs(%13 : tensor<3x5x7x11x13x17x19x23xf16>) -> tensor<3x5x7x11x13x17x19x23xf16>
 return %15 : tensor<3x5x7x11x13x17x19x23xf16>
@@ -293,9 +286,6 @@ func.func @elementwise_large_rank(%11: tensor<3x5x7x11x13x17x19x23xf16>, %12: te

 func.func @multi_mma_data_tiled_unrolled_MFMA_F32_16x16x4_F32(
 %3: tensor<1x8x8x4x16x4xf32>, %4: tensor<1x8x4x2x4x16x4xf32>, %5: tensor<1x1x4x8x2x4x16x4xf32>) -> tensor<1x1x4x8x2x4x16x4xf32> {
-%c0 = arith.constant 0 : index
-%c65536 = arith.constant 65536 : index
-%c131072 = arith.constant 131072 : index
 %6 = iree_codegen.inner_tiled ins(%3, %4) outs(%5) {
 indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>,
 affine_map<(d0, d1, d2) -> (d1, d2)>,
@@ -323,9 +313,9 @@ func.func @multi_mma_data_tiled_unrolled_MFMA_F32_16x16x4_F32(
 // -----

 func.func @unaligned_to_intrinsic_batched_matmul(%lhs : tensor<12x8x577xf32>, %rhs : tensor<12x577x577xf32>) -> tensor<12x8x577xf32> {
-%c0 = arith.constant 0.0 : f32
+%cst = arith.constant 0.0 : f32
 %empty = tensor.empty() : tensor<12x8x577xf32>
-%fill = linalg.fill ins(%c0 : f32) outs(%empty : tensor<12x8x577xf32>) -> tensor<12x8x577xf32>
+%fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<12x8x577xf32>) -> tensor<12x8x577xf32>
 %mm = linalg.batch_matmul ins(%lhs, %rhs : tensor<12x8x577xf32>, tensor<12x577x577xf32>) outs(%fill : tensor<12x8x577xf32>) -> tensor<12x8x577xf32>
 return %mm : tensor<12x8x577xf32>
 }
@@ -413,9 +403,9 @@ func.func @unaligned_dynamic_matmul_with_two_reduce_dim(%arg0: tensor<196x?x4xf3
 // -----

 func.func @unaligned_to_intrinsic_batched_matmul_tiling_check(%lhs : tensor<12x577x577xf32>, %rhs : tensor<12x577x1024xf32>) -> tensor<12x577x1024xf32> {
-%c0 = arith.constant 0.0 : f32
+%cst = arith.constant 0.0 : f32
 %empty = tensor.empty() : tensor<12x577x1024xf32>
-%fill = linalg.fill ins(%c0 : f32) outs(%empty : tensor<12x577x1024xf32>) -> tensor<12x577x1024xf32>
+%fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<12x577x1024xf32>) -> tensor<12x577x1024xf32>
 %mm = linalg.batch_matmul ins(%lhs, %rhs : tensor<12x577x577xf32>, tensor<12x577x1024xf32>) outs(%fill : tensor<12x577x1024xf32>) -> tensor<12x577x1024xf32>
 return %mm : tensor<12x577x1024xf32>
 }

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_gfx1100.mlir

Lines changed: 1 addition & 1 deletion
@@ -34,5 +34,5 @@ func.func @wmma_matmul_1024x1024x1024() {
 // WMMA: linalg.matmul {{.*}}lowering_config = #iree_gpu.lowering_config
 // WMMA-SAME: mma_kind = #iree_gpu.mma_layout<WMMAR3_F32_16x16x16_F16>
 // WMMA-SAME: reduction = [0, 0, 64]
-// WMMA-LITERAL-SAME: subgroup_basis = [[2, 2, 1], [0, 1, 2]]
+// WMMA-SAME{LITERAL}: subgroup_basis = [[2, 2, 1], [0, 1, 2]]
 // WMMA-SAME: workgroup = [64, 128, 0]
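
A note on the directive fix above: in FileCheck, LITERAL is a modifier on the check suffix, so it is written as <prefix>-SAME{LITERAL}:. The modifier disables [[...]] and {{...}} pattern syntax, which is what lets the nested brackets in subgroup_basis match literally; a spelling like WMMA-LITERAL-SAME: is not a recognized directive form and would typically be skipped rather than checked. Minimal sketch with a hypothetical CHECK prefix:

// CHECK{LITERAL}: subgroup_basis = [[2, 2, 1], [0, 1, 2]]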

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_reduction_gfx942.mlir

Lines changed: 0 additions & 4 deletions
@@ -61,10 +61,6 @@ func.func @attention_20x1x64x4096x64() {
 func.func @reduction_with_no_consumer() {
 %c0 = arith.constant 0 : index
 %cst = arith.constant 0.000000e+00 : f32
-%cst_0 = arith.constant 4.096000e+04 : f32
-%cst_1 = arith.constant 9.99999974E-6 : f32
-%c69524992 = arith.constant 69524992 : index
-%c74767872 = arith.constant 74767872 : index
 %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2x32x10x4096xf16>>
 %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<2x32xf32>>
 %2 = iree_tensor_ext.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [2, 32, 10, 4096], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<2x32x10x4096xf16>> -> tensor<2x32x10x4096xf16>

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/config_vector_distribute_reduction_gfx950.mlir

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@ func.func @skinny_scaled_matmul() {
 %13 = linalg.generic {
 indexing_maps = [#lhs_map, #rhs_map, #scale_m, #scale_n, #out_map],
 iterator_types = ["parallel", "parallel", "reduction", "reduction"]
-} ins(%6, %8, %9, %10 : tensor<4x512x32xf4E2M1FN>, tensor<1024x512x32xf4E2M1FN>, tensor<4x512xf8E8M0FNU>, tensor<1024x512xf8E8M0FNU>) outs(%11 : tensor<4x1024xf32>) {
+} ins(%6, %8, %9, %10 : tensor<4x512x32xf4E2M1FN>, tensor<1024x512x32xf4E2M1FN>, tensor<4x512xf8E8M0FNU>, tensor<1024x512xf8E8M0FNU>) outs(%12 : tensor<4x1024xf32>) {
 ^bb0(%a: f4E2M1FN, %b: f4E2M1FN, %a_scale: f8E8M0FNU, %b_scale: f8E8M0FNU, %out: f32):
 %14 = arith.scaling_extf %a, %a_scale : f4E2M1FN, f8E8M0FNU to f32
 %15 = arith.scaling_extf %b, %b_scale : f4E2M1FN, f8E8M0FNU to f32

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_direct_conv_tile_and_fuse.mlir

Lines changed: 7 additions & 7 deletions
@@ -66,17 +66,17 @@ hal.executable private @main {
 // CHECK-DAG: %[[C4:.+]] = arith.constant 4 : index
 // CHECK-DAG: %[[C36:.+]] = arith.constant 36 : index
 // CHECK: scf.forall ({{.*}}) in (16, 48, 9) {
-// CHECK: %[[LOOP1:.+]] = scf.for %[[IV1:.+]] = %[[C0]] to %[[C3]] step %[[C1]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
-// CHECK: %[[LOOP2:.+]] = scf.for %[[IV2:.+]] = %[[C0]] to %[[C3]] step %[[C1]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
-// CHECK: %[[LOOP3:.+]] = scf.for %[[IV3:.+]] = %[[C0]] to %[[C36]] step %[[C4]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
+// CHECK: scf.for {{.+}} = %[[C0]] to %[[C3]] step %[[C1]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
+// CHECK: scf.for {{.+}} = %[[C0]] to %[[C3]] step %[[C1]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
+// CHECK: scf.for {{.+}} = %[[C0]] to %[[C36]] step %[[C4]] {{.*}} -> (vector<1x1x1x1x4x1xf32>)
 // CHECK: gpu.barrier
-// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read %[[BUF0]]{{.*}} vector<4xf16>
+// CHECK-DAG: %[[LHS_RD:.+]] = vector.transfer_read {{.+}} : {{.*}}vector<4xf16>
 // CHECK-DAG: vector.transfer_write %[[LHS_RD]]
-// CHECK-DAG: %[[RHS_RD:.+]] = vector.transfer_read %[[BUF1]]{{.*}} vector<8xf16>
+// CHECK-DAG: %[[RHS_RD:.+]] = vector.transfer_read {{.+}} : {{.*}}vector<8xf16>
 // CHECK-DAG: vector.transfer_write %[[RHS_RD]]
 // CHECK: gpu.barrier
-// CHECK-DAG: %[[LHS_MM:.+]] = vector.transfer_read {{.*}} vector<4x4xf16>
-// CHECK-DAG: %[[RHS_MM:.+]] = vector.transfer_read {{.*}} vector<4x4xf16>
+// CHECK-DAG: vector.transfer_read {{.*}} vector<4x4xf16>
+// CHECK-DAG: vector.transfer_read {{.*}} vector<4x4xf16>
 // CHECK-COUNT-4: amdgpu.mfma 16x16x16
 // CHECK: vector.transfer_write %{{.*}}, %[[BUF2]]
 // CHECK: } {mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_elementwise_f8fnuz.mlir

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ hal.executable @ext_fp8_dispatch {

 // ERRORS: F8E5M2FNUZ and F8E4M3FNUZ types are not supported on non-gfx942 (MI-300) chipsets; try F8E5M2 or F8E4M3FN instead.

-// CDNA3-LABEL: hal.executable public @ext_fp8_dispatch
+// CDNA3-LABEL: hal.executable public @ext_fp8_dispatch {
 // CDNA3: hal.executable.variant public @rocm
 // CDNA3-COUNT-8: rocdl.cvt.pk.f32.fp8 %{{.*}} : vector<2xf32>
 // CDNA3-COUNT-8: rocdl.cvt.pk.f32.bf8 %{{.*}} : vector<2xf32>

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_elementwise_f8ocp.mlir

Lines changed: 1 addition & 1 deletion
@@ -44,7 +44,7 @@ hal.executable @ext_fp8_dispatch {

 // ERRORS: F8E5M2 and F8E4M3FN types are not supported on gfx942 (MI-300) or older chipsets; try F8E5M2FNUZ or F8E4M3FNUZ instead.

-// OCP-LABEL: hal.executable public @ext_fp8_dispatch
+// OCP-LABEL: hal.executable public @ext_fp8_dispatch {
 // OCP: hal.executable.variant public @rocm
 // OCP-COUNT-8: rocdl.cvt.pk.f32.fp8 %{{.*}} : vector<2xf32>
 // OCP-COUNT-8: rocdl.cvt.pk.f32.bf8 %{{.*}} : vector<2xf32>
