Skip to content

Commit 41f14d3

Browse files
kuhar and claude authored
Cherry-pick ODS double-space fix and update tests (#23690)
Cherry-pick llvm/llvm-project#184253 into third_party/llvm-project and fix double spaces in tests for `gpu.thread_id`, `gpu.subgroup_reduce`, `gpu.shuffle`, `gpu.block_dim`, `nvvm.shfl.sync`, and `transform.print`.

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 3a4c991 commit 41f14d3

33 files changed

+168
-168
lines changed

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ func.func @add_tensor() attributes {translation_info = #translation} {
3535
// CHECK: #[[$MAP:.*]] = affine_map<(d0) -> (d0 * 4)>
3636
// CHECK-LABEL: func.func @add_tensor
3737
// CHECK: %[[C0:.*]] = arith.constant 0 : index
38-
// CHECK: %[[TX:.*]] = gpu.thread_id x
38+
// CHECK: %[[TX:.*]] = gpu.thread_id x
3939
// CHECK: %[[OFF:.*]] = affine.apply #[[$MAP]](%[[TX]])
4040
// CHECK: %[[S:.*]] = memref.subview %{{.*}}[0, %[[OFF]]] [1, 4] [1, 1] : memref<1x256xf32, #{{.*}}> to memref<1x4xf32, #{{.*}}>
4141
// CHECK: %[[A:.*]] = vector.transfer_read %{{.*}}[%[[C0]], %[[OFF]]], %{{.*}} {in_bounds = [true]} : memref<1x256xf32, #{{.*}}>, vector<4xf32>

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_distribute_shared_memory.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -110,8 +110,8 @@ module {
110110
// CHECK-DAG: %[[C56:.+]] = arith.constant 56 : index
111111
// CHECK-DAG: %[[C128:.+]] = arith.constant 128 : index
112112

113-
// CHECK-DAG: %[[TID_X:.+]] = gpu.thread_id x
114-
// CHECK-DAG: %[[TID_Y:.+]] = gpu.thread_id y
113+
// CHECK-DAG: %[[TID_X:.+]] = gpu.thread_id x
114+
// CHECK-DAG: %[[TID_Y:.+]] = gpu.thread_id y
115115

116116
// CHECK: scf.for %[[IV_Y:.+]] = %[[TID_Y]] to %[[C56]] step %[[C8]] {
117117
// CHECK: %[[OFFSET_X:.+]] = affine.apply #[[$OFFSET_MAP]]()[%[[TID_X]]]

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution.mlir

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ builtin.module attributes { transform.with_named_sequence } {
2929
}
3030
}
3131

32-
// CHECK: %[[IDX:.+]] = gpu.thread_id x
32+
// CHECK: %[[IDX:.+]] = gpu.thread_id x
3333
// CHECK: %[[YX:.+]]:3 = affine.delinearize_index %[[IDX]] into (4, 8)
3434
// CHECK: %[[Y_SCALED:.+]] = affine.linearize_index disjoint [%[[YX]]#1, %c0] by (4, 4)
3535
// CHECK: %[[RD00:.+]] = vector.transfer_read %arg0[%[[Y_SCALED]], %[[YX]]#2], {{.*}} : memref<32x32xf16>, vector<4x1xf16>
@@ -73,7 +73,7 @@ builtin.module attributes { transform.with_named_sequence } {
7373
// CHECK-LABEL: @distribute_transfer_read_row_major_with_nontrivial_index
7474
// CHECK-SAME: %[[I0:.+]]: index, %[[I1:.+]]: index
7575

76-
// CHECK: %[[IDX:.+]] = gpu.thread_id x
76+
// CHECK: %[[IDX:.+]] = gpu.thread_id x
7777
// CHECK: %[[X:.+]]:2 = affine.delinearize_index %[[IDX]] into (8) : index, index
7878
// CHECK: %[[OFF0:.+]] = affine.linearize_index [%[[X]]#1, %[[I0]]] by (8, 1)
7979
// CHECK: vector.transfer_read %{{.*}}[%c0, %c0, %[[OFF0]], %[[I1]]]
@@ -161,7 +161,7 @@ builtin.module attributes { transform.with_named_sequence } {
161161
// CHECK-LABEL: @distribute_transfer_read_row_major_transpose
162162
// CHECK-SAME: %[[I0:.+]]: index, %[[I1:.+]]: index
163163

164-
// CHECK: %[[IDX:.+]] = gpu.thread_id x
164+
// CHECK: %[[IDX:.+]] = gpu.thread_id x
165165
// CHECK: %[[X:.+]]:2 = affine.delinearize_index %[[IDX]] into (8) : index, index
166166
// CHECK: %[[LIN_ID0:.+]] = affine.linearize_index [%[[X]]#1, %[[I1]]] by (8, 1)
167167
// CHECK: vector.transfer_read %{{.*}}[%c0, %c0, %[[I0]], %[[LIN_ID0]]], {{.*}} permutation_map = #[[$PERM]]
@@ -278,7 +278,7 @@ builtin.module attributes { transform.with_named_sequence } {
278278
}
279279
}
280280

281-
// CHECK: %[[IDX:.+]] = gpu.thread_id x
281+
// CHECK: %[[IDX:.+]] = gpu.thread_id x
282282
// CHECK: %[[YX:.+]]:3 = affine.delinearize_index %[[IDX]] into (4, 16)
283283
// CHECK: %[[LANEY:.+]] = affine.linearize_index disjoint [%[[YX]]#1, %c0] by (4, 4)
284284
// CHECK: %[[RD:.+]] = vector.transfer_read %{{.*}}[%c0, %[[LANEY:.+]]], {{.*}} : memref<32x32xf16>, vector<4xf16>
@@ -314,7 +314,7 @@ builtin.module attributes { transform.with_named_sequence } {
314314
}
315315
}
316316

317-
// CHECK: %[[IDX:.+]] = gpu.thread_id x
317+
// CHECK: %[[IDX:.+]] = gpu.thread_id x
318318
// CHECK: %[[YX:.+]]:3 = affine.delinearize_index %[[IDX]] into (2, 64)
319319
// CHECK: %[[SUBGROUP:.+]]:2 = affine.delinearize_index %[[IDX]] into (16)
320320
// CHECK: %[[LANEY:.+]] = affine.linearize_index disjoint [%[[YX]]#1, %[[SUBGROUP]]#1, %c0] by (2, 16, 4)
@@ -387,7 +387,7 @@ builtin.module attributes { transform.with_named_sequence } {
387387
}
388388
}
389389

390-
// CHECK: %[[IDX:.+]] = gpu.thread_id x
390+
// CHECK: %[[IDX:.+]] = gpu.thread_id x
391391
// CHECK: %[[LANEX:.+]]:2 = affine.delinearize_index %[[IDX]] into (8)
392392
// CHECK: %[[SLICE:.+]] = vector.extract %{{.*}}[0, 0, 0, 0] : vector<1x8xf16> from vector<2x2x1x1x1x8xf16>
393393
// CHECK: vector.transfer_write %[[SLICE]], %{{.*}}[%[[LANEX]]#1, %c0] {in_bounds = [true, true]} : vector<1x8xf16>, memref<64x64xf16>
@@ -430,7 +430,7 @@ builtin.module attributes { transform.with_named_sequence } {
430430
}
431431
}
432432

433-
// CHECK: %[[IDX:.+]] = gpu.thread_id x
433+
// CHECK: %[[IDX:.+]] = gpu.thread_id x
434434
// CHECK: %[[YX:.+]]:3 = affine.delinearize_index %[[IDX]] into (4, 8)
435435
// CHECK: %[[LANEY:.+]] = affine.linearize_index disjoint [%[[YX]]#1, %c0] by (4, 4)
436436
// CHECK: vector.extract %{{.*}}[0, 0, 0, 0]
@@ -475,7 +475,7 @@ builtin.module attributes { transform.with_named_sequence } {
475475
// CHECK-LABEL: @distribute_transfer_write_row_major_with_nontrivial_index
476476
// CHECK-SAME: vector<16x16xf16>, %[[I0:.+]]: index, %[[I1:.+]]: index
477477

478-
// CHECK: %[[IDX:.+]] = gpu.thread_id x
478+
// CHECK: %[[IDX:.+]] = gpu.thread_id x
479479
// CHECK: %[[LANE:.+]]:2 = affine.delinearize_index %[[IDX]] into (8)
480480
// CHECK: %[[LIN_ID0:.+]] = affine.linearize_index [%[[LANE]]#1, %[[I1]]] by (8, 1)
481481
// CHECK: vector.extract %{{.*}}[0, 0, 0, 0]
@@ -585,7 +585,7 @@ func.func @mfma_64x128x8_read(%mem: memref<128x8xf16>,
585585
%c0 = arith.constant 0 : index
586586
%cst = arith.constant 0.0 : f16
587587

588-
// CHECK: %[[IDX:.+]] = gpu.thread_id x
588+
// CHECK: %[[IDX:.+]] = gpu.thread_id x
589589
// CHECK-DAG: %[[WG:.+]]:4 = affine.delinearize_index %[[IDX]] into (4, 2, 64)
590590
// CHECK-DAG: %[[LANE:.+]]:3 = affine.delinearize_index %[[IDX]] into (2, 32)
591591
// This doesn't canonicalize away currently, but could be equivalent to %WG
@@ -675,7 +675,7 @@ builtin.module attributes { transform.with_named_sequence } {
675675

676676
// CHECK-LABEL: @transposed_read_64x8
677677

678-
// CHECK: %[[IDX:.+]] = gpu.thread_id x
678+
// CHECK: %[[IDX:.+]] = gpu.thread_id x
679679
// CHECK-DAG: %[[WG:.+]]:4 = affine.delinearize_index %[[IDX]] into (2, 2, 64)
680680
// CHECK-DAG: %[[LANE:.+]]:3 = affine.delinearize_index %[[IDX]] into (2, 32)
681681
// CHECK-DAG: %[[M:.+]] = affine.linearize_index disjoint [%[[WG]]#1, %[[LANE]]#2] by (2, 32)
@@ -934,7 +934,7 @@ builtin.module attributes { transform.with_named_sequence } {
934934
}
935935

936936
// CHECK-LABEL: func @transpose_3d
937-
// CHECK-DAG: %[[IDX:.+]] = gpu.thread_id x
937+
// CHECK-DAG: %[[IDX:.+]] = gpu.thread_id x
938938
// CHECK-DAG: %[[WG:.+]]:3 = affine.delinearize_index %[[IDX]] into (2, 64)
939939
// CHECK-DAG: %[[LANE:.+]]:4 = affine.delinearize_index %[[IDX]] into (4, 8, 2)
940940
// CHECK-DAG: %[[DIM:.+]] = affine.linearize_index disjoint [%[[WG]]#1, %[[LANE]]#1, %c0] by (2, 4, 4)
@@ -1373,7 +1373,7 @@ builtin.module attributes { transform.with_named_sequence } {
13731373
}
13741374

13751375
// CHECK-LABEL: @distribute_map_store_row_major
1376-
// CHECK-DAG: %[[IDX:.+]] = gpu.thread_id x
1376+
// CHECK-DAG: %[[IDX:.+]] = gpu.thread_id x
13771377
// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
13781378
// CHECK-DAG: %[[LANEX:.+]]:2 = affine.delinearize_index %[[IDX]] into (8)
13791379
// CHECK-DAG: %[[SLICE0:.+]] = vector.extract %{{.*}}[0, 0, 0, 0]
@@ -1411,7 +1411,7 @@ builtin.module attributes { transform.with_named_sequence } {
14111411
// CHECK-LABEL: @undistributed_write
14121412
func.func @undistributed_write(%out: memref<f32, #amdgpu.address_space<fat_raw_buffer>>, %v: vector<f32>) {
14131413
// CHECK-DAG: %[[ZERO:.*]] = arith.constant 0 : index
1414-
// CHECK-DAG: %[[TID:.*]] = gpu.thread_id x
1414+
// CHECK-DAG: %[[TID:.*]] = gpu.thread_id x
14151415
// CHECK-DAG: %[[COND:.+]] = arith.cmpi eq, %[[TID]], %[[ZERO]] : index
14161416
// CHECK-NEXT: scf.if %[[COND]] {
14171417
// CHECK: vector.transfer_write
@@ -1446,7 +1446,7 @@ builtin.module attributes { transform.with_named_sequence } {
14461446
// across all threads (note the thread strides). This test checks if we account
14471447
// for such broadcasts when generating conditional writes.
14481448
// CHECK-LABEL: @partially_distributed_write
1449-
// CHECK-DAG: %[[TID:.+]] = gpu.thread_id x
1449+
// CHECK-DAG: %[[TID:.+]] = gpu.thread_id x
14501450
// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index
14511451
// CHECK: %[[DELIN:.*]]:5 = affine.delinearize_index %[[TID:.+]] into (4, 2, 4, 8)
14521452
// CHECK-DAG: %[[SUBGROUP_COND:.+]] = arith.cmpi eq, %[[DELIN]]#0, %[[C0]] : index

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution_multi_reduce.mlir

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -153,8 +153,8 @@ builtin.module attributes { transform.with_named_sequence } {
153153
// Local reduction
154154
// CHECK: vector.multi_reduction <maximumf>, %{{.*}}, %{{.*}} [1, 3, 5] : vector<2x1x1x1x1x4xf32> to vector<2x1x1xf32>
155155
// Thread reduction
156-
// CHECK: %[[THREAD_RED0:.+]] = gpu.subgroup_reduce maximumf %{{.*}} cluster(size = 4, stride = 16) : (f32) -> f32
157-
// CHECK: %[[THREAD_RED2:.+]] = gpu.subgroup_reduce maximumf %{{.*}} cluster(size = 4, stride = 16) : (f32) -> f32
156+
// CHECK: %[[THREAD_RED0:.+]] = gpu.subgroup_reduce maximumf %{{.*}} cluster(size = 4, stride = 16) : (f32) -> f32
157+
// CHECK: %[[THREAD_RED2:.+]] = gpu.subgroup_reduce maximumf %{{.*}} cluster(size = 4, stride = 16) : (f32) -> f32
158158
// Subgroup reduction
159159
// CHECK-DAG: %[[ALLOC:.+]] = memref.alloc() : memref<32x2xf32, #gpu.address_space<workgroup>>
160160
// CHECK: gpu.barrier memfence [#gpu.address_space<workgroup>]
@@ -172,9 +172,9 @@ builtin.module attributes { transform.with_named_sequence } {
172172
// CHECK-DAG: %[[SG_READ1:.+]] = vector.transfer_read %alloc[%[[BATCH1]], %[[BATCH0]]#1], %{{.*}} : memref<32x2xf32, #gpu.address_space<workgroup>>, vector<1x1xf32>
173173
// CHECK-DAG: %[[ACC:.+]] = iree_vector_ext.to_simt %{{.*}} : vector<32xf32> -> vector<2x1x1xf32>
174174
// CHECK-DAG: %[[DISTR0:.+]] = vector.extract %[[SG_READ0]][0, 0] : f32 from vector<1x1xf32>
175-
// CHECK-DAG: %[[RED0:.+]] = gpu.subgroup_reduce maximumf %[[DISTR0]] cluster(size = 2, stride = 16) : (f32) -> f32
175+
// CHECK-DAG: %[[RED0:.+]] = gpu.subgroup_reduce maximumf %[[DISTR0]] cluster(size = 2, stride = 16) : (f32) -> f32
176176
// CHECK-DAG: %[[DISTR1:.+]] = vector.extract %[[SG_READ1]][0, 0] : f32 from vector<1x1xf32>
177-
// CHECK-DAG: %[[RED1:.+]] = gpu.subgroup_reduce maximumf %[[DISTR1]] cluster(size = 2, stride = 16) : (f32) -> f32
177+
// CHECK-DAG: %[[RED1:.+]] = gpu.subgroup_reduce maximumf %[[DISTR1]] cluster(size = 2, stride = 16) : (f32) -> f32
178178
// CHECK-DAG: %[[INS:.+]] = vector.from_elements %[[RED0]], %[[RED1]] : vector<2x1x1xf32>
179179
// CHECK-DAG: arith.maximumf %[[INS]], %[[ACC]] : vector<2x1x1xf32>
180180

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_pipeline.mlir

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@ func.func @_matmul_f16_f16_dispatch_0_fill_3456x1024() {
1414
%c0 = arith.constant 0 : index
1515
%cst = arith.constant 0.000000e+00 : f16
1616
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf16, "COp">
17-
%1 = gpu.thread_id x
18-
%2 = gpu.thread_id y
19-
%3 = gpu.thread_id z
17+
%1 = gpu.thread_id x
18+
%2 = gpu.thread_id y
19+
%3 = gpu.thread_id z
2020
%4 = memref.alloc() : memref<4x32x40xf16, 3>
2121
%5 = memref.alloc() : memref<4x32x40xf16, 3>
2222
%6 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<3456x2048xf16>
@@ -73,9 +73,9 @@ func.func @nvidia_tenscore_schedule_f16() {
7373
%c1280 = arith.constant 1280 : index
7474
%cst_0 = arith.constant 0.000000e+00 : f16
7575
%c0 = arith.constant 0 : index
76-
%0 = gpu.thread_id x
77-
%1 = gpu.thread_id y
78-
%2 = gpu.thread_id z
76+
%0 = gpu.thread_id x
77+
%1 = gpu.thread_id y
78+
%2 = gpu.thread_id z
7979
%alloc = memref.alloc() : memref<128x256xf16, #gpu.address_space<workgroup>>
8080
%alloc_1 = memref.alloc() : memref<3x128x32xf16, #gpu.address_space<workgroup>>
8181
%alloc_2 = memref.alloc() : memref<3x32x256xf16, #gpu.address_space<workgroup>>
@@ -522,9 +522,9 @@ func.func @nvidia_tenscore_schedule_f32() {
522522
%c256 = arith.constant 256 : index
523523
%cst_1 = arith.constant 0.000000e+00 : f32
524524
%c0 = arith.constant 0 : index
525-
%0 = gpu.thread_id x
526-
%1 = gpu.thread_id y
527-
%2 = gpu.thread_id z
525+
%0 = gpu.thread_id x
526+
%1 = gpu.thread_id y
527+
%2 = gpu.thread_id z
528528
%alloc = memref.alloc() : memref<128x128xf32, #gpu.address_space<workgroup>>
529529
%alloc_2 = memref.alloc() : memref<3x128x32xf32, #gpu.address_space<workgroup>>
530530
%alloc_3 = memref.alloc() : memref<3x32x128xf32, #gpu.address_space<workgroup>>

compiler/src/iree/compiler/Codegen/Common/GPU/test/vector_reduction_to_gpu.mlir

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ module {
1818
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) : memref<128x384xf32>
1919
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) : memref<128xf32>
2020
%workgroup_id_x = hal.interface.workgroup.id[0] : index
21-
%thread_id_x = gpu.thread_id x
21+
%thread_id_x = gpu.thread_id x
2222
%2 = affine.apply #map()[%thread_id_x, %workgroup_id_x]
2323
%3 = scf.for %arg0 = %c0 to %c384 step %c32 iter_args(%arg1 = %cst) -> (vector<1xf32>) {
2424
%5 = vector.transfer_read %0[%2, %arg0], %cst_0 {in_bounds = [true]} : memref<128x384xf32>, vector<32xf32>
@@ -41,22 +41,22 @@ module {
4141
// CHECK-DAG: %[[C16:.*]] = arith.constant 16 : i32
4242
// CHECK-DAG: %[[C32:.*]] = arith.constant 32 : i32
4343
// CHECK-DAG: %[[C32I:.*]] = arith.constant 32 : index
44-
// CHECK-DAG: %[[TID:.*]] = gpu.thread_id x
44+
// CHECK-DAG: %[[TID:.*]] = gpu.thread_id x
4545
// CHECK-DAG: %[[VCST:.*]] = arith.constant dense<0.000000e+00> : vector<1xf32>
4646
// CHECK: %[[F:.*]] = scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} iter_args(%[[V0:.*]] = %[[VCST]]) -> (vector<1xf32>) {
4747
// CHECK-DAG: %[[E:.*]] = vector.extract %[[V0]][0] : f32 from vector<1xf32>
4848
// CHECK-DAG: %[[ID:.*]] = affine.apply
4949
// CHECK-DAG: %[[V1:.*]] = vector.transfer_read %{{.*}}[%{{.*}}, %[[ID]]], %{{.*}} {in_bounds = [true]} : memref<128x384xf32>, vector<1xf32>
5050
// CHECK: %[[S:.*]] = vector.extract %[[V1]][0] : f32 from vector<1xf32>
51-
// CHECK: %[[S0:.*]], %{{.*}} = gpu.shuffle xor %[[S]], %[[C1]], %[[C32]] : f32
51+
// CHECK: %[[S0:.*]], %{{.*}} = gpu.shuffle xor %[[S]], %[[C1]], %[[C32]] : f32
5252
// CHECK: %[[S1:.*]] = arith.addf %[[S]], %[[S0]] : f32
53-
// CHECK: %[[S2:.*]], %{{.*}} = gpu.shuffle xor %[[S1]], %[[C2]], %[[C32]] : f32
53+
// CHECK: %[[S2:.*]], %{{.*}} = gpu.shuffle xor %[[S1]], %[[C2]], %[[C32]] : f32
5454
// CHECK: %[[S3:.*]] = arith.addf %[[S1]], %[[S2]] : f32
55-
// CHECK: %[[S4:.*]], %{{.*}} = gpu.shuffle xor %[[S3]], %[[C4]], %[[C32]] : f32
55+
// CHECK: %[[S4:.*]], %{{.*}} = gpu.shuffle xor %[[S3]], %[[C4]], %[[C32]] : f32
5656
// CHECK: %[[S5:.*]] = arith.addf %[[S3]], %[[S4]] : f32
57-
// CHECK: %[[S6:.*]], %{{.*}} = gpu.shuffle xor %[[S5]], %[[C8]], %[[C32]] : f32
57+
// CHECK: %[[S6:.*]], %{{.*}} = gpu.shuffle xor %[[S5]], %[[C8]], %[[C32]] : f32
5858
// CHECK: %[[S7:.*]] = arith.addf %[[S5]], %[[S6]] : f32
59-
// CHECK: %[[S8:.*]], %{{.*}} = gpu.shuffle xor %[[S7]], %[[C16]], %[[C32]] : f32
59+
// CHECK: %[[S8:.*]], %{{.*}} = gpu.shuffle xor %[[S7]], %[[C16]], %[[C32]] : f32
6060
// CHECK: %[[S9:.*]] = arith.addf %[[S7]], %[[S8]] : f32
6161
// CHECK: %[[S10:.*]] = arith.addf %[[S9]], %[[E]] : f32
6262
// CHECK: %[[B:.*]] = vector.broadcast %[[S10]] : f32 to vector<1xf32>
@@ -98,7 +98,7 @@ module {
9898
%6 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%4) : memref<128x384xf32>
9999
%7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%5) : memref<128xf32>
100100
%workgroup_id_x = hal.interface.workgroup.id[0] : index
101-
%thread_id_x = gpu.thread_id x
101+
%thread_id_x = gpu.thread_id x
102102
%8 = affine.apply #map()[%thread_id_x, %workgroup_id_x]
103103
%9 = scf.for %arg0 = %c0 to %c384 step %c32 iter_args(%arg1 = %cst) -> (vector<1xf32>) {
104104
%11 = vector.transfer_read %6[%8, %arg0], %cst_0 {in_bounds = [true]} : memref<128x384xf32>, vector<32xf32>
@@ -156,7 +156,7 @@ module {
156156
%6 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%4) : memref<128x384xf32>
157157
%7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%5) : memref<128xf32>
158158
%workgroup_id_x = hal.interface.workgroup.id[0] : index
159-
%thread_id_x = gpu.thread_id x
159+
%thread_id_x = gpu.thread_id x
160160
%8 = affine.apply #map()[%thread_id_x, %workgroup_id_x]
161161
%9 = scf.for %arg0 = %c0 to %c384 step %c32 iter_args(%arg1 = %cst) -> (vector<1xf32>) {
162162
%11 = vector.transfer_read %6[%8, %arg0], %cst_0 {in_bounds = [true]} : memref<128x384xf32>, vector<32xf32>
@@ -238,7 +238,7 @@ module {
238238
%c4096 = arith.constant 4096 : index
239239
%c512 = arith.constant 512 : index
240240
%cst_1 = arith.constant 0.000000e+00 : f16
241-
%thread_id_x = gpu.thread_id x
241+
%thread_id_x = gpu.thread_id x
242242
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<1x4096xf16, #hal.descriptor_type<storage_buffer>>
243243
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<32000x4096xf16, #hal.descriptor_type<storage_buffer>>
244244
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : memref<1x32000xf16, #hal.descriptor_type<storage_buffer>>

compiler/src/iree/compiler/Codegen/Common/test/convert_unsupported_float_to_int_buffers.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@ func.func @bf16_conversion() {
1919
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xbf16, #spirv.storage_class<StorageBuffer>>{%c8}
2020
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<?xbf16, #spirv.storage_class<StorageBuffer>>{%c8}
2121
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : memref<?xbf16, #spirv.storage_class<StorageBuffer>>{%c8}
22-
%3 = gpu.thread_id x
23-
%4 = gpu.block_dim x
22+
%3 = gpu.thread_id x
23+
%4 = gpu.block_dim x
2424
scf.for %arg0 = %3 to %c8 step %4 {
2525
%5 = memref.load %0[%arg0] : memref<?xbf16, #spirv.storage_class<StorageBuffer>>
2626
%6 = memref.load %1[%arg0] : memref<?xbf16, #spirv.storage_class<StorageBuffer>>

compiler/src/iree/compiler/Codegen/Common/test/hoist_unrolled_vector_extract_insert_slice.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@ func.func @hoist_unrolled_vector_for_mma() {
1515
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : memref<2048x1024xf16>
1616
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) : memref<3456x1024xf32>
1717
%workgroup_id_x = hal.interface.workgroup.id[0] : index
18-
%3 = gpu.thread_id x
19-
%4 = gpu.thread_id y
18+
%3 = gpu.thread_id x
19+
%4 = gpu.thread_id y
2020
%5 = affine.apply affine_map<()[s0, s1] -> (s1 * 32 + (s0 floordiv 8) * 128)>()[%workgroup_id_x, %4]
2121
%6 = affine.apply affine_map<()[s0, s1] -> (s0 * 128 + s1 * 32 - (s0 floordiv 8) * 1024)>()[%workgroup_id_x, %3]
2222
%7 = scf.for %arg0 = %c0 to %c2048 step %c64 iter_args(%arg1 = %cst_0) -> (vector<32x32xf32>) {

compiler/src/iree/compiler/Codegen/Common/test/materialize_tuning_specs.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@
3131
// SKIPLINK-LABEL: module @user_spec
3232
// SKIPLINK-SAME: iree_codegen.tuning_spec_with_default_entrypoint
3333
// SKIPLINK-SAME: transform.with_named_sequence
34-
// SKIPLINK: transform.print {name = "Hello Tuning Spec"}
34+
// SKIPLINK: transform.print {name = "Hello Tuning Spec"}
3535
// SKIPLINK-NOT: module @{{.+}}
3636
// SKIPLINK: module attributes
3737
// SKIPLINK-SAME: iree_codegen.tuning_spec_mlirbc = dense<{{.+}}> : vector<{{[0-9]+}}xi8>

compiler/src/iree/compiler/Codegen/LLVMGPU/test/ROCDL/pipeline_lower_to_llvmgpu.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ hal.executable public @main {
3333
%c0 = arith.constant 0 : index
3434
%alloc = memref.alloc() : memref<1x1x16x40xi8, #gpu.address_space<workgroup>>
3535
%alloc_0 = memref.alloc() : memref<1x32x40xi8, #gpu.address_space<workgroup>>
36-
%thread_id_x = gpu.thread_id x upper_bound 128
36+
%thread_id_x = gpu.thread_id x upper_bound 128
3737
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<32x32x16x16xi8, #hal.descriptor_type<storage_buffer>>
3838
%3 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : memref<32x32x32x3x3xi8, #hal.descriptor_type<storage_buffer>>
3939
%5:2 = affine.delinearize_index %thread_id_x into (16, 8) : index, index

0 commit comments

Comments
 (0)