-// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-convert-to-coalesced-dma))" %s --split-input-file | FileCheck %s
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-convert-to-coalesced-dma,canonicalize))" %s --split-input-file | FileCheck %s

 #gpu_target_copy = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
   compute = fp32, storage = b32, subgroup = shuffle,
@@ -24,8 +24,8 @@ func.func @copy(%source: tensor<64x512xf32>, %init: tensor<64x512xf32>) -> tenso
 // With 16 warps (128*512/64/64) and 64 rows: step = ceil(64/16) = 4 rows, 512 cols (whole)
 // CHECK: %[[WARP_RESULT:.+]] = scf.forall (%[[IV0:.+]], %[[IV1:.+]]) = (0, 0) to (64, 512) step (4, 512)
 // CHECK-SAME: shared_outs(%[[INIT_TILE:.+]] = %[[INIT]]) -> (tensor<64x512xf32>) {
-// CHECK-DAG: %[[SLICE_SRC:.+]] = tensor.extract_slice %[[SRC]][%[[IV0]], %[[IV1]]] [4, 512] [1, 1]
-// CHECK-DAG: %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV0]], %[[IV1]]] [4, 512] [1, 1]
+// CHECK-DAG: %[[SLICE_SRC:.+]] = tensor.extract_slice %[[SRC]][%[[IV0]], 0] [4, 512] [1, 1]
+// CHECK-DAG: %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV0]], 0] [4, 512] [1, 1]

 // Thread-level forall:
 // CHECK: %[[THREAD_RESULT:.+]] = scf.forall (%[[LANE:.+]]) in (64)
@@ -37,7 +37,7 @@ func.func @copy(%source: tensor<64x512xf32>, %init: tensor<64x512xf32>) -> tenso
 // CHECK: } {mapping = [#iree_gpu.lane_id<0>]}

 // CHECK: scf.forall.in_parallel {
-// CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV0]], %[[IV1]]] [4, 512] [1, 1]
+// CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV0]], 0] [4, 512] [1, 1]
 // CHECK: }
 // CHECK: } {mapping = [#gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}

@@ -75,22 +75,20 @@ func.func @gather(%source: tensor<64x512xf32>, %indices: tensor<64xi32>, %init:
 // With 64 warps and 64 rows: step = ceil(64/64) = 1 row, 512 cols (whole)
 // CHECK: %[[WARP_RESULT:.+]] = scf.forall (%[[IV0:.+]], %[[IV1:.+]]) = (0, 0) to (64, 512) step (1, 512)
 // CHECK-SAME: shared_outs(%[[INIT_TILE:.+]] = %[[INIT]]) -> (tensor<64x512xf32>) {
-// CHECK-DAG: %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV0]], %[[IV1]]] [1, 512] [1, 1]
+// CHECK-DAG: %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV0]], 0] [1, 512] [1, 1]
 // CHECK-DAG: %[[SLICE_INDICES:.+]] = tensor.extract_slice %[[INDICES]][%[[IV0]]] [1] [1]
-// CHECK-DAG: %[[SLICE_SRC:.+]] = tensor.extract_slice %[[SRC]][0, %[[IV1]]] [64, 512] [1, 1]

 // Thread-level forall:
 // CHECK: %[[THREAD_RESULT:.+]] = scf.forall (%[[LANE:.+]]) in (64)
 // CHECK-SAME: shared_outs(%[[THREAD_INIT:.+]] = %[[SLICE_DST]]) -> (tensor<1x512xf32>) {
-// CHECK-DAG: %[[SLICE_INDICES_INNER:.+]] = tensor.extract_slice %[[SLICE_INDICES]][0] [1] [1]
 // CHECK: scf.forall.in_parallel {
-// CHECK: iree_gpu.coalesced_gather_dma %[[SLICE_SRC]][%[[SLICE_INDICES_INNER]]] into %[[THREAD_INIT]] lane(%[[LANE]])
+// CHECK: iree_gpu.coalesced_gather_dma %[[SRC]][%[[SLICE_INDICES]]] into %[[THREAD_INIT]] lane(%[[LANE]])
 // CHECK-SAME: : tensor<64x512xf32>, tensor<1xi32>, tensor<1x512xf32>, index
 // CHECK: }
 // CHECK: } {mapping = [#iree_gpu.lane_id<0>]}

 // CHECK: scf.forall.in_parallel {
-// CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV0]], %[[IV1]]] [1, 512] [1, 1]
+// CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV0]], 0] [1, 512] [1, 1]
 // CHECK: }
 // CHECK: } {mapping = [#gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}

@@ -170,9 +168,9 @@ func.func @copy_prefer_contiguous_subview(%source: tensor<64x128xf32>, %init: te
 // CHECK-SAME: shared_outs(%[[INIT_TILE:.+]] = %[[INIT]]) -> (tensor<64x128xf32>) {

 // Key check: subviews are 16x128 (contiguous) not 64x64 (non-contiguous)
-// CHECK-DAG: %[[SLICE_SRC:.+]] = tensor.extract_slice %[[SRC]][%[[IV0]], %[[IV1]]] [16, 128] [1, 1]
+// CHECK-DAG: %[[SLICE_SRC:.+]] = tensor.extract_slice %[[SRC]][%[[IV0]], 0] [16, 128] [1, 1]
 // CHECK-SAME: : tensor<64x128xf32> to tensor<16x128xf32>
-// CHECK-DAG: %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV0]], %[[IV1]]] [16, 128] [1, 1]
+// CHECK-DAG: %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV0]], 0] [16, 128] [1, 1]
 // CHECK-SAME: : tensor<64x128xf32> to tensor<16x128xf32>

 // Thread-level forall distributes across lanes:
@@ -185,7 +183,7 @@ func.func @copy_prefer_contiguous_subview(%source: tensor<64x128xf32>, %init: te
 // CHECK: } {mapping = [#iree_gpu.lane_id<0>]}

 // CHECK: scf.forall.in_parallel {
-// CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV0]], %[[IV1]]] [16, 128] [1, 1]
+// CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV0]], 0] [16, 128] [1, 1]
 // CHECK-SAME: : tensor<16x128xf32> into tensor<64x128xf32>
 // CHECK: }
 // CHECK: } {mapping = [#gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
@@ -231,9 +229,9 @@ func.func @copy_small_innermost_linearized(%source: tensor<128x16xf32>) -> tenso
 // Warp-level forall: step (32, 16) distributes 128 rows across 4 warps
 // CHECK: %[[WARP_RESULT:.+]] = scf.forall (%[[IV0:.+]], %[[IV1:.+]]) = (0, 0) to (128, 16) step (32, 16)
 // CHECK-SAME: shared_outs(%[[INIT_TILE:.+]] = %[[EMPTY]]) -> (tensor<128x16xf32>) {
-// CHECK-DAG: %[[SLICE_SRC:.+]] = tensor.extract_slice %[[SRC]][%[[IV0]], %[[IV1]]] [32, 16] [1, 1]
+// CHECK-DAG: %[[SLICE_SRC:.+]] = tensor.extract_slice %[[SRC]][%[[IV0]], 0] [32, 16] [1, 1]
 // CHECK-SAME: : tensor<128x16xf32> to tensor<32x16xf32>
-// CHECK-DAG: %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV0]], %[[IV1]]] [32, 16] [1, 1]
+// CHECK-DAG: %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV0]], 0] [32, 16] [1, 1]
 // CHECK-SAME: : tensor<128x16xf32> to tensor<32x16xf32>

 // Thread-level forall with 64 lanes
@@ -246,7 +244,7 @@ func.func @copy_small_innermost_linearized(%source: tensor<128x16xf32>) -> tenso
 // CHECK: } {mapping = [#iree_gpu.lane_id<0>]}

 // CHECK: scf.forall.in_parallel {
-// CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV0]], %[[IV1]]] [32, 16] [1, 1]
+// CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV0]], 0] [32, 16] [1, 1]
 // CHECK-SAME: : tensor<32x16xf32> into tensor<128x16xf32>
 // CHECK: }
 // CHECK: } {mapping = [#gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
@@ -451,11 +449,13 @@ func.func @copy_with_tensor_pad_fusion(%source: tensor<121x64xf32>, %init: tenso

 // Key check: tensor.pad is fused - source is the extract_slice result, not the padded tensor
 // in_bounds = [false, true] because M dim has dynamic padding, K dim has no padding
-// CHECK: scf.forall (%[[LANE:.+]]) in (64)
-// CHECK: scf.forall.in_parallel {
-// CHECK: iree_gpu.coalesced_gather_dma %[[EXTRACTED:.*]] into %{{.+}} lane(%[[LANE]]) in_bounds [false, true]
+// CHECK: %[[EXTRACTED:.+]] = tensor.extract_slice %[[SRC]]
+// CHECK: scf.forall {{.*}} shared_outs(%[[OUTER_INIT:.+]] = %[[INIT]])
+// CHECK: scf.forall (%[[LANE:.+]]) in (64) shared_outs(%[[INNER_INIT:.+]] = %[[OUTER_INIT]])
+// CHECK: scf.forall.in_parallel {
+// CHECK: iree_gpu.coalesced_gather_dma %[[EXTRACTED]] into %[[INNER_INIT]] lane(%[[LANE]]) in_bounds [false, true]
 // CHECK-SAME: : tensor<?x64xf32>, tensor<4x64xf32>, index
-// CHECK: }
+// CHECK: }
 // CHECK-NOT: tensor.pad

   return %result : tensor<4x64xf32>
@@ -504,24 +504,22 @@ func.func @copy_with_tensor_pad_fusion_multi_warp(%source: tensor<121x64xf32>, %
 // forall with step (1, 64) producing 4 iterations with 1x64 subviews.
 // For tensor.pad fusion, we instead create a single-iteration wrapper forall
 // with step (4, 64) - the full shape - so the DMA operates on 4x64 directly.
+// After canonicalization, identity extract_slices are eliminated.
 //
+// CHECK: %[[EXTRACTED:.+]] = tensor.extract_slice %[[SRC]]
 // CHECK: %[[WARP_RESULT:.+]] = scf.forall (%[[IV0:.+]], %[[IV1:.+]]) = (0, 0) to (4, 64) step (4, 64)
 // CHECK-SAME: shared_outs(%[[INIT_TILE:.+]] = %[[INIT]]) -> (tensor<4x64xf32>) {
 //
-// The subviews should be the full 4x64 shape:
-// CHECK-DAG: %[[SLICE_SRC:.+]] = tensor.extract_slice {{.+}}[%[[IV0]], %[[IV1]]] [4, 64] [1, 1]
-// CHECK-DAG: %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV0]], %[[IV1]]] [4, 64] [1, 1]
-//
-// Thread-level forall with 64 lanes:
-// CHECK: %[[THREAD_RESULT:.+]] = scf.forall (%[[LANE:.+]]) in (64)
+// Thread-level forall with 64 lanes (uses outer forall's shared_out directly):
+// CHECK: %[[THREAD_RESULT:.+]] = scf.forall (%[[LANE:.+]]) in (64) shared_outs(%[[INNER_INIT:.+]] = %[[INIT_TILE]])
 // CHECK: scf.forall.in_parallel {
-// CHECK: iree_gpu.coalesced_gather_dma {{.+}} into %{{.+}} lane(%[[LANE]]) in_bounds [false, true]
+// CHECK: iree_gpu.coalesced_gather_dma %[[EXTRACTED]] into %[[INNER_INIT]] lane(%[[LANE]]) in_bounds [false, true]
 // CHECK-SAME: : tensor<?x64xf32>, tensor<4x64xf32>, index
 // CHECK: }
 // CHECK: } {mapping = [#iree_gpu.lane_id<0>]}
 //
 // CHECK: scf.forall.in_parallel {
-// CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV0]], %[[IV1]]] [4, 64] [1, 1]
+// CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][0, 0] [4, 64] [1, 1]
 // CHECK: }
 // CHECK: } {mapping = [#gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
 // CHECK-NOT: tensor.pad