Commit efd0e54

update

1 parent 5d3415a commit efd0e54

2 files changed, +43 -49 lines changed

compiler/src/iree/compiler/Codegen/Common/GPU/GPUConvertToCoalescedDMA.cpp

Lines changed: 18 additions & 22 deletions
@@ -59,6 +59,15 @@ static SmallVector<Attribute> getThreadMapping(MLIRContext *ctx) {
   return mapping;
 }
 
+/// Trace through extract_slice operations to find an underlying tensor.pad.
+/// Returns the PadOp if found, nullptr otherwise.
+static tensor::PadOp traceToTensorPad(Value source) {
+  while (auto extractSlice = source.getDefiningOp<tensor::ExtractSliceOp>()) {
+    source = extractSlice.getSource();
+  }
+  return source.getDefiningOp<tensor::PadOp>();
+}
+
 /// Check if a value traces back to tensor.empty (possibly through forall args).
 static bool tracesToTensorEmpty(Value value) {
   // Direct tensor.empty.
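For reference, the IR shape this helper walks is the pattern the pass already documents below: an extract_slice chain rooted at a tensor.pad. A minimal MLIR sketch, with illustrative names and shapes rather than ones taken from the tests:

func.func @pad_then_slice(%src: tensor<121x64xf32>) -> tensor<4x64xf32> {
  %zero = arith.constant 0.0 : f32
  %padded = tensor.pad %src low[0, 0] high[7, 0] {
  ^bb0(%i: index, %j: index):
    tensor.yield %zero : f32
  } : tensor<121x64xf32> to tensor<128x64xf32>
  // traceToTensorPad on %slice walks through this extract_slice and returns
  // the tensor.pad above; for any other root it returns nullptr.
  %slice = tensor.extract_slice %padded[0, 0] [4, 64] [1, 1]
      : tensor<128x64xf32> to tensor<4x64xf32>
  return %slice : tensor<4x64xf32>
}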
@@ -310,14 +319,7 @@ static LogicalResult createDMAInForall(scf::ForallOp threadForallOp,
   // After tiling, the input is typically:
   //   tensor.extract_slice %padded[...] [...] [1, 1]
   // We need to trace through extract_slice to find if source is tensor.pad.
-  Value traceSource = input;
-  while (auto extractSlice =
-             traceSource.getDefiningOp<tensor::ExtractSliceOp>()) {
-    traceSource = extractSlice.getSource();
-  }
-
-  // Check if we traced back to a tensor.pad.
-  if (auto pad = traceSource.getDefiningOp<tensor::PadOp>()) {
+  if (auto pad = traceToTensorPad(input)) {
     // Verify pad constraints: low padding must be all zeros, pad value must
     // be 0.
     bool validPad = true;
@@ -328,7 +330,8 @@ static LogicalResult createDMAInForall(scf::ForallOp threadForallOp,
       }
     }
     Value padVal = pad.getConstantPaddingValue();
-    if (!padVal || !matchPattern(padVal, m_AnyZeroFloat())) {
+    if (!padVal || !(matchPattern(padVal, m_AnyZeroFloat()) ||
+                     matchPattern(padVal, m_Zero()))) {
       validPad = false;
     }
 
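The added m_Zero() matcher extends the check beyond float zeros: a constant integer zero padding value now also qualifies, so integer-typed pads can take the DMA path. A hedged MLIR sketch of such a pad (illustrative element type and shapes, not from the test file):

func.func @pad_i8(%src: tensor<121x64xi8>) -> tensor<128x64xi8> {
  // Low padding is all zeros and the padding value is the integer constant 0,
  // so validPad stays true under the updated check.
  %zero = arith.constant 0 : i8
  %padded = tensor.pad %src low[0, 0] high[7, 0] {
  ^bb0(%i: index, %j: index):
    tensor.yield %zero : i8
  } : tensor<121x64xi8> to tensor<128x64xi8>
  return %padded : tensor<128x64xi8>
}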
@@ -490,12 +493,7 @@ struct ConvertPadFusionCopyToCoalescedDMA
     }
 
     // Check if this is a tensor.pad fusion case
-    Value source = copyOp.getInputs()[0];
-    // Trace through extract_slice to find tensor.pad
-    while (auto extractSlice = source.getDefiningOp<tensor::ExtractSliceOp>()) {
-      source = extractSlice.getSource();
-    }
-    auto pad = source.getDefiningOp<tensor::PadOp>();
+    auto pad = traceToTensorPad(copyOp.getInputs()[0]);
     if (!pad) {
       return failure(); // Not a pad fusion case
     }
@@ -874,13 +872,7 @@ struct GPUConvertToCoalescedDMAPass final
     // Check if this is a tensor.pad fusion case.
     bool isPadFusion = false;
     if (auto copyOp = dyn_cast<linalg::CopyOp>(op.getOperation())) {
-      Value source = copyOp.getInputs()[0];
-      // Trace through extract_slice to find tensor.pad
-      while (auto extractSlice =
-                 source.getDefiningOp<tensor::ExtractSliceOp>()) {
-        source = extractSlice.getSource();
-      }
-      if (auto pad = source.getDefiningOp<tensor::PadOp>()) {
+      if (auto pad = traceToTensorPad(copyOp.getInputs()[0])) {
         // Check if padding exists (non-zero low/high pad)
         for (auto [low, high] :
              llvm::zip(pad.getMixedLowPad(), pad.getMixedHighPad())) {
@@ -900,6 +892,10 @@ struct GPUConvertToCoalescedDMAPass final
       // by setting tile sizes to the full shape. This allows the DMA to
       // operate on the full buffer while satisfying the warp-mapped parent
       // requirement.
+      // Bail out if any dimension is dynamic since we need static tile sizes.
+      if (llvm::any_of(shape, ShapedType::isDynamic)) {
+        return failure();
+      }
       for (int64_t i = 0; i < rank; ++i) {
         tileSizes.push_back(rewriter.getIndexAttr(shape[i]));
         ++numTiledDims;
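The new guard matters for anything with dynamic extents: rewriter.getIndexAttr needs concrete numbers, so a shape containing a dynamic dimension now makes the pattern return failure() rather than building tile sizes from a sentinel value. A hedged sketch of an input that would hit this bail-out (illustrative, assuming the shape being tiled is the copy's tensor shape):

func.func @dynamic_copy(%src: tensor<?x64xf32>, %init: tensor<?x64xf32>) -> tensor<?x64xf32> {
  // The leading extent is dynamic, so no static full-shape tile sizes exist
  // and the pad-fusion tiling path gives up on this op.
  %0 = linalg.copy ins(%src : tensor<?x64xf32>) outs(%init : tensor<?x64xf32>) -> tensor<?x64xf32>
  return %0 : tensor<?x64xf32>
}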

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_convert_to_coalesced_dma.mlir

Lines changed: 25 additions & 27 deletions
@@ -1,4 +1,4 @@
-// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-convert-to-coalesced-dma))" %s --split-input-file | FileCheck %s
+// RUN: iree-opt --pass-pipeline="builtin.module(func.func(iree-codegen-gpu-convert-to-coalesced-dma,canonicalize))" %s --split-input-file | FileCheck %s
 
 #gpu_target_copy = #iree_gpu.target<arch = "gfx942", features = "", wgp = <
   compute = fp32, storage = b32, subgroup = shuffle,
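Running canonicalize after the pass is what lets the CHECK lines below drop the %[[IV1]] offsets and intermediate slices: dimensions of the warp forall with only a single iteration end up with a constant 0 offset in the slices, and a full-size, zero-offset, unit-stride extract_slice is an identity and folds to its source. A hedged sketch of the identity-slice fold (illustrative shapes):

func.func @identity_slice(%t: tensor<4x64xf32>) -> tensor<4x64xf32> {
  // Zero offsets, full sizes, unit strides: this slice is an identity, and
  // canonicalization replaces %s with %t, deleting the op.
  %s = tensor.extract_slice %t[0, 0] [4, 64] [1, 1]
      : tensor<4x64xf32> to tensor<4x64xf32>
  return %s : tensor<4x64xf32>
}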
@@ -24,8 +24,8 @@ func.func @copy(%source: tensor<64x512xf32>, %init: tensor<64x512xf32>) -> tenso
 // With 16 warps (128*512/64/64) and 64 rows: step = ceil(64/16) = 4 rows, 512 cols (whole)
 // CHECK: %[[WARP_RESULT:.+]] = scf.forall (%[[IV0:.+]], %[[IV1:.+]]) = (0, 0) to (64, 512) step (4, 512)
 // CHECK-SAME: shared_outs(%[[INIT_TILE:.+]] = %[[INIT]]) -> (tensor<64x512xf32>) {
-// CHECK-DAG: %[[SLICE_SRC:.+]] = tensor.extract_slice %[[SRC]][%[[IV0]], %[[IV1]]] [4, 512] [1, 1]
-// CHECK-DAG: %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV0]], %[[IV1]]] [4, 512] [1, 1]
+// CHECK-DAG: %[[SLICE_SRC:.+]] = tensor.extract_slice %[[SRC]][%[[IV0]], 0] [4, 512] [1, 1]
+// CHECK-DAG: %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV0]], 0] [4, 512] [1, 1]
 
 // Thread-level forall:
 // CHECK: %[[THREAD_RESULT:.+]] = scf.forall (%[[LANE:.+]]) in (64)
@@ -37,7 +37,7 @@ func.func @copy(%source: tensor<64x512xf32>, %init: tensor<64x512xf32>) -> tenso
 // CHECK: } {mapping = [#iree_gpu.lane_id<0>]}
 
 // CHECK: scf.forall.in_parallel {
-// CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV0]], %[[IV1]]] [4, 512] [1, 1]
+// CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV0]], 0] [4, 512] [1, 1]
 // CHECK: }
 // CHECK: } {mapping = [#gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}

@@ -75,22 +75,20 @@ func.func @gather(%source: tensor<64x512xf32>, %indices: tensor<64xi32>, %init:
 // With 64 warps and 64 rows: step = ceil(64/64) = 1 row, 512 cols (whole)
 // CHECK: %[[WARP_RESULT:.+]] = scf.forall (%[[IV0:.+]], %[[IV1:.+]]) = (0, 0) to (64, 512) step (1, 512)
 // CHECK-SAME: shared_outs(%[[INIT_TILE:.+]] = %[[INIT]]) -> (tensor<64x512xf32>) {
-// CHECK-DAG: %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV0]], %[[IV1]]] [1, 512] [1, 1]
+// CHECK-DAG: %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV0]], 0] [1, 512] [1, 1]
 // CHECK-DAG: %[[SLICE_INDICES:.+]] = tensor.extract_slice %[[INDICES]][%[[IV0]]] [1] [1]
-// CHECK-DAG: %[[SLICE_SRC:.+]] = tensor.extract_slice %[[SRC]][0, %[[IV1]]] [64, 512] [1, 1]
 
 // Thread-level forall:
 // CHECK: %[[THREAD_RESULT:.+]] = scf.forall (%[[LANE:.+]]) in (64)
 // CHECK-SAME: shared_outs(%[[THREAD_INIT:.+]] = %[[SLICE_DST]]) -> (tensor<1x512xf32>) {
-// CHECK-DAG: %[[SLICE_INDICES_INNER:.+]] = tensor.extract_slice %[[SLICE_INDICES]][0] [1] [1]
 // CHECK: scf.forall.in_parallel {
-// CHECK: iree_gpu.coalesced_gather_dma %[[SLICE_SRC]][%[[SLICE_INDICES_INNER]]] into %[[THREAD_INIT]] lane(%[[LANE]])
+// CHECK: iree_gpu.coalesced_gather_dma %[[SRC]][%[[SLICE_INDICES]]] into %[[THREAD_INIT]] lane(%[[LANE]])
 // CHECK-SAME: : tensor<64x512xf32>, tensor<1xi32>, tensor<1x512xf32>, index
 // CHECK: }
 // CHECK: } {mapping = [#iree_gpu.lane_id<0>]}
 
 // CHECK: scf.forall.in_parallel {
-// CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV0]], %[[IV1]]] [1, 512] [1, 1]
+// CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV0]], 0] [1, 512] [1, 1]
 // CHECK: }
 // CHECK: } {mapping = [#gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}

@@ -170,9 +168,9 @@ func.func @copy_prefer_contiguous_subview(%source: tensor<64x128xf32>, %init: te
 // CHECK-SAME: shared_outs(%[[INIT_TILE:.+]] = %[[INIT]]) -> (tensor<64x128xf32>) {
 
 // Key check: subviews are 16x128 (contiguous) not 64x64 (non-contiguous)
-// CHECK-DAG: %[[SLICE_SRC:.+]] = tensor.extract_slice %[[SRC]][%[[IV0]], %[[IV1]]] [16, 128] [1, 1]
+// CHECK-DAG: %[[SLICE_SRC:.+]] = tensor.extract_slice %[[SRC]][%[[IV0]], 0] [16, 128] [1, 1]
 // CHECK-SAME: : tensor<64x128xf32> to tensor<16x128xf32>
-// CHECK-DAG: %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV0]], %[[IV1]]] [16, 128] [1, 1]
+// CHECK-DAG: %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV0]], 0] [16, 128] [1, 1]
 // CHECK-SAME: : tensor<64x128xf32> to tensor<16x128xf32>
 
 // Thread-level forall distributes across lanes:
@@ -185,7 +183,7 @@ func.func @copy_prefer_contiguous_subview(%source: tensor<64x128xf32>, %init: te
 // CHECK: } {mapping = [#iree_gpu.lane_id<0>]}
 
 // CHECK: scf.forall.in_parallel {
-// CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV0]], %[[IV1]]] [16, 128] [1, 1]
+// CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV0]], 0] [16, 128] [1, 1]
 // CHECK-SAME: : tensor<16x128xf32> into tensor<64x128xf32>
 // CHECK: }
 // CHECK: } {mapping = [#gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
@@ -231,9 +229,9 @@ func.func @copy_small_innermost_linearized(%source: tensor<128x16xf32>) -> tenso
 // Warp-level forall: step (32, 16) distributes 128 rows across 4 warps
 // CHECK: %[[WARP_RESULT:.+]] = scf.forall (%[[IV0:.+]], %[[IV1:.+]]) = (0, 0) to (128, 16) step (32, 16)
 // CHECK-SAME: shared_outs(%[[INIT_TILE:.+]] = %[[EMPTY]]) -> (tensor<128x16xf32>) {
-// CHECK-DAG: %[[SLICE_SRC:.+]] = tensor.extract_slice %[[SRC]][%[[IV0]], %[[IV1]]] [32, 16] [1, 1]
+// CHECK-DAG: %[[SLICE_SRC:.+]] = tensor.extract_slice %[[SRC]][%[[IV0]], 0] [32, 16] [1, 1]
 // CHECK-SAME: : tensor<128x16xf32> to tensor<32x16xf32>
-// CHECK-DAG: %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV0]], %[[IV1]]] [32, 16] [1, 1]
+// CHECK-DAG: %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV0]], 0] [32, 16] [1, 1]
 // CHECK-SAME: : tensor<128x16xf32> to tensor<32x16xf32>
 
 // Thread-level forall with 64 lanes
@@ -246,7 +244,7 @@ func.func @copy_small_innermost_linearized(%source: tensor<128x16xf32>) -> tenso
 // CHECK: } {mapping = [#iree_gpu.lane_id<0>]}
 
 // CHECK: scf.forall.in_parallel {
-// CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV0]], %[[IV1]]] [32, 16] [1, 1]
+// CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV0]], 0] [32, 16] [1, 1]
 // CHECK-SAME: : tensor<32x16xf32> into tensor<128x16xf32>
 // CHECK: }
 // CHECK: } {mapping = [#gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
@@ -451,11 +449,13 @@ func.func @copy_with_tensor_pad_fusion(%source: tensor<121x64xf32>, %init: tenso
 
 // Key check: tensor.pad is fused - source is the extract_slice result, not the padded tensor
 // in_bounds = [false, true] because M dim has dynamic padding, K dim has no padding
-// CHECK: scf.forall (%[[LANE:.+]]) in (64)
-// CHECK: scf.forall.in_parallel {
-// CHECK: iree_gpu.coalesced_gather_dma %[[EXTRACTED:.*]] into %{{.+}} lane(%[[LANE]]) in_bounds [false, true]
+// CHECK: %[[EXTRACTED:.+]] = tensor.extract_slice %[[SRC]]
+// CHECK: scf.forall {{.*}} shared_outs(%[[OUTER_INIT:.+]] = %[[INIT]])
+// CHECK: scf.forall (%[[LANE:.+]]) in (64) shared_outs(%[[INNER_INIT:.+]] = %[[OUTER_INIT]])
+// CHECK: scf.forall.in_parallel {
+// CHECK: iree_gpu.coalesced_gather_dma %[[EXTRACTED]] into %[[INNER_INIT]] lane(%[[LANE]]) in_bounds [false, true]
 // CHECK-SAME: : tensor<?x64xf32>, tensor<4x64xf32>, index
-// CHECK: }
+// CHECK: }
 // CHECK-NOT: tensor.pad
 
   return %result : tensor<4x64xf32>
@@ -504,24 +504,22 @@ func.func @copy_with_tensor_pad_fusion_multi_warp(%source: tensor<121x64xf32>, %
 // forall with step (1, 64) producing 4 iterations with 1x64 subviews.
 // For tensor.pad fusion, we instead create a single-iteration wrapper forall
 // with step (4, 64) - the full shape - so the DMA operates on 4x64 directly.
+// After canonicalization, identity extract_slices are eliminated.
 //
+// CHECK: %[[EXTRACTED:.+]] = tensor.extract_slice %[[SRC]]
 // CHECK: %[[WARP_RESULT:.+]] = scf.forall (%[[IV0:.+]], %[[IV1:.+]]) = (0, 0) to (4, 64) step (4, 64)
 // CHECK-SAME: shared_outs(%[[INIT_TILE:.+]] = %[[INIT]]) -> (tensor<4x64xf32>) {
 //
-// The subviews should be the full 4x64 shape:
-// CHECK-DAG: %[[SLICE_SRC:.+]] = tensor.extract_slice {{.+}}[%[[IV0]], %[[IV1]]] [4, 64] [1, 1]
-// CHECK-DAG: %[[SLICE_DST:.+]] = tensor.extract_slice %[[INIT_TILE]][%[[IV0]], %[[IV1]]] [4, 64] [1, 1]
-//
-// Thread-level forall with 64 lanes:
-// CHECK: %[[THREAD_RESULT:.+]] = scf.forall (%[[LANE:.+]]) in (64)
+// Thread-level forall with 64 lanes (uses outer forall's shared_out directly):
+// CHECK: %[[THREAD_RESULT:.+]] = scf.forall (%[[LANE:.+]]) in (64) shared_outs(%[[INNER_INIT:.+]] = %[[INIT_TILE]])
 // CHECK: scf.forall.in_parallel {
-// CHECK: iree_gpu.coalesced_gather_dma {{.+}} into %{{.+}} lane(%[[LANE]]) in_bounds [false, true]
+// CHECK: iree_gpu.coalesced_gather_dma %[[EXTRACTED]] into %[[INNER_INIT]] lane(%[[LANE]]) in_bounds [false, true]
 // CHECK-SAME: : tensor<?x64xf32>, tensor<4x64xf32>, index
 // CHECK: }
 // CHECK: } {mapping = [#iree_gpu.lane_id<0>]}
 //
 // CHECK: scf.forall.in_parallel {
-// CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][%[[IV0]], %[[IV1]]] [4, 64] [1, 1]
+// CHECK: tensor.parallel_insert_slice %[[THREAD_RESULT]] into %[[INIT_TILE]][0, 0] [4, 64] [1, 1]
 // CHECK: }
 // CHECK: } {mapping = [#gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
 // CHECK-NOT: tensor.pad
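For orientation, the single-iteration wrapper the comments describe sets the step equal to the full extent in every dimension, so the warp forall body runs exactly once over the whole 4x64 tile. A minimal hedged sketch of that structure (illustrative SSA names, not pulled from the pass output):

func.func @wrapper(%init: tensor<4x64xf32>, %tile: tensor<4x64xf32>) -> tensor<4x64xf32> {
  // step == extent in both dims: (%iv0, %iv1) is only ever (0, 0), so the
  // body sees the full 4x64 destination rather than a 1x64 subview.
  %res = scf.forall (%iv0, %iv1) = (0, 0) to (4, 64) step (4, 64)
      shared_outs(%out = %init) -> (tensor<4x64xf32>) {
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %tile into %out[%iv0, %iv1] [4, 64] [1, 1]
          : tensor<4x64xf32> into tensor<4x64xf32>
    }
  } {mapping = [#gpu.warp<linear_dim_1>, #gpu.warp<linear_dim_0>]}
  return %res : tensor<4x64xf32>
}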
