
Commit 33e2146

[Codegen] Add corner case for SwapExtractWithCollapsePattern (iree-org#21773)
Fix the second case in iree-org#21660. The performance regressed because the `collapse_shape` wasn't fused into the nested loops properly. The example is a special case in which the collapsed offset does not come from an `affine.apply` op but is directly a block argument. After this fix, the performance of the example configuration improves together with the pre-padding work.

To get an idea of the perf change for `convbfp16 -n 16 -c 40 -H 192 -W 128 -k 40 -y 3 -x 3 -p 1 -q 1 -u 2 -v 2 -l 1 -j 1 -m conv -g 1 -F 1 -t 1 --in_layout NHWC --out_layout NHWC --fil_layout NHWC --iter 100`:

- Baseline without pre-padding: 452 us.
- Pre-padding without the fix: 920 us.
- Pre-padding with the fix: 220 us.

---------

Signed-off-by: yzhang93 <[email protected]>
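For context, the rewrite can be sketched roughly as follows. This is a hand-written illustration distilled from the `@swap_collapse_shape_with_extract_slice_block_arg` lit test added below; the value names (`%c0`, `%c1`, `%c16`, `%iv`, `%acc`) and the elided copy/yield details are illustrative, not the exact output of the pass.

// The collapse_shape feeds a linalg.copy tiled with reduction size [0, 1].
// After loop normalization the slice offset is the scf.for induction
// variable itself (a block argument) rather than an affine.apply result,
// so the old pattern gave up and left the collapse_shape outside the loop:
%collapsed = tensor.collapse_shape %arg0 [[0], [1, 2, 3]]
    : tensor<16x1x1x16xf32> into tensor<16x16xf32>
%0 = scf.for %iv = %c0 to %c16 step %c1 iter_args(%acc = %empty) -> (tensor<16x16xf32>) {
  %slice = tensor.extract_slice %collapsed[0, %iv] [16, 1] [1, 1]
      : tensor<16x16xf32> to tensor<16x1xf32>
  // ... tiled linalg.copy of %slice into %acc, scf.yield ...
}

// With the fix, the slice is taken from the expanded source and the
// collapse_shape is swapped into the loop body, so it is fused into the
// nested loops as described above:
%0 = scf.for %iv = %c0 to %c16 step %c1 iter_args(%acc = %empty) -> (tensor<16x16xf32>) {
  %slice = tensor.extract_slice %arg0[0, 0, 0, %iv] [16, 1, 1, 1] [1, 1, 1, 1]
      : tensor<16x1x1x16xf32> to tensor<16x1x1x1xf32>
  %collapsed_slice = tensor.collapse_shape %slice [[0], [1, 2, 3]]
      : tensor<16x1x1x1xf32> into tensor<16x1xf32>
  // ... tiled linalg.copy of %collapsed_slice into %acc, scf.yield ...
}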
1 parent f1e9219 commit 33e2146


2 files changed: +68 −30 lines changed

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_tiling_level.mlir

Lines changed: 28 additions & 2 deletions
@@ -548,15 +548,41 @@ func.func @partial_reduction(%3: tensor<?x?xf32>) -> tensor<?xf32> {
 
 // -----
 
+#config = #iree_gpu.lowering_config<{reduction = [0, 1]}>
+func.func @swap_collapse_shape_with_extract_slice_block_arg(%arg0: tensor<16x1x1x16xf32>) -> tensor<16x16xf32> {
+  %collapsed = tensor.collapse_shape %arg0 [[0], [1, 2, 3]] : tensor<16x1x1x16xf32> into tensor<16x16xf32>
+  %empty = tensor.empty() : tensor<16x16xf32>
+  %0 = linalg.copy {lowering_config = #config} ins(%collapsed : tensor<16x16xf32>) outs(%empty : tensor<16x16xf32>) -> tensor<16x16xf32>
+  return %0: tensor<16x16xf32>
+}
+
+// NORM-REDUCTION-LABEL: func.func @swap_collapse_shape_with_extract_slice_block_arg
+// NORM-REDUCTION-DAG: %[[C1:.+]] = arith.constant 1 : index
+// NORM-REDUCTION-DAG: %[[C16:.+]] = arith.constant 16 : index
+// NORM-REDUCTION-DAG: %[[C0:.+]] = arith.constant 0 : index
+// NORM-REDUCTION: scf.for %[[ARG1:.+]] = %[[C0]] to %[[C16]] step %[[C1]]
+// NORM-REDUCTION: %[[SLICE:.+]] = tensor.extract_slice %{{.*}}[0, 0, 0, %[[ARG1]]] [16, 1, 1, 1] [1, 1, 1, 1] : tensor<16x1x1x16xf32> to tensor<16x1x1x1xf32>
+// NORM-REDUCTION: %[[COLLAPSE:.+]] = tensor.collapse_shape %[[SLICE]] {{\[}}[0], [1, 2, 3]] : tensor<16x1x1x1xf32> into tensor<16x1xf32>
+// NORM-REDUCTION: linalg.copy {{.*}} ins(%[[COLLAPSE]]
+
+// Without loop normalization, no swap would happen.
+// CHECK: tensor.collapse_shape
+// CHECK: scf.for
+// CHECK: tensor.extract_slice
+// CHECK-NOT: tensor.collapse_shape
+// CHECK: linalg.copy
+
+// -----
+
 #config = #iree_gpu.lowering_config<{reduction = [0, 32]}>
-func.func @swap_collapse_shape_with_extract_slice(%arg0: tensor<32x3x3x288xf32>) -> tensor<32x2592xf32> {
+func.func @swap_collapse_shape_with_extract_slice_apply_op(%arg0: tensor<32x3x3x288xf32>) -> tensor<32x2592xf32> {
   %collapsed = tensor.collapse_shape %arg0 [[0], [1, 2, 3]] : tensor<32x3x3x288xf32> into tensor<32x2592xf32>
   %empty = tensor.empty() : tensor<32x2592xf32>
   %0 = linalg.copy {lowering_config = #config} ins(%collapsed : tensor<32x2592xf32>) outs(%empty : tensor<32x2592xf32>) -> tensor<32x2592xf32>
   return %0: tensor<32x2592xf32>
 }
 
-// NORM-REDUCTION-LABEL: func.func @swap_collapse_shape_with_extract_slice
+// NORM-REDUCTION-LABEL: func.func @swap_collapse_shape_with_extract_slice_apply_op
 // NORM-REDUCTION-DAG: %[[C1:.+]] = arith.constant 1 : index
 // NORM-REDUCTION-DAG: %[[C81:.+]] = arith.constant 81 : index
 // NORM-REDUCTION-DAG: %[[C0:.+]] = arith.constant 0 : index

compiler/src/iree/compiler/Codegen/Common/Transforms.cpp

Lines changed: 40 additions & 28 deletions
@@ -476,41 +476,53 @@ swapCollapseShapeWithSlice(RewriterBase &rewriter,
   // IGEMM, while the offset is dynamic and the size is static.
   if (isa<Attribute>(collapsedSize) && isa<Value>(collapsedOffset) &&
       reassocIndices.size() != 1) {
-    // Check if offset is from affine.apply of form (d0 * K) or (K * d0).
-    auto applyOp = collapsedOffset.dyn_cast<Value>()
-                       .getDefiningOp<affine::AffineApplyOp>();
-    if (!applyOp) {
-      return rewriter.notifyMatchFailure(sliceOp,
-                                         "offset is not from affine.apply");
-    }
-
-    AffineMap map = applyOp.getAffineMap();
-    if (map.getNumResults() != 1) {
-      return rewriter.notifyMatchFailure(
-          sliceOp, "affine.apply must have only one result");
-    }
-
     auto maybeStaticSize = getConstantIntValue(collapsedSize);
     if (!maybeStaticSize) {
       return rewriter.notifyMatchFailure(sliceOp,
                                          "collapsed size must be static");
     }
-
-    // Compose all nested affine.apply chains and check if the offset is
-    // multiple of collapsed size.
-    SmallVector<Value> operands(applyOp.getOperands());
-    affine::fullyComposeAffineMapAndOperands(&map, &operands);
-    map = simplifyAffineMap(map);
-    if (!map.getResult(0).isMultipleOf(maybeStaticSize.value())) {
-      return rewriter.notifyMatchFailure(
-          sliceOp, "offset multiplier must be multiple of collapsed size");
-    }
-
-    unsigned lastReassocSize = srcShape[reassocIndices.back()];
-    if (lastReassocSize % maybeStaticSize.value() != 0) {
+    auto staticSize = maybeStaticSize.value();
+
+    // Check if offset is from a block argument or an affine.apply op of
+    // form (d0 * K) or (K * d0).
+    auto offsetVal = cast<Value>(collapsedOffset);
+    auto collapseDefOp = offsetVal.getDefiningOp();
+    if (isa<BlockArgument>(offsetVal)) {
+      // The loop is already normalized.
+      if (staticSize != 1) {
+        return rewriter.notifyMatchFailure(
+            sliceOp, "collapsed size must be 1 when the collapsed offset "
+                     "is a block argument");
+      }
+    } else if (auto applyOp =
+                   dyn_cast<affine::AffineApplyOp>(collapseDefOp)) {
+      AffineMap map = applyOp.getAffineMap();
+      if (map.getNumResults() != 1) {
+        return rewriter.notifyMatchFailure(
+            sliceOp, "affine.apply must have only one result");
+      }
+
+      // Compose all nested affine.apply chains and check if the offset is
+      // multiple of collapsed size.
+      SmallVector<Value> operands(applyOp.getOperands());
+      affine::fullyComposeAffineMapAndOperands(&map, &operands);
+      map = simplifyAffineMap(map);
+      if (!map.getResult(0).isMultipleOf(staticSize)) {
+        return rewriter.notifyMatchFailure(
+            sliceOp,
+            "offset multiplier must be multiple of collapsed size");
+      }
+
+      unsigned lastReassocSize = srcShape[reassocIndices.back()];
+      if (lastReassocSize % staticSize != 0) {
+        return rewriter.notifyMatchFailure(
+            sliceOp,
+            "the last expanded size is not divisible by collapse size");
+      }
+    } else {
       return rewriter.notifyMatchFailure(
           sliceOp,
-          "the last expanded size is not divisible by collapse size");
+          "offset is not from a block argument or affine.apply op");
     }
 
     // Calculate expanded offsets and sizes.
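To summarize the updated check, here is an informal sketch of the two offset forms it now accepts; `%collapsed_a`, `%collapsed_b`, `%iv`, and the surrounding loops are hypothetical, with shapes and tile sizes borrowed from the two lit tests above.

// Previously handled form: the offset comes from an affine.apply whose
// single result is a multiple of the static collapsed size (32 here),
// and the innermost expanded dim (288) is divisible by that size.
%off = affine.apply affine_map<(d0) -> (d0 * 32)>(%iv)
%slice_a = tensor.extract_slice %collapsed_a[0, %off] [32, 32] [1, 1]
    : tensor<32x2592xf32> to tensor<32x32xf32>

// Newly handled form: after loop normalization the offset is the loop
// induction variable itself (a block argument); the swap is only applied
// when the static collapsed size is 1.
%slice_b = tensor.extract_slice %collapsed_b[0, %iv] [16, 1] [1, 1]
    : tensor<16x16xf32> to tensor<16x1xf32>

// Any other offset form is rejected:
// "offset is not from a block argument or affine.apply op".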
