
Commit 33e2146

[Codegen] Add corner case for SwapExtractWithCollapsePattern (iree-org#21773)
Fix the second case in iree-org#21660. The performance regressed because the `collapse_shape` wasn't fused into the nested loops properly. The example is a special case in which the collapsed offset does not come from an `affine.apply` op but is directly a block argument. After this fix, the performance of the example configuration improves together with the pre-padding work.

To get an idea of the perf change for `convbfp16 -n 16 -c 40 -H 192 -W 128 -k 40 -y 3 -x 3 -p 1 -q 1 -u 2 -v 2 -l 1 -j 1 -m conv -g 1 -F 1 -t 1 --in_layout NHWC --out_layout NHWC --fil_layout NHWC --iter 100`:

- Baseline without pre-padding: 452 us.
- Pre-padding without the fix: 920 us.
- Pre-padding with the fix: 220 us.

---------

Signed-off-by: yzhang93 <[email protected]>
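For context, the rewrite can be sketched roughly as follows. This is a hand-written illustration distilled from the `@swap_collapse_shape_with_extract_slice_block_arg` lit test added below; the value names (`%c0`, `%c1`, `%c16`, `%iv`, `%acc`) and the elided copy/yield details are illustrative, not the exact output of the pass.

// The collapse_shape feeds a linalg.copy tiled with reduction size [0, 1].
// After loop normalization the slice offset is the scf.for induction
// variable itself (a block argument) rather than an affine.apply result,
// so the old pattern gave up and left the collapse_shape outside the loop:
%collapsed = tensor.collapse_shape %arg0 [[0], [1, 2, 3]]
    : tensor<16x1x1x16xf32> into tensor<16x16xf32>
%0 = scf.for %iv = %c0 to %c16 step %c1 iter_args(%acc = %empty) -> (tensor<16x16xf32>) {
  %slice = tensor.extract_slice %collapsed[0, %iv] [16, 1] [1, 1]
      : tensor<16x16xf32> to tensor<16x1xf32>
  // ... tiled linalg.copy of %slice into %acc, scf.yield ...
}

// With the fix, the slice is taken from the expanded source and the
// collapse_shape is swapped into the loop body, so it is fused into the
// nested loops as described above:
%0 = scf.for %iv = %c0 to %c16 step %c1 iter_args(%acc = %empty) -> (tensor<16x16xf32>) {
  %slice = tensor.extract_slice %arg0[0, 0, 0, %iv] [16, 1, 1, 1] [1, 1, 1, 1]
      : tensor<16x1x1x16xf32> to tensor<16x1x1x1xf32>
  %collapsed_slice = tensor.collapse_shape %slice [[0], [1, 2, 3]]
      : tensor<16x1x1x1xf32> into tensor<16x1xf32>
  // ... tiled linalg.copy of %collapsed_slice into %acc, scf.yield ...
}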
1 parent f1e9219 commit 33e2146


2 files changed: +68 −30 lines changed

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_apply_tiling_level.mlir

Lines changed: 28 additions & 2 deletions
@@ -548,15 +548,41 @@ func.func @partial_reduction(%3: tensor<?x?xf32>) -> tensor<?xf32> {
 
 // -----
 
+#config = #iree_gpu.lowering_config<{reduction = [0, 1]}>
+func.func @swap_collapse_shape_with_extract_slice_block_arg(%arg0: tensor<16x1x1x16xf32>) -> tensor<16x16xf32> {
+  %collapsed = tensor.collapse_shape %arg0 [[0], [1, 2, 3]] : tensor<16x1x1x16xf32> into tensor<16x16xf32>
+  %empty = tensor.empty() : tensor<16x16xf32>
+  %0 = linalg.copy {lowering_config = #config} ins(%collapsed : tensor<16x16xf32>) outs(%empty : tensor<16x16xf32>) -> tensor<16x16xf32>
+  return %0: tensor<16x16xf32>
+}
+
+// NORM-REDUCTION-LABEL: func.func @swap_collapse_shape_with_extract_slice_block_arg
+// NORM-REDUCTION-DAG: %[[C1:.+]] = arith.constant 1 : index
+// NORM-REDUCTION-DAG: %[[C16:.+]] = arith.constant 16 : index
+// NORM-REDUCTION-DAG: %[[C0:.+]] = arith.constant 0 : index
+// NORM-REDUCTION: scf.for %[[ARG1:.+]] = %[[C0]] to %[[C16]] step %[[C1]]
+// NORM-REDUCTION: %[[SLICE:.+]] = tensor.extract_slice %{{.*}}[0, 0, 0, %[[ARG1]]] [16, 1, 1, 1] [1, 1, 1, 1] : tensor<16x1x1x16xf32> to tensor<16x1x1x1xf32>
+// NORM-REDUCTION: %[[COLLAPSE:.+]] = tensor.collapse_shape %[[SLICE]] {{\[}}[0], [1, 2, 3]] : tensor<16x1x1x1xf32> into tensor<16x1xf32>
+// NORM-REDUCTION: linalg.copy {{.*}} ins(%[[COLLAPSE]]
+
+// Without loop normalization, no swap would happen.
+// CHECK: tensor.collapse_shape
+// CHECK: scf.for
+// CHECK: tensor.extract_slice
+// CHECK-NOT: tensor.collapse_shape
+// CHECK: linalg.copy
+
+// -----
+
 #config = #iree_gpu.lowering_config<{reduction = [0, 32]}>
-func.func @swap_collapse_shape_with_extract_slice(%arg0: tensor<32x3x3x288xf32>) -> tensor<32x2592xf32> {
+func.func @swap_collapse_shape_with_extract_slice_apply_op(%arg0: tensor<32x3x3x288xf32>) -> tensor<32x2592xf32> {
   %collapsed = tensor.collapse_shape %arg0 [[0], [1, 2, 3]] : tensor<32x3x3x288xf32> into tensor<32x2592xf32>
   %empty = tensor.empty() : tensor<32x2592xf32>
   %0 = linalg.copy {lowering_config = #config} ins(%collapsed : tensor<32x2592xf32>) outs(%empty : tensor<32x2592xf32>) -> tensor<32x2592xf32>
   return %0: tensor<32x2592xf32>
 }
 
-// NORM-REDUCTION-LABEL: func.func @swap_collapse_shape_with_extract_slice
+// NORM-REDUCTION-LABEL: func.func @swap_collapse_shape_with_extract_slice_apply_op
 // NORM-REDUCTION-DAG: %[[C1:.+]] = arith.constant 1 : index
 // NORM-REDUCTION-DAG: %[[C81:.+]] = arith.constant 81 : index
 // NORM-REDUCTION-DAG: %[[C0:.+]] = arith.constant 0 : index

compiler/src/iree/compiler/Codegen/Common/Transforms.cpp

Lines changed: 40 additions & 28 deletions
@@ -476,41 +476,53 @@ swapCollapseShapeWithSlice(RewriterBase &rewriter,
   // IGEMM, while the offset is dynamic and the size is static.
   if (isa<Attribute>(collapsedSize) && isa<Value>(collapsedOffset) &&
       reassocIndices.size() != 1) {
-    // Check if offset is from affine.apply of form (d0 * K) or (K * d0).
-    auto applyOp = collapsedOffset.dyn_cast<Value>()
-                       .getDefiningOp<affine::AffineApplyOp>();
-    if (!applyOp) {
-      return rewriter.notifyMatchFailure(sliceOp,
-                                         "offset is not from affine.apply");
-    }
-
-    AffineMap map = applyOp.getAffineMap();
-    if (map.getNumResults() != 1) {
-      return rewriter.notifyMatchFailure(
-          sliceOp, "affine.apply must have only one result");
-    }
-
     auto maybeStaticSize = getConstantIntValue(collapsedSize);
     if (!maybeStaticSize) {
       return rewriter.notifyMatchFailure(sliceOp,
                                          "collapsed size must be static");
     }
-
-    // Compose all nested affine.apply chains and check if the offset is
-    // multiple of collapsed size.
-    SmallVector<Value> operands(applyOp.getOperands());
-    affine::fullyComposeAffineMapAndOperands(&map, &operands);
-    map = simplifyAffineMap(map);
-    if (!map.getResult(0).isMultipleOf(maybeStaticSize.value())) {
-      return rewriter.notifyMatchFailure(
-          sliceOp, "offset multiplier must be multiple of collapsed size");
-    }
-
-    unsigned lastReassocSize = srcShape[reassocIndices.back()];
-    if (lastReassocSize % maybeStaticSize.value() != 0) {
+    auto staticSize = maybeStaticSize.value();
+
+    // Check if offset is from a block argument or an affine.apply op of
+    // form (d0 * K) or (K * d0).
+    auto offsetVal = cast<Value>(collapsedOffset);
+    auto collapseDefOp = offsetVal.getDefiningOp();
+    if (isa<BlockArgument>(offsetVal)) {
+      // The loop is already normalized.
+      if (staticSize != 1) {
+        return rewriter.notifyMatchFailure(
+            sliceOp, "collapsed size must be 1 when the collapsed offset "
+                     "is a block argument");
+      }
+    } else if (auto applyOp =
+                   dyn_cast<affine::AffineApplyOp>(collapseDefOp)) {
+      AffineMap map = applyOp.getAffineMap();
+      if (map.getNumResults() != 1) {
+        return rewriter.notifyMatchFailure(
+            sliceOp, "affine.apply must have only one result");
+      }
+
+      // Compose all nested affine.apply chains and check if the offset is
+      // multiple of collapsed size.
+      SmallVector<Value> operands(applyOp.getOperands());
+      affine::fullyComposeAffineMapAndOperands(&map, &operands);
+      map = simplifyAffineMap(map);
+      if (!map.getResult(0).isMultipleOf(staticSize)) {
+        return rewriter.notifyMatchFailure(
+            sliceOp,
+            "offset multiplier must be multiple of collapsed size");
+      }
+
+      unsigned lastReassocSize = srcShape[reassocIndices.back()];
+      if (lastReassocSize % staticSize != 0) {
+        return rewriter.notifyMatchFailure(
+            sliceOp,
+            "the last expanded size is not divisible by collapse size");
+      }
+    } else {
       return rewriter.notifyMatchFailure(
           sliceOp,
-          "the last expanded size is not divisible by collapse size");
+          "offset is not from a block argument or affine.apply op");
     }
 
     // Calculate expanded offsets and sizes.
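To summarize the updated check, here is an informal sketch of the two offset forms it now accepts; `%collapsed_a`, `%collapsed_b`, `%iv`, and the surrounding loops are hypothetical, with shapes and tile sizes borrowed from the two lit tests above.

// Previously handled form: the offset comes from an affine.apply whose
// single result is a multiple of the static collapsed size (32 here),
// and the innermost expanded dim (288) is divisible by that size.
%off = affine.apply affine_map<(d0) -> (d0 * 32)>(%iv)
%slice_a = tensor.extract_slice %collapsed_a[0, %off] [32, 32] [1, 1]
    : tensor<32x2592xf32> to tensor<32x32xf32>

// Newly handled form: after loop normalization the offset is the loop
// induction variable itself (a block argument); the swap is only applied
// when the static collapsed size is 1.
%slice_b = tensor.extract_slice %collapsed_b[0, %iv] [16, 1] [1, 1]
    : tensor<16x16xf32> to tensor<16x1xf32>

// Any other offset form is rejected:
// "offset is not from a block argument or affine.apply op".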
