@@ -58,6 +58,84 @@ module {
   }
 }

+// -----
+// For pack op, by default lowerPadLikeWithInsertSlice = true, which generates insert_slice and blocks fusion.
+
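+// Since the pack below is pad-like (both outer dims are 1 and no padding is
+// needed), the default lowering folds it into a single rank-expanding
+// insert_slice instead of a pad + expand_shape + transpose chain, roughly
+// (illustration only, not part of the CHECKed output):
+//
+//   %0 = tensor.insert_slice %src into %dest[0, 0, 0, 0] [1, 1, 128, 256] [1, 1, 1, 1]
+//       : tensor<128x256xf32> into tensor<1x1x128x256xf32>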
+module {
+  // CHECK-LABEL: func @fuse_pack_as_producer_blocked_by_insert_slice
+  // CHECK: tensor.insert_slice
+  // CHECK: scf.forall {{.*}} {
+  // CHECK:   scf.forall.in_parallel
+  // CHECK: }
+  func.func @fuse_pack_as_producer_blocked_by_insert_slice(%src: tensor<128x256xf32>, %other: tensor<4x4x128x256xf32>)
+      -> tensor<4x4x128x256xf32> {
+    %dest = tensor.empty() : tensor<1x1x128x256xf32>
+    %pack = tensor.pack %src inner_dims_pos = [0, 1] inner_tiles = [128, 256]
+        into %dest : tensor<128x256xf32> -> tensor<1x1x128x256xf32>
+
+    %out = tensor.empty() : tensor<4x4x128x256xf32>
+    %res = linalg.generic
+           {indexing_maps = [affine_map<(i, j, k, l) -> (0, 0, k, l)>,
+                             affine_map<(i, j, k, l) -> (i, j, k, l)>,
+                             affine_map<(i, j, k, l) -> (i, j, k, l)>],
+            iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
+           ins(%pack, %other: tensor<1x1x128x256xf32>, tensor<4x4x128x256xf32>)
+           outs(%out: tensor<4x4x128x256xf32>) {
+      ^bb0(%pack_elem: f32, %other_elem: f32, %out_elem: f32):
+        %r = arith.addf %pack_elem, %other_elem : f32
+        linalg.yield %r : f32
+    } -> tensor<4x4x128x256xf32>
+
+    return %res : tensor<4x4x128x256xf32>
+  }
+
+  module attributes {transform.with_named_sequence} {
+    transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+      // Find and lower the pack operation.
+      %pack = transform.structured.match ops{["tensor.pack"]} in %arg1
+        : (!transform.any_op) -> !transform.op<"tensor.pack">
+      %padded, %expanded, %transpose = transform.structured.lower_pack %pack
+        : (!transform.op<"tensor.pack">)
+        -> (!transform.op<"tensor.pad">,
+            !transform.op<"tensor.expand_shape">,
+            !transform.op<"linalg.transpose">)
+
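+      // With the default lowerPadLikeWithInsertSlice = true, this pad-like
+      // pack lowers to the tensor.insert_slice checked above, so no
+      // linalg.transpose is created for the later fusion step.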
+      %root = transform.structured.match ops{["linalg.generic"]} in %arg1
+        : (!transform.any_op) -> !transform.any_op
+      // Tile the linalg operation with a parallel scf.forall loop, num_threads [4, 4].
+      %tiled_op, %forall_op = transform.structured.tile_using_forall %root num_threads [4, 4]
+        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+      // Attempt to fuse the transpose operation into the tiled loop. With the
+      // insert_slice lowering above there is no transpose to fuse, so this is
+      // a no-op and the insert_slice remains outside the loop.
+      transform.structured.fuse_into_containing_op %transpose into %forall_op
+        : (!transform.op<"linalg.transpose">, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+      transform.yield
+    }
+  }
+}
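+
+// Note: to make the transpose fusable as a producer, the insert_slice path
+// can be disabled when lowering the pack, mirroring the
+// lowerUnpadLikeWithExtractSlice usage described below (a sketch):
+//
+//   transform.structured.lower_pack %pack {lowerPadLikeWithInsertSlice = false}
+//     : (!transform.op<"tensor.pack">)
+//     -> (!transform.op<"tensor.pad">,
+//         !transform.op<"tensor.expand_shape">,
+//         !transform.op<"linalg.transpose">)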
+
 // -----
 // For unpack op, we use lowerUnpadLikeWithExtractSlice = false to ensure no extract_slice is generated.
 // This allows linalg.transpose to be fused as a consumer operation. Alternatively, without this attribute
@@ -119,3 +197,73 @@ module {
     }
   }
 }
+
+// -----
+// For unpack op, by default lowerUnpadLikeWithExtractSlice = true, which generates extract_slice and blocks fusion.
+
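+// Since the unpack below is unpad-like (both outer dims are 1), the default
+// lowering folds it into a single rank-reducing extract_slice, roughly
+// (illustration only, not part of the CHECKed output):
+//
+//   %0 = tensor.extract_slice %res[0, 0, 0, 0] [1, 1, 128, 256] [1, 1, 1, 1]
+//       : tensor<1x1x128x256xf32> to tensor<128x256xf32>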
+module {
+  // CHECK-LABEL: func @fuse_unpack_as_consumer_blocked_by_extract_slice
+  // CHECK: scf.forall {{.*}} {
+  // CHECK:   linalg.generic
+  // CHECK:   scf.forall.in_parallel
+  // CHECK: }
+  // CHECK: tensor.extract_slice
+  func.func @fuse_unpack_as_consumer_blocked_by_extract_slice(%src: tensor<4x4x128x256xf32>, %other: tensor<4x4x128x256xf32>)
+      -> tensor<128x256xf32> {
+    %out = tensor.empty() : tensor<1x1x128x256xf32>
+    %res = linalg.generic
+           {indexing_maps = [affine_map<(i, j, k, l) -> (i, j, k, l)>,
+                             affine_map<(i, j, k, l) -> (i, j, k, l)>,
+                             affine_map<(i, j, k, l) -> (0, 0, k, l)>],
+            iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
+           ins(%src, %other: tensor<4x4x128x256xf32>, tensor<4x4x128x256xf32>)
+           outs(%out: tensor<1x1x128x256xf32>) {
+      ^bb0(%unpack_elem: f32, %other_elem: f32, %out_elem: f32):
+        %r = arith.addf %unpack_elem, %other_elem : f32
+        linalg.yield %r : f32
+    } -> tensor<1x1x128x256xf32>
+
+    %dest = tensor.empty() : tensor<128x256xf32>
+    %unpack = tensor.unpack %res inner_dims_pos = [0, 1] inner_tiles = [128, 256]
+        into %dest : tensor<1x1x128x256xf32> -> tensor<128x256xf32>
+
+    return %unpack : tensor<128x256xf32>
+  }
+
+  module attributes {transform.with_named_sequence} {
+    transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+      // Find and lower the unpack operation.
+      %unpack = transform.structured.match ops{["tensor.unpack"]} in %arg1
+        : (!transform.any_op) -> !transform.op<"tensor.unpack">
+      transform.structured.lower_unpack %unpack
+        : (!transform.op<"tensor.unpack">)
+        -> (!transform.op<"tensor.empty">,
+            !transform.op<"linalg.transpose">,
+            !transform.op<"tensor.collapse_shape">,
+            !transform.op<"tensor.extract_slice">)
+
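+      // With the default lowerUnpadLikeWithExtractSlice = true, this
+      // unpad-like unpack lowers to the tensor.extract_slice checked above,
+      // which sits after the loop and blocks consumer fusion.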
+      %root = transform.structured.match ops{["linalg.generic"]} in %arg1
+        : (!transform.any_op) -> !transform.any_op
+      // Tile the linalg operation with a parallel scf.forall loop, num_threads [4, 4].
+      %tiled_op, %forall_op = transform.structured.tile_using_forall %root num_threads [4, 4]
+        : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+      // Try to fuse the consumer operation into the tiled loop.
+      %slice_op = transform.structured.match ops{["tensor.parallel_insert_slice"]} in %forall_op
+        : (!transform.any_op) -> !transform.op<"tensor.parallel_insert_slice">
+      // Note that we cannot apply transform.test.fuse_consumer here because the
+      // extract_slice is not a qualified consumer operation. Forcing it would
+      // yield a "could not fetch consumer to fuse" error.
+      transform.yield
+    }
+  }
+}