@@ -328,66 +328,71 @@ func.func @pad_generic_static(%small_input: tensor<58x1xf32>, %large_input: tens
 #map4 = affine_map<(d0, d1, d2) -> (d2, d1)>
 #map5 = affine_map<(d0, d1, d2) -> (d0, d1)>
 func.func @rank_reduced_extract_slice(
-    %arg0: tensor<1x6x5xf32>, %arg1: tensor<1x5x6xf32>, %arg2: tensor<4x6xf32>,
-    %arg3: tensor<1x6x6xf32>, %arg4: tensor<4x6xf32>, %arg5: tensor<4x2xf32>
+    %prod_in: tensor<1x6x5xf32>, %prod_weight: tensor<1x5x6xf32>,
+    %cons_in: tensor<4x6xf32>, %prod_init: tensor<1x6x6xf32>,
+    %for_iv_init: tensor<4x6xf32>, %cons_init: tensor<4x2xf32>
 ) -> tensor<4x6xf32> {
   %c0 = arith.constant 0 : index
   %c2 = arith.constant 2 : index
   %c6 = arith.constant 6 : index
-  %0 = linalg.generic
+  %mmul_prod = linalg.generic
     {indexing_maps = [#map0, #map1, #map2], iterator_types = ["parallel", "parallel", "parallel", "reduction"]}
-    ins(%arg0, %arg1 : tensor<1x6x5xf32>, tensor<1x5x6xf32>) outs(%arg3 : tensor<1x6x6xf32>) {
+    ins(%prod_in, %prod_weight : tensor<1x6x5xf32>, tensor<1x5x6xf32>) outs(%prod_init : tensor<1x6x6xf32>) {
   ^bb0(%in: f32, %in_1: f32, %out: f32):
     %10 = arith.mulf %in, %in_1 : f32
     %11 = arith.addf %out, %10 : f32
     linalg.yield %11 : f32
   } -> tensor<1x6x6xf32>
-  %1 = scf.for %arg7 = %c0 to %c6 step %c2 iter_args(%arg6 = %arg4) -> (tensor<4x6xf32>) {
-    %2 = tensor.extract_slice %0[0, 0, %arg7] [1, 6, 2] [1, 1, 1] : tensor<1x6x6xf32> to tensor<6x2xf32>
-    %3 = linalg.generic
+  %for = scf.for %arg7 = %c0 to %c6 step %c2 iter_args(%arg6 = %for_iv_init) -> (tensor<4x6xf32>) {
+
+    // Extract slice with a rank-reduced result type. When fused into the
+    // loop with sliced operands, the producer linalg.generic's now-sliced
+    // result must be rank-reduced as well to match the consumer's use type.
+    %prod_slice = tensor.extract_slice %mmul_prod[0, 0, %arg7] [1, 6, 2] [1, 1, 1] : tensor<1x6x6xf32> to tensor<6x2xf32>
+    %mmul_cons = linalg.generic
       {indexing_maps = [#map3, #map4, #map5], iterator_types = ["parallel", "parallel", "reduction"]}
-      ins(%arg2, %2 : tensor<4x6xf32>, tensor<6x2xf32>) outs(%arg5 : tensor<4x2xf32>) {
+      ins(%cons_in, %prod_slice : tensor<4x6xf32>, tensor<6x2xf32>) outs(%cons_init : tensor<4x2xf32>) {
     ^bb0(%in: f32, %in_1: f32, %out: f32):
       %20 = arith.mulf %in, %in_1 : f32
       %21 = arith.addf %out, %20 : f32
       linalg.yield %21 : f32
     } -> tensor<4x2xf32>
-    %4 = tensor.insert_slice %3 into %arg6[0, %arg7] [4, 2] [1, 1] : tensor<4x2xf32> into tensor<4x6xf32>
+    %4 = tensor.insert_slice %mmul_cons into %arg6[0, %arg7] [4, 2] [1, 1] : tensor<4x2xf32> into tensor<4x6xf32>
     scf.yield %4 : tensor<4x6xf32>
   }
-  return %1 : tensor<4x6xf32>
+  return %for : tensor<4x6xf32>
 }
 
 // CHECK: func @rank_reduced_extract_slice(
-// CHECK-SAME: %[[ARG0:[0-9a-z]*]]: tensor<1x6x5xf32>
-// CHECK-SAME: %[[ARG1:[0-9a-z]*]]: tensor<1x5x6xf32>
-// CHECK-SAME: %[[ARG2:[0-9a-z]*]]: tensor<4x6xf32>
-// CHECK-SAME: %[[ARG3:[0-9a-z]*]]: tensor<1x6x6xf32>
-// CHECK-SAME: %[[ARG4:[0-9a-z]*]]: tensor<4x6xf32>
-// CHECK-SAME: %[[ARG5:[0-9a-z]*]]: tensor<4x2xf32>
+// CHECK-SAME: %[[PROD_IN:[0-9a-z]*]]: tensor<1x6x5xf32>
+// CHECK-SAME: %[[PROD_WEIGHT:[0-9a-z]*]]: tensor<1x5x6xf32>
+// CHECK-SAME: %[[CONS_IN:[0-9a-z]*]]: tensor<4x6xf32>
+// CHECK-SAME: %[[PROD_INIT:[0-9a-z]*]]: tensor<1x6x6xf32>
+// CHECK-SAME: %[[FOR_IV_INIT:[0-9a-z]*]]: tensor<4x6xf32>
+// CHECK-SAME: %[[CONS_INIT:[0-9a-z]*]]: tensor<4x2xf32>
 
 // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
 // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index
 // CHECK-DAG: %[[C6:.*]] = arith.constant 6 : index
 
 // For loop right after tensor alloc & fill, no linalg.generic.
 // CHECK-NOT: linalg.generic
-// CHECK-NEXT: %[[FOR:.*]] = scf.for %[[I:[0-9a-z]*]] = %[[C0]] to %[[C6]] step %[[C2]] iter_args(%[[ARG_ITER:.*]] = %[[ARG4]])
+// CHECK-NEXT: %[[FOR:.*]] = scf.for %[[I:[0-9a-z]*]] = %[[C0]] to %[[C6]] step %[[C2]] iter_args(%[[ARG_ITER:.*]] = %[[FOR_IV_INIT]])
 
 // Producer linalg.generic now inside the loop, with tiled args sliced before
 // it.
-// CHECK-DAG: %[[ARG1_SLICE:.*]] = tensor.extract_slice %[[ARG1]][0, 0, %[[I]]] [1, 5, 2] [1, 1, 1] : tensor<1x5x6xf32> to tensor<1x5x2xf32>
-// CHECK-DAG: %[[PROD_SLICE:.*]] = tensor.extract_slice %[[ARG3]][0, 0, %[[I]]] [1, 6, 2] [1, 1, 1] : tensor<1x6x6xf32> to tensor<1x6x2xf32>
+// CHECK-DAG: %[[PROD_WEIGHT_SLICE:.*]] = tensor.extract_slice %[[PROD_WEIGHT]][0, 0, %[[I]]] [1, 5, 2] [1, 1, 1] : tensor<1x5x6xf32> to tensor<1x5x2xf32>
+// CHECK-DAG: %[[PROD_INIT_SLICE:.*]] = tensor.extract_slice %[[PROD_INIT]][0, 0, %[[I]]] [1, 6, 2] [1, 1, 1] : tensor<1x6x6xf32> to tensor<1x6x2xf32>
 // CHECK: %[[MMUL_PROD:.*]] = linalg.generic
-// CHECK-SAME: ins(%[[ARG0]], %[[ARG1_SLICE]] : tensor<1x6x5xf32>, tensor<1x5x2xf32>)
-// CHECK-SAME: outs(%[[PROD_SLICE]] : tensor<1x6x2xf32>)
+// CHECK-SAME: ins(%[[PROD_IN]], %[[PROD_WEIGHT_SLICE]] : tensor<1x6x5xf32>, tensor<1x5x2xf32>)
+// CHECK-SAME: outs(%[[PROD_INIT_SLICE]] : tensor<1x6x2xf32>)
 //
 // Consumer uses a rank-reduced version of producer result so a collapse_shape
 // is generated.
 // CHECK: %[[PROD_COLLAPSE:.*]] = tensor.collapse_shape %[[MMUL_PROD]] {{\[\[0, 1\], \[2\]\]}} : tensor<1x6x2xf32> into tensor<6x2xf32>
 // CHECK: %[[MMUL_CONS:.*]] = linalg.generic
-// CHECK-SAME: ins(%[[ARG2]], %[[PROD_COLLAPSE]] : tensor<4x6xf32>, tensor<6x2xf32>)
-// CHECK-SAME: outs(%[[ARG5]] : tensor<4x2xf32>)
+// CHECK-SAME: ins(%[[CONS_IN]], %[[PROD_COLLAPSE]] : tensor<4x6xf32>, tensor<6x2xf32>)
+// CHECK-SAME: outs(%[[CONS_INIT]] : tensor<4x2xf32>)
 // CHECK: %[[CONS_SLICE:.*]] = tensor.insert_slice %[[MMUL_CONS]] into %[[ARG_ITER]][0, %[[I]]] [4, 2] [1, 1] : tensor<4x2xf32> into tensor<4x6xf32>
 // CHECK: scf.yield %[[CONS_SLICE]] : tensor<4x6xf32>
 // CHECK: return %[[FOR]] : tensor<4x6xf32>
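
Note (not part of the diff): a minimal sketch of the two slicing forms the comments above refer to, using hypothetical values %prod : tensor<1x6x6xf32> and an index %i. The rank-reducing tensor.extract_slice drops the unit dimension directly in its result type, while the non-rank-reducing form keeps it and needs the tensor.collapse_shape that the fusion emits in order to reach the same tensor<6x2xf32> type the consumer expects.

// Rank-reducing slice: the leading unit dim is folded away in the result type.
%rr = tensor.extract_slice %prod[0, 0, %i] [1, 6, 2] [1, 1, 1]
    : tensor<1x6x6xf32> to tensor<6x2xf32>

// Non-rank-reducing slice plus an explicit collapse; the reassociation
// [[0, 1], [2]] merges the unit dim 0 into dim 1 (1x6 -> 6), keeping dim 2.
%full = tensor.extract_slice %prod[0, 0, %i] [1, 6, 2] [1, 1, 1]
    : tensor<1x6x6xf32> to tensor<1x6x2xf32>
%rr2 = tensor.collapse_shape %full [[0, 1], [2]]
    : tensor<1x6x2xf32> into tensor<6x2xf32>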