@@ -177,9 +177,14 @@ func.func @non_monotonic_affine_expr(%arg0 : tensor<7xf32>) -> tensor<7xf32> {
177177 %0 = tensor.dim %arg0, %c0 : tensor<7xf32>
178178 %empty = tensor.empty() : tensor<7xf32>
179179
180- // CHECK: %[[OUT:.*]] = tensor.empty() : tensor<7xf32>
181- // CHECK: scf.for {{.*}} to {{.*}} step {{.*}} iter_args(%[[TC0:.*]] = %[[OUT]]) -> (tensor<7xf32>) {
182- // CHECK: tensor.extract_slice %[[TC0]][0] [7] [1] : tensor<7xf32> to tensor<7xf32>
180+ // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
181+ // CHECK-DAG: %[[OUT:.*]] = tensor.empty() : tensor<7xf32>
182+ // CHECK-DAG: %[[C0_1:.*]] = arith.constant 0 : index
183+ // CHECK-DAG: %[[C7:.*]] = arith.constant 7 : index
184+ // CHECK-DAG: %[[C7_1:.*]] = arith.constant 7 : index
185+ // CHECK: scf.for %[[IV0:.+]] = %[[C0]] to %[[C7]] step %[[C7_1]] iter_args(%[[TC0:.*]] = %[[OUT]]) -> (tensor<7xf32>) {
186+ // CHECK: tensor.extract_slice %[[ARG0]][0] [7] [1] : tensor<7xf32> to tensor<7xf32>
187+ // CHECK: tensor.extract_slice %[[TC0]][%[[IV0]]] [7] [1] : tensor<7xf32> to tensor<7xf32>
183188 %generic = linalg.generic
184189 {indexing_maps = [affine_map<(d0) -> (d0 mod 4)>,
185190 affine_map<(d0) -> (d0)>],
@@ -199,3 +204,44 @@ module attributes {transform.with_named_sequence} {
199204 transform.yield
200205 }
201206}
207+
208+ // -----
209+
210+ #identity = affine_map<(d0, d1) -> (d0, d1)>
211+ #identity1 = affine_map<(d0, d1) -> (d0 mod 3, d1)>
212+
// Tiling with tile_sizes [4, 5]: the output map #identity1 applies `d0 mod 3`,
// which is non-monotonic in d0, so the output slice keeps the full first
// dimension (offset 0, size 4) while the monotonic d1 is tiled by 5; the input
// (identity map) is sliced at [%IV0, %IV1] as usual.
213+ // CHECK-LABEL: func @tile_monotonic_outer_dim
214+ // CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: tensor<4x10xf32>
215+ func.func @tile_monotonic_outer_dim(%in: tensor<4x10xf32>) -> tensor<4x10xf32> {
216+ %empty = tensor.empty() : tensor<4x10xf32>
217+ %1 = linalg.generic {indexing_maps = [#identity, #identity1], iterator_types = ["parallel", "parallel"]}
218+ ins(%in : tensor<4x10xf32>) outs(%empty : tensor<4x10xf32>) {
219+ ^bb1(%a: f32, %b: f32):
220+ linalg.yield %a : f32
221+ } -> tensor<4x10xf32>
222+
223+ // CHECK: %[[C4:.+]] = arith.constant 4 : index
224+ // CHECK: %[[C4_1:.+]] = arith.constant 4 : index
225+ // CHECK: %[[C5:.+]] = arith.constant 5 : index
226+ // CHECK: scf.for %[[IV0:.+]] = %{{.+}} to %[[C4]] step %[[C4_1]] iter_args(%[[ARG1:.+]] = %[[OUT:.+]]) -> (tensor<4x10xf32>) {
227+ // CHECK: scf.for %[[IV1:.+]] = %{{.+}} to %{{.+}} step %[[C5]] iter_args(%[[ARG2:.+]] = %[[ARG1]]) -> (tensor<4x10xf32>) {
228+ // CHECK: %[[INSLICE:.+]] = tensor.extract_slice %[[ARG0]][%[[IV0]], %[[IV1]]] [4, 5] [1, 1] : tensor<4x10xf32> to tensor<4x5xf32>
229+ // CHECK: %[[OUTSLICE:.+]] = tensor.extract_slice %[[ARG2]][0, %[[IV1]]] [4, 5] [1, 1] : tensor<4x10xf32> to tensor<4x5xf32>
230+ // CHECK: %[[RES:.+]] = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%[[INSLICE]] : tensor<4x5xf32>) outs(%[[OUTSLICE]] : tensor<4x5xf32>) {
231+ // CHECK: ^bb0(%in: f32, %out: f32):
232+ // CHECK: linalg.yield %in : f32
233+ // CHECK: } -> tensor<4x5xf32>
234+ // CHECK: %[[INSERT_SLICE:.+]] = tensor.insert_slice %[[RES]] into %[[ARG2]][0, %[[IV1]]] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<4x10xf32>
235+ // CHECK: scf.yield %[[INSERT_SLICE]] : tensor<4x10xf32>
236+ // CHECK: }
237+
238+ return %1 : tensor<4x10xf32>
239+ }
240+
// Transform script driving the test above: matches the linalg.generic and
// tiles it with tile_sizes [4, 5], yielding the tiled op and the two
// generated scf.for loop handles.
241+ module attributes {transform.with_named_sequence} {
242+ transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
243+ %0 = transform.structured.match ops{["linalg.generic"]} in %arg1 : (!transform.any_op) -> !transform.any_op
244+ %1, %loops:2 = transform.structured.tile_using_for %0 tile_sizes [4, 5] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
245+ transform.yield
246+ }
247+ }
0 commit comments