Fix result type mismatch of tiled ops (#20211)

pravg-amd · web-flow · commit 1c10dac77e2c · 2025-03-13T11:28:53.000+05:30
Use OpFoldResult wherever possible in the LinalgExt tiling implementation. This helps avoid result type mismatch issues that may otherwise occur when the tiled producers of slices are fused. Fixes issue: #17526 --------- Signed-off-by: Praveen G <praveen.g2@amd.com>
diff --git a/compiler/src/iree/compiler/Codegen/LLVMCPU/test/tile_and_fuse.mlir b/compiler/src/iree/compiler/Codegen/LLVMCPU/test/tile_and_fuse.mlir
@@ -214,3 +214,38 @@ func.func @ukernel_generic(%arg0: tensor<1x192x1x16xf32>, %arg1: tensor<1x768x1x
 // CHECK:           linalg.generic
 // CHECK-SAME:        ins(%[[UK_SLICE]], %[[ARG3_SLICE]]
 // CHECK-SAME:        outs(%[[ITER_SLICE]]
+
+// -----
+
+func.func @tile_linalg_ext_scan() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDefault>} {
+  %c0_i64 = arith.constant 0 : i64
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<128x2xf32>>
+  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<128x2xi64>>
+  %2 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [128, 2], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<128x2xf32>> -> tensor<128x2xf32>
+  %3 = tensor.empty() : tensor<2xi64>
+  %4 = tensor.empty() : tensor<128x2xi64>
+  %5 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%2 : tensor<128x2xf32>) outs(%4 : tensor<128x2xi64>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1], [0, 1], [0, 0], [1, 0]]>} {
+  ^bb0(%in: f32, %out: i64):
+    %9 = arith.fptosi %in : f32 to i64
+    linalg.yield %9 : i64
+  } -> tensor<128x2xi64>
+  %6 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0], [0], [0], [1]]>} ins(%c0_i64 : i64) outs(%3 : tensor<2xi64>) -> tensor<2xi64>
+  %7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1], [0, 1], [0, 0], [1, 0]]>} ins(%c0_i64 : i64) outs(%4 : tensor<128x2xi64>) -> tensor<128x2xi64>
+  %8:2 = iree_linalg_ext.scan {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[0, 1], [0, 1]]>} dimension(0) inclusive(true) ins(%5 : tensor<128x2xi64>) outs(%7, %6 : tensor<128x2xi64>, tensor<2xi64>) {
+  ^bb0(%arg0: i64, %arg1: i64):
+    %9 = arith.addi %arg0, %arg1 : i64
+    iree_linalg_ext.yield %9 : i64
+  } -> tensor<128x2xi64>, tensor<2xi64>
+  flow.dispatch.tensor.store %8#0, %1, offsets = [0, 0], sizes = [128, 2], strides = [1, 1] : tensor<128x2xi64> -> !flow.dispatch.tensor<writeonly:tensor<128x2xi64>>
+  return
+}
+// CHECK-LABEL: func.func @tile_linalg_ext_scan
+// CHECK:         scf.for
+// CHECK-SAME:    {
+// CHECK:           linalg.generic
+// CHECK:           linalg.fill
+// CHECK:           linalg.fill
+// CHECK:           iree_linalg_ext.scan
+// CHECK:           scf.yield
+// CHECK:         }
diff --git a/compiler/src/iree/compiler/Dialect/LinalgExt/IR/TilingInterfaceImpl.cpp b/compiler/src/iree/compiler/Dialect/LinalgExt/IR/TilingInterfaceImpl.cpp
@@ -83,8 +83,8 @@ SmallVector<utils::IteratorType> ScatterOp::getLoopIteratorTypes() {
 
 SmallVector<Range> ScatterOp::getIterationDomain(OpBuilder &builder) {
   Location loc = getLoc();
-  Value zero = builder.create<arith::ConstantIndexOp>(loc, 0);
-  Value one = builder.create<arith::ConstantIndexOp>(loc, 1);
+  OpFoldResult zero = builder.getIndexAttr(0);
+  OpFoldResult one = builder.getIndexAttr(1);
   SmallVector<Range> ranges;
   for (auto dim : llvm::seq<int64_t>(0, getUpdateType().getRank())) {
     OpFoldResult ub = getDim(builder, loc, getUpdates(), dim);
@@ -293,12 +293,12 @@ SmallVector<Range> SortOp::getIterationDomain(OpBuilder &builder) {
   int64_t operandRank = getOperandRank();
   SmallVector<Range> loopBounds(operandRank);
   Location loc = getLoc();
-  Value zero = builder.create<arith::ConstantIndexOp>(loc, 0);
-  Value one = builder.create<arith::ConstantIndexOp>(loc, 1);
+  OpFoldResult zero = builder.getIndexAttr(0);
+  OpFoldResult one = builder.getIndexAttr(1);
   Value source = getOperand(0);
   for (auto dim : llvm::seq<int64_t>(0, operandRank)) {
     loopBounds[dim].offset = zero;
-    loopBounds[dim].size = getDimValue(builder, loc, source, dim);
+    loopBounds[dim].size = getDim(builder, loc, source, dim);
     loopBounds[dim].stride = one;
   }
   return loopBounds;
@@ -435,16 +435,16 @@ SmallVector<Range> FftOp::getIterationDomain(OpBuilder &builder) {
   Value zero = builder.create<arith::ConstantIndexOp>(loc, 0);
   Value one = builder.create<arith::ConstantIndexOp>(loc, 1);
   for (auto [idx, val] : llvm::enumerate(getOperandShape().drop_back())) {
-    Value size;
+    OpFoldResult size;
     if (ShapedType::isDynamic(val)) {
       size = getDimValue(builder, loc, getReal(), idx);
     } else {
-      size = builder.create<arith::ConstantIndexOp>(loc, val);
+      size = builder.getIndexAttr(val);
     }
     res.emplace_back(Range{/*offset=*/zero, size, /*stride=*/one});
   }
 
-  Value size = getDimValue(builder, loc, getReal(), getOperandRank() - 1);
+  OpFoldResult size = getDim(builder, loc, getReal(), getOperandRank() - 1);
   Value stride = builder.create<arith::ShLIOp>(loc, one, getStage());
   res.emplace_back(Range{/*offset=*/zero, size, /*stride=*/stride});
   return res;
@@ -643,12 +643,12 @@ SmallVector<Range> ScanOp::getIterationDomain(OpBuilder &builder) {
   int64_t operandRank = getOperandRank();
   SmallVector<Range> loopBounds(operandRank);
   Location loc = getLoc();
-  Value zero = builder.create<arith::ConstantIndexOp>(loc, 0);
-  Value one = builder.create<arith::ConstantIndexOp>(loc, 1);
+  OpFoldResult zero = builder.getIndexAttr(0);
+  OpFoldResult one = builder.getIndexAttr(1);
   Value source = getInput();
   for (auto dim : llvm::seq<int64_t>(0, operandRank)) {
     loopBounds[dim].offset = zero;
-    loopBounds[dim].size = getDimValue(builder, loc, source, dim);
+    loopBounds[dim].size = getDim(builder, loc, source, dim);
     loopBounds[dim].stride = one;
   }
   return loopBounds;
@@ -836,12 +836,12 @@ SmallVector<Range> TopkOp::getIterationDomain(OpBuilder &builder) {
   int64_t operandRank = getInputRank();
   SmallVector<Range> loopBounds(operandRank);
   Location loc = getLoc();
-  Value zero = builder.create<arith::ConstantIndexOp>(loc, 0);
-  Value one = builder.create<arith::ConstantIndexOp>(loc, 1);
+  OpFoldResult zero = builder.getIndexAttr(0);
+  OpFoldResult one = builder.getIndexAttr(1);
   Value source = getValues();
   for (auto [idx, val] : llvm::enumerate(getInputType().getShape())) {
     loopBounds[idx].offset = zero;
-    loopBounds[idx].size = getDimValue(builder, loc, source, idx);
+    loopBounds[idx].size = getDim(builder, loc, source, idx);
     loopBounds[idx].stride = one;
   }
   return loopBounds;
@@ -1285,7 +1285,7 @@ SmallVector<Range> Im2colOp::getIterationDomain(OpBuilder &builder) {
   SmallVector<Range> loopBounds(getOutputRank());
   for (int dim = 0; dim < getOutputRank(); ++dim) {
     loopBounds[dim].offset = zero;
-    loopBounds[dim].size = getDimValue(builder, loc, dest, dim);
+    loopBounds[dim].size = getDim(builder, loc, dest, dim);
     loopBounds[dim].stride = one;
   }
   return loopBounds;
@@ -1391,15 +1391,15 @@ LogicalResult Im2colOp::getResultTilePosition(
 SmallVector<Range>
 WinogradInputTransformOp::getIterationDomain(OpBuilder &builder) {
   Location loc = getLoc();
-  Value zero = builder.create<arith::ConstantIndexOp>(loc, 0);
-  Value one = builder.create<arith::ConstantIndexOp>(loc, 1);
+  OpFoldResult zero = builder.getIndexAttr(0);
+  OpFoldResult one = builder.getIndexAttr(1);
   Value dest = getOutput();
   SmallVector<Range> loopBounds(getIterationDomainRank());
   int count = 0;
   for (auto dim :
        llvm::seq<int64_t>(getImageDimensions().size(), getOutputRank())) {
     loopBounds[count].offset = zero;
-    loopBounds[count].size = getDimValue(builder, loc, dest, dim);
+    loopBounds[count].size = getDim(builder, loc, dest, dim);
     loopBounds[count].stride = one;
     count++;
   }
@@ -1537,7 +1537,7 @@ WinogradFilterTransformOp::getIterationDomain(OpBuilder &builder) {
   for (auto dim : llvm::seq<int64_t>(numKernelDims, outRank)) {
     int64_t loopDim = dim - numKernelDims;
     loopBounds[loopDim].offset = zero;
-    loopBounds[loopDim].size = getDimValue(builder, loc, source, dim);
+    loopBounds[loopDim].size = getDim(builder, loc, source, dim);
     loopBounds[loopDim].stride = one;
   }
   return loopBounds;
@@ -1640,15 +1640,15 @@ LogicalResult WinogradFilterTransformOp::getResultTilePosition(
 SmallVector<Range>
 WinogradOutputTransformOp::getIterationDomain(OpBuilder &builder) {
   Location loc = getLoc();
-  Value zero = builder.create<arith::ConstantIndexOp>(loc, 0);
-  Value one = builder.create<arith::ConstantIndexOp>(loc, 1);
+  OpFoldResult zero = builder.getIndexAttr(0);
+  OpFoldResult one = builder.getIndexAttr(1);
   Value source = getInput();
   SmallVector<Range> loopBounds(getIterationDomainRank());
   int count = 0;
   for (auto dim :
        llvm::seq<int64_t>(getImageDimensions().size(), getInputRank())) {
     loopBounds[count].offset = zero;
-    loopBounds[count].size = getDimValue(builder, loc, source, dim);
+    loopBounds[count].size = getDim(builder, loc, source, dim);
     loopBounds[count].stride = one;
     count++;
   }