[LinalgExt] Add support for fusing scatter with producers (iree-org#19584)

qedawkins · web-flow · commit 9df82fd4b583 · 2025-01-03T13:05:56.000-05:00
This adds implementations for "getIterationDomainTileFromOperandTile"
and "getTiledImplementationFromOperandTile" to linalg_ext.scatter. This
allows fusing scatters with producer loops during tiling. The
implementation of these methods is trivial because the iteration domain
is already defined in terms of the input operands, so we can just invoke
the tiling implementation.
diff --git a/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_workgroups_using_forall.mlir b/compiler/src/iree/compiler/Codegen/Common/test/tile_and_distribute_workgroups_using_forall.mlir
@@ -672,3 +672,32 @@ func.func @v_shaped_graph(%0: tensor<12xf32>, %1: tensor<12xf32>) -> tensor<12xf
 //   CHECK-DAG:     %[[RIGHT:.+]] = linalg.generic {{.*}} ins(%[[SLICE1]]
 //       CHECK:     linalg.generic {{.*}} ins(%[[LEFT]], %[[RIGHT]]
 //       CHECK:   return %[[RESULT]]
+
+// -----
+// Verifies that the scatter is fused into the scf.forall of the tiled linalg.add.
+func.func @consumer_fuse_scatter(%arg0: tensor<3x2048x2048xf32>,
+                                 %arg1: tensor<3x2048x2048xf32>,
+                                 %arg2: tensor<3x1xi32>) -> tensor<3x2048x2048xf32> {
+  %cst = arith.constant 0.000000e+00 : f32
+  %0 = tensor.empty() : tensor<3x2048x2048xf32>
+  %1 = linalg.add {lowering_config = #iree_gpu.lowering_config<{workgroup = [1, 1, 256]}>}
+    ins(%arg0, %arg1 : tensor<3x2048x2048xf32>, tensor<3x2048x2048xf32>) outs(%0 : tensor<3x2048x2048xf32>) -> tensor<3x2048x2048xf32>
+  %2 = iree_linalg_ext.scatter dimension_map = [0] unique_indices(true)
+    ins(%1, %arg2 : tensor<3x2048x2048xf32>, tensor<3x1xi32>) outs(%0 : tensor<3x2048x2048xf32>) {
+  ^bb0(%arg3: f32, %arg4: f32):
+    iree_linalg_ext.yield %arg3 : f32
+  } -> tensor<3x2048x2048xf32>
+  return %2 : tensor<3x2048x2048xf32>
+}
+
+// CHECK-LABEL: func @consumer_fuse_scatter(
+//  CHECK-SAME:   %[[LHS:[A-Za-z0-9]+]]: tensor<3x2048x2048xf32>
+//  CHECK-SAME:   %[[RHS:[A-Za-z0-9]+]]: tensor<3x2048x2048xf32>
+//  CHECK-SAME:   %[[IND:[A-Za-z0-9]+]]: tensor<3x1xi32>
+//       CHECK:   %[[RESULT:.+]] = scf.forall (%[[ID0:.+]], %[[ID1:.+]], %[[ID2:[A-Za-z0-9]+]]) {{.*}} shared_outs(%[[DEST:.+]] = %{{.*}})
+//   CHECK-DAG:     %[[SRC:.+]] = linalg.add
+//   CHECK-DAG:     %[[IND_SLICE:.+]] = tensor.extract_slice %[[IND]][%[[ID0]], 0] {{.*}} : tensor<3x1xi32> to tensor<1x1xi32>
+//   CHECK-DAG:     %[[DEST_SLICE:.+]] = tensor.extract_slice %[[DEST]][0, %[[ID1]], %[[ID2]]] {{.*}} to tensor<3x1x256xf32>
+//       CHECK:     %[[SCATTER:.+]] = iree_linalg_ext.scatter dimension_map = [0] unique_indices(true)
+//  CHECK-SAME:       ins(%[[SRC]], %[[IND_SLICE]]{{.*}} outs(%[[DEST_SLICE]]
+//       CHECK:       tensor.parallel_insert_slice %[[SCATTER]] into %[[DEST]][0, %[[ID1]], %[[ID2]]]
diff --git a/compiler/src/iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.td b/compiler/src/iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.td
@@ -102,7 +102,9 @@ def IREELinalgExt_ScatterOp : IREELinalgExt_Op<"scatter",
          "getIterationDomain",
          "getLoopIteratorTypes",
          "getResultTilePosition",
-         "getTiledImplementation"]>]> {
+         "getTiledImplementation",
+         "getIterationDomainTileFromOperandTile",
+         "getTiledImplementationFromOperandTile"]>]> {
   let summary = "Scatter operator";
   let description = [{
     Based on XLA operation semantics, takes two `inputs` (`update` and
diff --git a/compiler/src/iree/compiler/Dialect/LinalgExt/IR/TilingInterfaceImpl.cpp b/compiler/src/iree/compiler/Dialect/LinalgExt/IR/TilingInterfaceImpl.cpp
@@ -181,6 +181,44 @@ LogicalResult ScatterOp::getResultTilePosition(
   return success();
 }
 
+/// Maps a tile of operand `operandNumber` to the iteration domain tile. Fails
+/// unless `unique_indices` is true and the operand is the first input.
+LogicalResult ScatterOp::getIterationDomainTileFromOperandTile(
+    OpBuilder &b, unsigned operandNumber, ArrayRef<OpFoldResult> offsets,
+    ArrayRef<OpFoldResult> sizes,
+    SmallVectorImpl<OpFoldResult> &iterDomainOffsets,
+    SmallVectorImpl<OpFoldResult> &iterDomainSizes) {
+  // Fusion with producers is not possible in general if `unique_indices` is not
+  // true as reductions along the scattered indices are not tilable in parallel.
+  if (!getUniqueIndices()) {
+    return failure();
+  }
+  // TODO: Support fusion along the index operand. For the index operand, the
+  // offset + size must be the full size for the innermost dim.
+  if (getInputs().getBeginOperandIndex() != operandNumber) {
+    return failure();
+  }
+
+  // The iteration domain is defined in terms of the |input|, so simply
+  // use the given offsets/sizes.
+  iterDomainOffsets.assign(offsets.begin(), offsets.end());
+  iterDomainSizes.assign(sizes.begin(), sizes.end());
+  return success();
+}
+
+/// Generates the tiled scatter from a tile of operand `operandNumber` by first
+/// mapping it to an iteration domain tile; propagates failure of that mapping.
+FailureOr<TilingResult> ScatterOp::getTiledImplementationFromOperandTile(
+    OpBuilder &b, unsigned operandNumber, ArrayRef<OpFoldResult> offsets,
+    ArrayRef<OpFoldResult> sizes) {
+  SmallVector<OpFoldResult> mappedOffsets, mappedSizes;
+  if (failed(getIterationDomainTileFromOperandTile(
+          b, operandNumber, offsets, sizes, mappedOffsets, mappedSizes))) {
+    return failure();
+  }
+  return getTiledImplementation(b, mappedOffsets, mappedSizes);
+}
+
 LogicalResult ScatterOp::generateScalarImplementation(OpBuilder &b,
                                                       Location loc,
                                                       ValueRange ivs) {