Skip to content

Commit 8a2f201

Browse files
authored
[GPU] Add vector distribution pattern for map_scatter (#21124)
Adds a vector distribution pattern for `iree_linalg_ext.map_scatter`. The implementation is similar to that of vector.transfer_write without masking, and the main difference is in how the distributed offsets are handled by the distributed op. This distribution will be used after the map_scatter op is vectorized, but before it is decomposed. This keeps the distribution pattern simple, because only the input vector needs to be distributed, and the index mapping to the distributed space is very simple. --------- Signed-off-by: Max Dawkins <[email protected]>
1 parent 643382b commit 8a2f201

File tree

2 files changed

+178
-12
lines changed

2 files changed

+178
-12
lines changed

compiler/src/iree/compiler/Codegen/Common/GPU/GPUNestedLayoutDistributionPatterns.cpp

Lines changed: 115 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#include "iree/compiler/Codegen/Common/VectorLayoutAnalysis.h"
1111
#include "iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtDialect.h"
1212
#include "iree/compiler/Codegen/Utils/GPUUtils.h"
13+
#include "iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.h"
1314
#include "iree/compiler/Utils/Permutation.h"
1415
#include "llvm/ADT/ArrayRef.h"
1516
#include "llvm/ADT/SmallVector.h"
@@ -282,6 +283,17 @@ static VectorValue projectVector(RewriterBase &rewriter, Location loc,
282283
return cast<VectorValue>(sliced.getResult());
283284
}
284285

286+
/// Extracts the slice of `src` at `offsets` and returns it as a vector value.
/// When the extraction yields a scalar, the result is promoted to a rank-0
/// vector so callers can uniformly rely on receiving a VectorValue.
static VectorValue extractSliceAsVector(RewriterBase &rewriter, Location loc,
                                        Value src, ArrayRef<int64_t> offsets) {
  Value extracted = rewriter.create<vector::ExtractOp>(loc, src, offsets);
  if (isa<VectorType>(extracted.getType())) {
    return cast<VectorValue>(extracted);
  }
  // The extraction produced a scalar; broadcast it into a 0-d vector.
  auto zeroDVecType = VectorType::get({}, getElementTypeOrSelf(extracted));
  Value promoted =
      rewriter.create<vector::BroadcastOp>(loc, zeroDVecType, extracted);
  return cast<VectorValue>(promoted);
}
296+
285297
namespace {
286298

287299
/// Pattern to distribute `vector.transfer_read` ops with nested layouts.
@@ -476,16 +488,9 @@ struct DistributeTransferWrite final
476488
// dimensions are either unrolled or distributed such that this is a
477489
// contiguous slice.
478490
ArrayRef<int64_t> offsetArray(offsets);
479-
Value slicedVector = rewriter.create<vector::ExtractOp>(
480-
writeOp.getLoc(), distributedVector,
481-
offsetArray.take_front(rank * 2));
482-
// Promote the slicedVector to 0-d vector if it is a scalar.
483-
if (!isa<VectorType>(slicedVector.getType())) {
484-
auto promotedType =
485-
VectorType::get({}, getElementTypeOrSelf(slicedVector));
486-
slicedVector = rewriter.create<vector::BroadcastOp>(
487-
writeOp.getLoc(), promotedType, slicedVector);
488-
}
491+
VectorValue slicedVector =
492+
extractSliceAsVector(rewriter, writeOp.getLoc(), distributedVector,
493+
offsetArray.take_front(rank * 2));
489494

490495
VectorValue slicedMask = nullptr;
491496
if (mask) {
@@ -676,6 +681,104 @@ struct DistributeTransferGather final
676681
int64_t subgroupSize;
677682
};
678683

684+
/// Pattern to distribute `iree_linalg_ext.map_scatter` ops with nested layouts.
/// Only the input is distributed, since the output is never a vector. The
/// distribution of the input is similar to that of a vector.transfer_write.
struct DistributeMapScatter final
    : OpDistributionPattern<IREE::LinalgExt::MapScatterOp> {
  using OpDistributionPattern::OpDistributionPattern;

  DistributeMapScatter(MLIRContext *context, Value threadId,
                       int64_t subgroupSize)
      : OpDistributionPattern(context), threadId(threadId),
        subgroupSize(subgroupSize) {}

  LogicalResult matchAndRewrite(IREE::LinalgExt::MapScatterOp mapScatterOp,
                                DistributionSignature &signature,
                                PatternRewriter &rewriter) const override {
    // Only vector inputs can be distributed; scalar/tensor inputs don't match.
    auto input = dyn_cast<VectorValue>(mapScatterOp.getInput());
    if (!input) {
      return rewriter.notifyMatchFailure(mapScatterOp, "input is not a vector");
    }
    // The input's layout must be a NestedLayoutAttr for this distribution
    // scheme (subgroup/batch/outer/thread/element tiling) to apply.
    NestedLayoutAttr vectorLayout =
        dyn_cast<NestedLayoutAttr>(signature[input]);
    if (!vectorLayout) {
      return rewriter.notifyMatchFailure(mapScatterOp,
                                         "non-nested map_scatter layout");
    }
    // The output is scattered into directly, so it must be a memref.
    if (!isa<MemRefType>(mapScatterOp.getOutput().getType())) {
      return rewriter.notifyMatchFailure(mapScatterOp,
                                         "distribution expects memrefs");
    }
    // Compute the per-subgroup and per-thread index values implied by the
    // layout's strides; fails when the tiles have overlapping strides.
    SmallVector<Value> warpIndices, threadIndices;
    if (failed(populateWarpAndThreadIndices(rewriter, threadId, subgroupSize,
                                            vectorLayout, warpIndices,
                                            threadIndices))) {
      return rewriter.notifyMatchFailure(
          mapScatterOp, "warp or thread tiles have overlapping strides");
    }

    // The thread-local view of the input vector.
    Value distributedVector = getDistributed(rewriter, input, vectorLayout);

    Location loc = mapScatterOp.getLoc();
    Value zero = rewriter.create<arith::ConstantIndexOp>(loc, 0);
    SmallVector<int64_t> distShape = vectorLayout.getDistributedShape();
    SmallVector<int64_t> tileShape = getElementVectorTileShape(vectorLayout);
    // Unroll over every element-vector tile of the distributed shape, emitting
    // one map_scatter per tile (mirrors DistributeTransferWrite's unrolling).
    for (auto [idx, offsets] :
         llvm::enumerate(StaticTileOffsetRange(distShape, tileShape))) {
      // Extract the "element vector" from the inner most dimensions. All outer
      // dimensions are either unrolled or distributed such that this is a
      // contiguous slice.
      ArrayRef<int64_t> offsetArray(offsets);
      VectorValue distributedInput = extractSliceAsVector(
          rewriter, loc, distributedVector,
          offsetArray.take_front(vectorLayout.getRank() * 2));

      // Clone the map_scatter op with the "element vector" as the input, and
      // adjust the transformation region to account for the distributed
      // offsets.
      AffineMap permutationMap =
          rewriter.getMultiDimIdentityMap(input.getType().getRank());
      // Base indices are all zero: the distributed offsets below fully encode
      // this tile's position in the undistributed vector.
      SmallVector<Value> indices(input.getType().getRank(), zero);
      SmallVector<Value> distributedOffsets =
          getTransferIndicesFromNestedLayout(rewriter, indices, offsets,
                                             vectorLayout, permutationMap,
                                             warpIndices, threadIndices);
      IREE::LinalgExt::MapScatterOp distributedMapScatter =
          clone(rewriter, mapScatterOp, mapScatterOp.getResultTypes(),
                {distributedInput, mapScatterOp.getOutput()});
      // The slice may be rank-reduced relative to the original input; the
      // leading `rankDiff` dims were unit-sized and dropped by the extract.
      int64_t sliceRank = distributedInput.getType().getRank();
      int64_t rankDiff = input.getType().getRank() - sliceRank;
      // Add the distributed offsets in the map_scatter transformation body.
      auto transformationBuilder = [&](ArrayRef<BlockArgument> newIndices) {
        SmallVector<Value> replacementIndices(distributedOffsets);
        for (auto [i, replacementIdx] : llvm::enumerate(replacementIndices)) {
          // Rank-reduced dimensions can be directly replaced by the distributed
          // index, since their size is 1 in the new map_scatter input.
          if (i < rankDiff) {
            continue;
          }
          // Otherwise, the dimension is a contiguous element dimension, so
          // the mapping is achieved by adding the corresponding block argument
          // to the sliced index.
          BlockArgument newTransformationIdx = newIndices[i - rankDiff];
          replacementIdx = rewriter.create<arith::AddIOp>(
              loc, newTransformationIdx, replacementIdx);
        }
        return replacementIndices;
      };
      // Prepend the index remapping so the cloned op's original transformation
      // region now operates on undistributed-space indices.
      distributedMapScatter.insertTransformationAtStart(
          rewriter, transformationBuilder, sliceRank);
    }

    // All tiles have been scattered; the original op is no longer needed.
    rewriter.eraseOp(mapScatterOp);
    return success();
  }

  Value threadId;
  int64_t subgroupSize;
};
781+
679782
static VectorValue broadcastToShape(RewriterBase &rewriter, Value source,
680783
ArrayRef<int64_t> shape,
681784
ArrayRef<bool> broadcastedDims) {
@@ -2030,8 +2133,8 @@ void populateGPUDistributeNestedLayoutAttrPatterns(RewritePatternSet &patterns,
20302133
int64_t subgroupSize,
20312134
int64_t maxBitsPerShuffle) {
20322135
patterns.add<DistributeTransferRead, DistributeTransferWrite,
2033-
DistributeTransferGather>(patterns.getContext(), threadId,
2034-
subgroupSize);
2136+
DistributeTransferGather, DistributeMapScatter>(
2137+
patterns.getContext(), threadId, subgroupSize);
20352138
patterns.add<DistributeBroadcast, DistributeTranspose>(patterns.getContext());
20362139
patterns.add<DistributeMultiReduction>(patterns.getContext(), subgroupSize,
20372140
maxBitsPerShuffle);

compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_nested_layout_vector_distribution.mlir

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1346,3 +1346,66 @@ builtin.module attributes { transform.with_named_sequence } {
13461346

13471347
// CHECK-LABEL: @paged_transfer_gather_multi_index
13481348
// CHECK-COUNT-4: vector_ext.transfer_gather
1349+
1350+
// -----
1351+
1352+
// Row-major nested layout: one subgroup, 2x2 batches of 1x8 element tiles
// distributed across 8 threads along the rows (16x16 total).
#layout_row_major = #iree_vector_ext.nested_layout<
  subgroup_tile = [1, 1],
  batch_tile = [2, 2],
  outer_tile = [1, 1],
  thread_tile = [8, 1],
  element_tile = [1, 8],

  subgroup_strides = [1, 1],
  thread_strides = [1, 1]
>

// map_scatter with an identity index transformation (always-true mask);
// distribution should unroll it into one map_scatter per element tile.
func.func @distribute_map_scatter_row_major(%root: vector<16x16xf16>, %output: memref<64x64xf16>) {
  %rootl = iree_vector_ext.to_layout %root to layout(#layout_row_major) : vector<16x16xf16>
  iree_linalg_ext.map_scatter %rootl into %output {
  ^bb0(%idx0: index, %idx1: index):
    %mask = arith.constant true
    iree_linalg_ext.yield %idx0, %idx1, %mask : index, index, i1
  } : vector<16x16xf16> into memref<64x64xf16>
  func.return
}

// Transform script driving the GPU vector-distribution test pass on the
// enclosing function.
builtin.module attributes { transform.with_named_sequence } {
  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
    transform.iree.test_gpu_vector_distribution %top_level_func : !transform.any_op
    transform.yield
  }
}
1380+
1381+
// CHECK-LABEL: @distribute_map_scatter_row_major
1382+
// CHECK-DAG: %[[IDX:.+]] = gpu.thread_id x
1383+
// CHECK-DAG: %[[C8:.+]] = arith.constant 8 : index
1384+
// CHECK-DAG: %[[LANEX:.+]]:2 = affine.delinearize_index %[[IDX]] into (8)
1385+
// CHECK-DAG: %[[SLICE0:.+]] = vector.extract %{{.*}}[0, 0, 0, 0]
1386+
// CHECK: iree_linalg_ext.map_scatter %[[SLICE0]]
1387+
// CHECK: ^bb0(%[[IDX0:.+]]: index, %[[IDX1:.+]]: index):
1388+
// CHECK: %[[DISTRIBUTED_IDX0:.+]] = arith.addi %[[IDX0]], %[[LANEX]]#1
1389+
// CHECK: iree_linalg_ext.yield %[[DISTRIBUTED_IDX0]], %[[IDX1]]
1390+
// CHECK: : vector<1x8xf16> into memref<64x64xf16>
1391+
// CHECK: %[[SLICE1:.+]] = vector.extract %{{.*}}[0, 1, 0, 0]
1392+
// CHECK: iree_linalg_ext.map_scatter %[[SLICE1]]
1393+
// CHECK: ^bb0(%[[IDX0:.+]]: index, %[[IDX1:.+]]: index):
1394+
// CHECK-DAG: %[[DISTRIBUTED_IDX0:.+]] = arith.addi %[[IDX0]], %[[LANEX]]#1
1395+
// CHECK-DAG: %[[DISTRIBUTED_IDX1:.+]] = arith.addi %[[IDX1]], %[[C8]]
1396+
// CHECK: iree_linalg_ext.yield %[[DISTRIBUTED_IDX0]], %[[DISTRIBUTED_IDX1]]
1397+
// CHECK: : vector<1x8xf16> into memref<64x64xf16>
1398+
// CHECK-DAG: %[[LANEX_PLUS_VECDIMX:.+]] = affine.linearize_index disjoint [%c1, %[[LANEX]]#1] by (2, 8)
1399+
// CHECK-DAG: %[[SLICE2:.+]] = vector.extract %{{.*}}[1, 0, 0, 0]
1400+
// CHECK: iree_linalg_ext.map_scatter %[[SLICE2]]
1401+
// CHECK: ^bb0(%[[IDX0:.+]]: index, %[[IDX1:.+]]: index):
1402+
// CHECK: %[[DISTRIBUTED_IDX0:.+]] = arith.addi %[[IDX0]], %[[LANEX_PLUS_VECDIMX]]
1403+
// CHECK: iree_linalg_ext.yield %[[DISTRIBUTED_IDX0]], %[[IDX1]]
1404+
// CHECK: : vector<1x8xf16> into memref<64x64xf16>
1405+
// CHECK: %[[SLICE3:.+]] = vector.extract %{{.*}}[1, 1, 0, 0]
1406+
// CHECK: iree_linalg_ext.map_scatter %[[SLICE3]]
1407+
// CHECK: ^bb0(%[[IDX0:.+]]: index, %[[IDX1:.+]]: index):
1408+
// CHECK-DAG: %[[DISTRIBUTED_IDX0:.+]] = arith.addi %[[IDX0]], %[[LANEX_PLUS_VECDIMX]]
1409+
// CHECK-DAG: %[[DISTRIBUTED_IDX1:.+]] = arith.addi %[[IDX1]], %[[C8]]
1410+
// CHECK: iree_linalg_ext.yield %[[DISTRIBUTED_IDX0]], %[[DISTRIBUTED_IDX1]]
1411+
// CHECK: : vector<1x8xf16> into memref<64x64xf16>

0 commit comments

Comments
 (0)