
Commit 1110ac1

Groverkss and qedawkins authored
[VectorDistribute] Implement layout analysis for transfer_gather (#21164)
Also improves the implementation of mask layout inference for transfer_gather operations.

Co-authored-by: Quinn Dawkins <[email protected]>
1 parent 278e249 commit 1110ac1

4 files changed: +219 −36 lines

compiler/src/iree/compiler/Codegen/Common/VectorLayoutAnalysis.cpp

Lines changed: 61 additions & 36 deletions
@@ -845,30 +845,15 @@ static void enforceLayoutToTransferReadOp(
     return;
   }
 
-  // Build a transposed layout.
-  SmallVector<unsigned> permutation;
-  AffineMap permMap = read.getPermutationMap();
-  bool isSupportedPerm =
-      permMap.isPermutationOfMinorIdentityWithBroadcasting(permutation);
-  VectorLayoutInterface layout = result->getLayout();
-  SmallVector<int64_t> transposePerm(permutation.begin(), permutation.end());
-  if (isSupportedPerm) {
-    layout = layout.permute(transposePerm);
-    AffineMap toMinorIdentity =
-        AffineMap::getPermutationMap(permutation, permMap.getContext());
-    AffineMap orderedMap = toMinorIdentity.compose(permMap);
-    SmallVector<bool> droppedDims(layout.getRank(), false);
-    for (unsigned bdim : orderedMap.getBroadcastDims()) {
-      droppedDims[bdim] = true;
-    }
-    layout = layout.project(droppedDims);
+  DistributionLayout *maskLattice = operandLattices[0];
 
-    for (auto [index, operandLattice] : llvm::enumerate(operandLattices)) {
-      ChangeResult changed = operandLattice->resolveWithPossibleConflict(
-          layout, getOpOperand(read, index));
-      update(operandLattice, changed);
-    }
-  }
+  VectorLayoutInterface layout = result->getLayout();
+  AffineMap maskMap =
+      inversePermutation(compressUnusedDims(read.getPermutationMap()));
+  VectorLayoutInterface maskLayout = layout.apply(maskMap);
+  ChangeResult changed = maskLattice->resolveWithPossibleConflict(
+      maskLayout, getOpOperand(read, 0));
+  update(maskLattice, changed);
 }
 
 static void enforceLayoutToTransferWriteOp(
@@ -890,22 +875,56 @@ static void enforceLayoutToTransferWriteOp(
     return;
   }
 
-  // Build a transposed layout.
-  SmallVector<unsigned> permutation;
-  AffineMap permMap = write.getPermutationMap();
-  bool isSupportedPerm =
-      permMap.isPermutationOfMinorIdentityWithBroadcasting(permutation);
+  DistributionLayout *maskLattice = operandLattices[1];
+
   VectorLayoutInterface layout = writeOperand->getLayout();
-  SmallVector<int64_t> transposePerm(permutation.begin(), permutation.end());
-  if (isSupportedPerm) {
-    layout = layout.permute(transposePerm);
+  AffineMap maskMap =
+      inversePermutation(compressUnusedDims(write.getPermutationMap()));
+  VectorLayoutInterface maskLayout = layout.apply(maskMap);
+  ChangeResult changed = maskLattice->resolveWithPossibleConflict(
+      maskLayout, getOpOperand(write, 1));
+  update(maskLattice, changed);
+}
+
+static void enforceLayoutToTransferGatherOp(
+    TransferGatherOp gather, ArrayRef<DistributionLayout *> operandLattices,
+    ArrayRef<const DistributionLayout *> resultLattices,
+    std::function<void(DistributionLayout *, ChangeResult)> update) {
+  if (resultLattices.empty()) {
+    return;
+  }
+
+  // transfer_gather has only one vector result.
+  const DistributionLayout *result = resultLattices[0];
+  // Cannot enforce layout if result is uninitialized.
+  if (result->isUninitialized()) {
+    return;
+  }
+  VectorLayoutInterface layout = result->getLayout();
+
+  ArrayRef<DistributionLayout *> indexVecLattices =
+      operandLattices.slice(0, gather.getIndexVecs().size());
+  AffineMap sourceMap =
+      inverseAndBroadcastProjectedPermutation(gather.getPermutationMap());
+  VectorLayoutInterface sourceLayout = layout.apply(sourceMap);
+  for (auto [i, lattice, operand] :
+       llvm::enumerate(indexVecLattices, gather.getIndexVecsMutable())) {
+    AffineMap indexVecMap = gather.getIndexedMapsArray()[i];
+    VectorLayoutInterface indexVecLayout = sourceLayout.apply(indexVecMap);
+    ChangeResult changed =
+        lattice->resolveWithPossibleConflict(indexVecLayout, operand);
+    update(lattice, changed);
   }
 
-  for (auto [index, operandLattice] :
-       llvm::enumerate(operandLattices.slice(1))) {
-    ChangeResult changed = operandLattice->resolveWithPossibleConflict(
-        layout, getOpOperand(write, index + 1));
-    update(operandLattice, changed);
+  if (gather.getMask()) {
+    DistributionLayout *maskLattice =
+        operandLattices[gather.getIndexVecs().size()];
+    AffineMap maskMap =
+        inversePermutation(compressUnusedDims(gather.getPermutationMap()));
+    VectorLayoutInterface maskLayout = layout.apply(maskMap);
+    ChangeResult changed = maskLattice->resolveWithPossibleConflict(
+        maskLayout, gather.getMaskMutable()[0]);
+    update(maskLattice, changed);
   }
 }
 
@@ -964,6 +983,12 @@ void enforcementTransferFunction(
                                   update);
     return;
   }
+
+  if (auto gather = dyn_cast<TransferGatherOp>(op)) {
+    enforceLayoutToTransferGatherOp(gather, operandLattices, resultLattices,
+                                    update);
+    return;
+  }
 }
 
 /// ==========================================================================

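Instead of the old permute/project gymnastics, the mask operand's layout is now derived in one step as layout.apply(inversePermutation(compressUnusedDims(permutationMap))), the same map recipe used to compute a transfer op's mask shape from its permutation map. Below is a minimal standalone sketch of that derivation (plain C++ with no MLIR dependencies; MapResult, compressUnusedDims, inversePermutation, and applyToTiles here are simplified stand-ins that only model the element_tile component, not the real MLIR/IREE APIs):

// Standalone illustration only: models element_tile propagation through the
// mask map; these helpers are simplified stand-ins, not the MLIR/IREE APIs.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

// One result of a transfer permutation map: a dim position, or nullopt for a
// broadcast (the constant-0 result in the real AffineMap).
using MapResult = std::optional<int64_t>;

// Drop dims that never appear in the results and renumber the rest in order.
static std::vector<MapResult> compressUnusedDims(int64_t numDims,
                                                 std::vector<MapResult> results) {
  std::vector<int64_t> newPos(numDims, -1);
  int64_t next = 0;
  for (int64_t d = 0; d < numDims; ++d)
    for (const MapResult &r : results)
      if (r && *r == d && newPos[d] == -1)
        newPos[d] = next++;
  for (MapResult &r : results)
    if (r)
      r = newPos[*r];
  return results;
}

// Invert the (possibly broadcasting) permutation: entry i of the inverse is
// the vector dimension that feeds mask dimension i.
static std::vector<int64_t> inversePermutation(const std::vector<MapResult> &results) {
  int64_t numDims = 0;
  for (const MapResult &r : results)
    if (r)
      numDims = std::max(numDims, *r + 1);
  std::vector<int64_t> inv(numDims, 0);
  for (int64_t i = 0, e = results.size(); i < e; ++i)
    if (results[i])
      inv[*results[i]] = i;
  return inv;
}

// The element_tile part of layout.apply(maskMap): mask dim i inherits the
// tile of the vector dim it maps to.
static std::vector<int64_t> applyToTiles(const std::vector<int64_t> &vectorTile,
                                         const std::vector<int64_t> &maskMap) {
  std::vector<int64_t> maskTile;
  for (int64_t pos : maskMap)
    maskTile.push_back(vectorTile[pos]);
  return maskTile;
}

int main() {
  // transfer_read_mask test added below: permutation_map = (d0, d1) -> (d1, 0)
  // and a result layout with element_tile = [16, 32].
  std::vector<MapResult> permMap = {int64_t(1), std::nullopt};
  std::vector<int64_t> resultTile = {16, 32};

  auto compressed = compressUnusedDims(/*numDims=*/2, permMap); // (d0) -> (d0, 0)
  auto maskMap = inversePermutation(compressed);                // mask dim 0 <- vector dim 0
  auto maskTile = applyToTiles(resultTile, maskMap);

  for (int64_t t : maskTile)
    std::cout << t << ' '; // prints "16", matching element_tile = [16]
  std::cout << '\n';
}

Running this for the transfer_read_mask test added below reproduces its expected-remark: the rank-2 result layout [16, 32] collapses to the rank-1 mask layout [16].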
compiler/src/iree/compiler/Codegen/Common/test/vector_layout_analysis.mlir

Lines changed: 122 additions & 0 deletions
@@ -37,6 +37,71 @@ builtin.module attributes { transform.with_named_sequence } {
 
 // -----
 
+#layout = #iree_vector_ext.nested_layout<
+  subgroup_tile = [1, 1],
+  batch_tile = [1, 1],
+  outer_tile = [1, 1],
+  thread_tile = [1, 1],
+  element_tile = [16, 32],
+
+  subgroup_strides = [0, 0],
+  thread_strides = [0, 0]
+>
+
+builtin.module attributes { transform.with_named_sequence } {
+  func.func @transfer_read_mask(%arr: memref<16x32xf16>, %a: vector<16x32xf16>, %b: vector<16x32xf16>, %cond: i1) -> vector<16x32xf16> {
+    %c0 = arith.constant 0 : index
+    %c12 = arith.constant 12 : index
+    %mask = vector.create_mask %c12 : vector<16xi1>
+    // expected-remark @above {{element_tile = [16]}}
+    %cst_0 = arith.constant 0.0 : f16
+    %root = vector.transfer_read %arr[%c0, %c0], %cst_0, %mask {permutation_map = affine_map<(d0, d1) -> (d1, 0)>, in_bounds = [true, true]} : memref<16x32xf16>, vector<16x32xf16>
+    // expected-remark @above {{element_tile = [16, 32]}}
+    %rootl = iree_vector_ext.to_layout %root to layout(#layout) : vector<16x32xf16>
+    func.return %rootl : vector<16x32xf16>
+  }
+
+  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
+    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    transform.iree.test_vector_layout_analysis %top_level_func : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+#layout = #iree_vector_ext.nested_layout<
+  subgroup_tile = [1, 1, 1],
+  batch_tile = [1, 1, 1],
+  outer_tile = [1, 1, 1],
+  thread_tile = [1, 1, 1],
+  element_tile = [16, 8, 4],
+
+  subgroup_strides = [0, 0, 0],
+  thread_strides = [0, 0, 0]
+>
+
+builtin.module attributes { transform.with_named_sequence } {
+  func.func @transfer_write_mask(%arr: memref<32x32x32x32xf16>, %d: vector<16x8x4xf16>) {
+    %c0 = arith.constant 0 : index
+    %c12 = arith.constant 12 : index
+    %mask = vector.create_mask %c12, %c12, %c12 : vector<8x16x4xi1>
+    // expected-remark @above {{element_tile = [8, 16, 4]}}
+    %dl = iree_vector_ext.to_layout %d to layout(#layout) : vector<16x8x4xf16>
+    vector.transfer_write %dl, %arr[%c0, %c0, %c0, %c0], %mask {permutation_map = affine_map<(d0, d1, d2, d3) -> (d1, d0, d3)>, in_bounds = [true, true, true]} : vector<16x8x4xf16>, memref<32x32x32x32xf16>
+    return
+  }
+
+  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
+    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    transform.iree.test_vector_layout_analysis %top_level_func : !transform.any_op
+    transform.yield
+  }
+}
+
+// -----
+
+
 #layout = #iree_vector_ext.nested_layout<
   subgroup_tile = [1, 1],
   batch_tile = [1, 1],
@@ -759,3 +824,60 @@ builtin.module attributes { transform.with_named_sequence } {
     transform.yield
   }
 }
+
+// -----
+
+#layout_1d = #iree_vector_ext.nested_layout<
+  subgroup_tile = [4],
+  batch_tile = [4],
+  outer_tile = [1],
+  thread_tile = [1],
+  element_tile = [1],
+
+  subgroup_strides = [1],
+  thread_strides = [0]
+>
+
+#layout = #iree_vector_ext.nested_layout<
+  subgroup_tile = [4, 1],
+  batch_tile = [4, 1],
+  outer_tile = [1, 1],
+  thread_tile = [1, 1],
+  element_tile = [1, 8],
+
+  subgroup_strides = [1, 0],
+  thread_strides = [0, 0]
+>
+
+builtin.module attributes { transform.with_named_sequence } {
+  func.func @paged_transfer_gather(%indices: vector<16xindex>,
+    %source: memref<4096x512x8xf16>) -> vector<16x8xf16> {
+
+    %cst0 = arith.constant 0.0 : f16
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant dense<1> : vector<16xindex>
+    // expected-remark @above {{element_tile = [1]}}
+    %c7 = arith.constant 7 : index
+    %dim = memref.dim %source, %c0 : memref<4096x512x8xf16>
+    %mask = vector.create_mask %c7, %c7 : vector<16x8xi1>
+    // expected-remark @above {{element_tile = [1, 8]}}
+    %indices1 = arith.addi %indices, %c1 : vector<16xindex>
+    // expected-remark @above {{element_tile = [1]}}
+    %out = iree_vector_ext.transfer_gather %source[%c0, %c0, %c0]
+    // expected-remark @above {{element_tile = [1, 8]}}
+      [None, %indices1: vector<16xindex>, None], %cst0, %mask { indexed_maps = [
+        affine_map<(d0, d1, d2) -> (d1)>],
+        permutation_map = affine_map<(d0, d1, d2) -> (d1, d2)>,
+        in_bounds = [true, true] }
+      : memref<4096x512x8xf16>, vector<16x8xf16>
+    %l_out = iree_vector_ext.to_layout %out to layout(#layout) : vector<16x8xf16>
+
+    return %l_out : vector<16x8xf16>
+  }
+
+  transform.named_sequence @__transform_main(%variant_op: !transform.any_op {transform.readonly}) {
+    %top_level_func = transform.structured.match ops{["func.func"]} in %variant_op : (!transform.any_op) -> !transform.any_op
+    transform.iree.test_vector_layout_analysis %top_level_func : !transform.any_op
+    transform.yield
+  }
+}

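For reference, tracing the new enforcement logic through the @paged_transfer_gather test above gives the following (hand-derived from the VectorLayoutAnalysis.cpp changes; the intermediate names match the variables in that code):

  permutation_map = (d0, d1, d2) -> (d1, d2)

  sourceMap   = inverseAndBroadcastProjectedPermutation(permutation_map)
              = (d0, d1) -> (0, d0, d1)
  sourceLayout = result layout applied through sourceMap
              -> element_tile = [1, 1, 8]   (dim 0 is a broadcast: unit tile)

  indexVecMap for %indices1 = (d0, d1, d2) -> (d1)
  index vec layout = sourceLayout applied through indexVecMap
              -> element_tile = [1]

  maskMap     = inversePermutation(compressUnusedDims(permutation_map))
              = (d0, d1) -> (d0, d1)
  mask layout = result layout applied through maskMap
              -> element_tile = [1, 8]

which is exactly what the expected-remark lines on %indices1 and %mask check.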
compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtAttrs.cpp

Lines changed: 30 additions & 0 deletions
@@ -68,6 +68,36 @@ NestedLayoutAttr::project(ArrayRef<bool> droppedDims) const {
                                subgroupStrides, threadStrides);
 }
 
+VectorLayoutInterface NestedLayoutAttr::apply(AffineMap map) const {
+  assert(map.getNumDims() == getRank() &&
+         "map domain size must match layout rank");
+
+  SmallVector<int64_t> subgroupCount(map.getNumResults(), 1);
+  SmallVector<int64_t> batchCount(map.getNumResults(), 1);
+  SmallVector<int64_t> outerCount(map.getNumResults(), 1);
+  SmallVector<int64_t> threadCount(map.getNumResults(), 1);
+  SmallVector<int64_t> elementCount(map.getNumResults(), 1);
+  SmallVector<int64_t> subgroupStrides(map.getNumResults(), 0);
+  SmallVector<int64_t> threadStrides(map.getNumResults(), 0);
+
+  for (auto [idx, expr] : llvm::enumerate(map.getResults())) {
+    if (auto dim = dyn_cast<AffineDimExpr>(expr)) {
+      int64_t pos = dim.getPosition();
+      subgroupCount[idx] = getSubgroupTile()[pos];
+      batchCount[idx] = getBatchTile()[pos];
+      outerCount[idx] = getOuterTile()[pos];
+      threadCount[idx] = getThreadTile()[pos];
+      elementCount[idx] = getElementTile()[pos];
+      subgroupStrides[idx] = getSubgroupStrides()[pos];
+      threadStrides[idx] = getThreadStrides()[pos];
+    }
+  }
+
+  return NestedLayoutAttr::get(getContext(), subgroupCount, batchCount,
+                               outerCount, threadCount, elementCount,
+                               subgroupStrides, threadStrides);
+}
+
 VectorLayoutInterface
 NestedLayoutAttr::permute(ArrayRef<int64_t> permutation) const {
   SmallVector<int64_t> invPerm = invertPermutationVector(permutation);

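NestedLayoutAttr::apply simply re-indexes every tile and stride array by the map's results: a dim result copies that dim's sizes and strides, while any other result (for example the constant 0 produced by broadcasting maps) keeps unit tiles and zero strides. A standalone model of that behavior is sketched below (plain C++, no MLIR; SimpleNestedLayout, applyMap, and the optional-based map encoding are hypothetical stand-ins that only carry the tile/stride fields of the real attribute):

// Illustration only: a simplified model of NestedLayoutAttr::apply, not the
// IREE implementation (which operates on the real AffineMap/attribute types).
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <optional>
#include <vector>

struct SimpleNestedLayout {
  std::vector<int64_t> subgroupTile, batchTile, outerTile, threadTile,
      elementTile, subgroupStrides, threadStrides;
};

// A map result is either a dim position or nullopt (a constant/broadcast
// expression). Dim results copy the source layout's entry for that dim;
// everything else keeps the defaults: unit tiles, zero strides.
static SimpleNestedLayout applyMap(const SimpleNestedLayout &layout,
                                   const std::vector<std::optional<int64_t>> &results) {
  std::size_t n = results.size();
  SimpleNestedLayout out{std::vector<int64_t>(n, 1), std::vector<int64_t>(n, 1),
                         std::vector<int64_t>(n, 1), std::vector<int64_t>(n, 1),
                         std::vector<int64_t>(n, 1), std::vector<int64_t>(n, 0),
                         std::vector<int64_t>(n, 0)};
  for (std::size_t idx = 0; idx < n; ++idx) {
    if (!results[idx])
      continue; // broadcast: keep unit tile / zero stride
    std::size_t pos = static_cast<std::size_t>(*results[idx]);
    out.subgroupTile[idx] = layout.subgroupTile[pos];
    out.batchTile[idx] = layout.batchTile[pos];
    out.outerTile[idx] = layout.outerTile[pos];
    out.threadTile[idx] = layout.threadTile[pos];
    out.elementTile[idx] = layout.elementTile[pos];
    out.subgroupStrides[idx] = layout.subgroupStrides[pos];
    out.threadStrides[idx] = layout.threadStrides[pos];
  }
  return out;
}

int main() {
  // #layout from the paged_transfer_gather test: subgroup_tile [4, 1],
  // batch_tile [4, 1], element_tile [1, 8], subgroup_strides [1, 0].
  SimpleNestedLayout result{{4, 1}, {4, 1}, {1, 1}, {1, 1}, {1, 8}, {1, 0}, {0, 0}};

  // sourceMap = inverseAndBroadcastProjectedPermutation((d0, d1, d2) -> (d1, d2))
  //           = (d0, d1) -> (0, d0, d1): dim 0 of the source is a broadcast.
  SimpleNestedLayout source =
      applyMap(result, {std::nullopt, int64_t(0), int64_t(1)});

  // indexed map for %indices1 is (d0, d1, d2) -> (d1): pick source dim 1.
  SimpleNestedLayout indexVec = applyMap(source, {int64_t(1)});

  std::cout << "source element_tile    = [" << source.elementTile[0] << ", "
            << source.elementTile[1] << ", " << source.elementTile[2] << "]\n";
  std::cout << "index-vec element_tile = [" << indexVec.elementTile[0] << "]\n";
  // Prints [1, 1, 8] and [1], consistent with the expected-remarks above.
}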
compiler/src/iree/compiler/Codegen/Dialect/VectorExt/IR/VectorExtInterfaces.td

Lines changed: 6 additions & 0 deletions
@@ -35,6 +35,12 @@ def VectorLayoutInterface : AttrInterface<"VectorLayoutInterface"> {
       /*methodName=*/"project",
       /*args=*/(ins "::llvm::ArrayRef<bool>":$droppedDims)
     >,
+    InterfaceMethod<
+      /*description=*/"Apply the given AffineMap to the layout.",
+      /*retTy=*/"VectorLayoutInterface",
+      /*methodName=*/"apply",
+      /*args=*/(ins "::mlir::AffineMap":$map)
+    >,
     InterfaceMethod<
       /*description=*/"Get the expected undistributed shape for the given vector type.",
       /*retTy=*/"SmallVector<int64_t>",
