
Commit 3c4a5aa

Address feedback
1 parent b4f5a4d commit 3c4a5aa

6 files changed, +163 -91 lines changed

mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td

Lines changed: 4 additions & 4 deletions
@@ -171,7 +171,7 @@ def XeGPU_WGLevel: I32EnumAttrCase<"WG", 0, "wg">;
 def XeGPU_SGLevel: I32EnumAttrCase<"SG", 1, "sg">;
 def XeGPU_WILevel: I32EnumAttrCase<"WI", 2, "wi">;
 def XeGPU_DistributionLevel: I32EnumAttr<"DistributionLevel",
-    "The enumeration for the scope of fence operation.",
+    "Specify target level for offsets distribution utility.",
     [XeGPU_WGLevel, XeGPU_SGLevel, XeGPU_WILevel]> {
   let genSpecializedAttr = 0;
   let cppNamespace = "::mlir::xegpu";
@@ -243,7 +243,7 @@ def DistributeLayoutAttr: AttrInterface<"DistributeLayoutAttr"> {
       represents the higher-level problem size. Each `level` may access
      multiple blocks according to round-robin distribution rules.}],
     "FailureOr<SmallVector<SmallVector<Value>>>",
-    "computeDistributedCoords",
+    "computeDistributedOffsets",
     (ins "OpBuilder &": $builder, "Location":$loc, "Value":$linearId, "ArrayRef<int64_t>":$shape, "xegpu::DistributionLevel": $level)>,
     InterfaceMethod</*desc=*/[{Check if this layout can be achieved by applying a transpose
       to some other layout according to given permutation of (0...n-1).}],
@@ -496,7 +496,7 @@ def XeGPU_LayoutAttr : XeGPUAttr<"Layout", "layout", [DistributeLayoutAttr]> {
     /// represents the higher-level problem size. Each `level` may access
     /// multiple blocks according to round-robin distribution rules.
     FailureOr<SmallVector<SmallVector<Value>>>
-    computeDistributedCoords(OpBuilder &builder, Location loc, Value linearId, ArrayRef<int64_t> shape, xegpu::DistributionLevel level);
+    computeDistributedOffsets(OpBuilder &builder, Location loc, Value linearId, ArrayRef<int64_t> shape, xegpu::DistributionLevel level);

     /// Check if this is slice of some other layout.
     bool isSliceOf(const xegpu::DistributeLayoutAttr &other) { return false; }
@@ -661,7 +661,7 @@ def XeGPU_SliceAttr : XeGPUAttr<"Slice", "slice", [DistributeLayoutAttr]> {
     /// multiple blocks according to round-robin distribution rules.

     FailureOr<SmallVector<SmallVector<Value>>>
-    computeDistributedCoords(OpBuilder &builder, Location loc, Value linearId,ArrayRef<int64_t> shape, xegpu::DistributionLevel level);
+    computeDistributedOffsets(OpBuilder &builder, Location loc, Value linearId,ArrayRef<int64_t> shape, xegpu::DistributionLevel level);

     /// Check if this is slice of some other layout.
     bool isSliceOf(const xegpu::DistributeLayoutAttr &other);
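
For orientation, a hypothetical caller of the renamed hook could look like the sketch below (illustrative only; it assumes the usual MLIR includes and mirrors the call sites touched later in this commit):

static LogicalResult useDistributedOffsets(OpBuilder &builder, Location loc,
                                           xegpu::DistributeLayoutAttr layout,
                                           Value linearId,
                                           ArrayRef<int64_t> shape) {
  // Pick the target granularity: SG distributes a workgroup-level shape
  // across subgroups; WI distributes a subgroup-level shape across lanes.
  FailureOr<SmallVector<SmallVector<Value>>> maybeOffsets =
      layout.computeDistributedOffsets(builder, loc, linearId, shape,
                                       xegpu::DistributionLevel::SG);
  if (failed(maybeOffsets))
    return failure();
  // One offset vector per sub-shape this id accesses under the
  // round-robin distribution rules.
  for (const SmallVector<Value> &offsets : *maybeOffsets)
    (void)offsets; // materialize loads/stores at these offsets here
  return success();
}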

mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp

Lines changed: 13 additions & 8 deletions
@@ -41,6 +41,11 @@ void XeGPUDialect::initialize() {
 // A `srcShape` consists of N distribution units, each being `subShapesLayout` x
 // `subShape`. A `delinearizedId` is used to identify a particular `subShape`
 // within each distribution unit.
+// Example:
+//   WG data is 128x256. SG data is 16x32, in a 4x2 layout; this gives a
+//   distribution unit of shape 64x64, and we have 2x4 such distribution units.
+//   `delinearizedId` identifies the 16x32 tile of a subgroup in each
+//   distribution unit.
 static SmallVector<SmallVector<Value>>
 genOffsets(OpBuilder &builder, Location loc, SmallVector<Value> delinearizedId,
            ArrayRef<int64_t> subShapesLayout, ArrayRef<int64_t> subShape,
@@ -294,13 +299,13 @@ LayoutAttr::delinearizeId(OpBuilder &builder, Location loc, Value linearId,
   return affine::delinearizeIndex(builder, loc, linearId, dims);
 }

-/// Implements DistributeLayoutAttr::computeDistributedCoords to generate
+/// Implements DistributeLayoutAttr::computeDistributedOffsets to generate
 /// instructions for computing multi-dimensional offsets when distributed by
 /// LayoutAttr.
 FailureOr<SmallVector<SmallVector<Value>>>
-LayoutAttr::computeDistributedCoords(OpBuilder &builder, Location loc,
-                                     Value linearId, ArrayRef<int64_t> shape,
-                                     xegpu::DistributionLevel targetLevel) {
+LayoutAttr::computeDistributedOffsets(OpBuilder &builder, Location loc,
+                                      Value linearId, ArrayRef<int64_t> shape,
+                                      xegpu::DistributionLevel targetLevel) {
   SmallVector<int64_t> layout;
   SmallVector<int64_t> subShape;
   if (targetLevel == DistributionLevel::SG) {
@@ -386,13 +391,13 @@ SliceAttr::delinearizeId(OpBuilder &builder, Location loc, Value linearId,
   return parent.delinearizeId(builder, loc, linearId, level);
 }

-// Implements DistributeLayoutAttr::computeDistributedCoords to generate
+// Implements DistributeLayoutAttr::computeDistributedOffsets to generate
 // instructions for computing multi-dimensional offsets when distributed by
 // LayoutAttr.
 FailureOr<SmallVector<SmallVector<Value>>>
-SliceAttr::computeDistributedCoords(OpBuilder &builder, Location loc,
-                                    Value linearId, ArrayRef<int64_t> shape,
-                                    xegpu::DistributionLevel targetLevel) {
+SliceAttr::computeDistributedOffsets(OpBuilder &builder, Location loc,
+                                     Value linearId, ArrayRef<int64_t> shape,
+                                     xegpu::DistributionLevel targetLevel) {
   assert(getRank() == static_cast<int64_t>(shape.size()) && "invalid shape.");
   if (!isForWorkgroup())
     return failure();
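
A quick standalone check of the arithmetic in the new comment (plain C++, not the IR-building code above; the subgroup id below is an arbitrary illustration): a 128x256 WG shape over a 4x2 layout of 16x32 SG tiles gives 64x64 distribution units, 2x4 of them, so each subgroup receives 8 tiles round-robin.

#include <cstdint>
#include <cstdio>

int main() {
  const int64_t srcShape[2] = {128, 256}; // workgroup-level data
  const int64_t subShape[2] = {16, 32};   // per-subgroup tile
  const int64_t sgLayout[2] = {4, 2};     // layout of subgroups
  // One distribution unit spans sgLayout * subShape elements: 64x64.
  const int64_t unit[2] = {sgLayout[0] * subShape[0],
                           sgLayout[1] * subShape[1]};
  // Number of distribution units per dimension: 2x4 here.
  const int64_t units[2] = {srcShape[0] / unit[0], srcShape[1] / unit[1]};
  // Delinearized id of one subgroup within the 4x2 layout.
  const int64_t id[2] = {3, 1};
  // Round-robin: this subgroup touches one 16x32 tile in every unit.
  for (int64_t u = 0; u < units[0]; ++u)
    for (int64_t v = 0; v < units[1]; ++v)
      std::printf("offset = (%lld, %lld)\n",
                  (long long)(u * unit[0] + id[0] * subShape[0]),
                  (long long)(v * unit[1] + id[1] * subShape[1]));
  return 0;
}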

mlir/lib/Dialect/XeGPU/Transforms/XeGPUSubgroupDistribute.cpp

Lines changed: 133 additions & 70 deletions
@@ -907,34 +907,48 @@ struct StoreDistribution final : public gpu::WarpDistributionPattern {
   }
 };

-template <class MatrixOp>
-struct MatrixOpDistribution final : public gpu::WarpDistributionPattern {
+static SmallVector<Value> computeDistributedOffsetsForMatrixOp(
+    PatternRewriter &rewriter, Location loc, xegpu::DistributeLayoutAttr layout,
+    Value laneId, ArrayRef<int64_t> payloadShape, ValueRange origOffsets) {
+  SmallVector<Value> newOffsets;
+  ;
+  auto maybeDescOffsets = layout.computeDistributedOffsets(
+      rewriter, loc, laneId, payloadShape, xegpu::DistributionLevel::WI);
+  if (failed(maybeDescOffsets))
+    return {};
+  assert(maybeDescOffsets.value().size() == 1 &&
+         "Expected one set of distributed offsets");
+  SmallVector<OpFoldResult> ofrVec = xegpu::addWithRightAligned(
+      rewriter, loc, getAsOpFoldResult(maybeDescOffsets.value()[0]),
+      getAsOpFoldResult(origOffsets));
+  newOffsets = llvm::to_vector(llvm::map_range(
+      ofrVec, [&](OpFoldResult ofr) -> Value { return cast<Value>(ofr); }));
+  return newOffsets;
+}
+
+/// Pattern for distributing xegpu::LoadMatrixOp.
+struct LoadMatrixDistribution final : public gpu::WarpDistributionPattern {
   using gpu::WarpDistributionPattern::WarpDistributionPattern;
   LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
                                 PatternRewriter &rewriter) const override {
     gpu::YieldOp yield = warpOp.getTerminator();
     Operation *lastNode = yield->getPrevNode();
-    auto matrixOp = dyn_cast_or_null<MatrixOp>(lastNode);
+    auto matrixOp = dyn_cast_or_null<xegpu::LoadMatrixOp>(lastNode);
     if (!matrixOp)
       return failure();
-    constexpr bool isLoad{std::is_same_v<MatrixOp, xegpu::LoadMatrixOp>};
-    int operandIdx{-1};
-
-    VectorType sgPayloadTy;
-    VectorType warpResultTy;
-    if constexpr (isLoad) {
-      OpOperand *producedByLastLoad = getWarpResult(warpOp, [&](Operation *op) {
-        return isa<xegpu::LoadMatrixOp>(op) && matrixOp == op;
-      });
-      if (!producedByLastLoad)
-        return rewriter.notifyMatchFailure(
-            warpOp, "The last op is not xegpu::LoadMatrixOp");
-      operandIdx = producedByLastLoad->getOperandNumber();
-      sgPayloadTy = dyn_cast<VectorType>(matrixOp.getResult().getType());
-      warpResultTy = cast<VectorType>(warpOp.getResult(operandIdx).getType());
-    } else {
-      sgPayloadTy = dyn_cast<VectorType>(matrixOp.getData().getType());
-    }
+
+    OpOperand *producedByLastLoad = getWarpResult(warpOp, [&](Operation *op) {
+      return isa<xegpu::LoadMatrixOp>(op) && matrixOp == op;
+    });
+    if (!producedByLastLoad)
+      return rewriter.notifyMatchFailure(
+          warpOp, "The last op is not xegpu::LoadMatrixOp");
+    const int operandIdx = producedByLastLoad->getOperandNumber();
+
+    VectorType sgPayloadTy =
+        dyn_cast<VectorType>(matrixOp.getResult().getType());
+    VectorType warpResultTy =
+        cast<VectorType>(warpOp.getResult(operandIdx).getType());
     if (!sgPayloadTy)
       return rewriter.notifyMatchFailure(
           matrixOp, "the matrix op payload must be a vector type");
@@ -956,21 +970,14 @@ struct MatrixOpDistribution final : public gpu::WarpDistributionPattern {
         getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy);
     if (failed(distPayloadByWarpOpOrFailure))
       return rewriter.notifyMatchFailure(
-          matrixOp,
-          "The matrix op payload has no layouts, using defaults instead.");
-
-    SmallVector<Value> operands;
-    if constexpr (isLoad)
-      operands = {matrixOp.getMemDesc()};
-    else
-      operands = {matrixOp.getData(), matrixOp.getMemDesc()};
+          matrixOp, "The matrix op payload has no layout.");
+
+    SmallVector<Value> operands = {matrixOp.getMemDesc()};
     const unsigned offsetsStartIdx = operands.size();
     operands.append(offsetsAsValues);

     SmallVector<Type> operandTypes = llvm::to_vector(
         llvm::map_range(operands, [](Value v) { return v.getType(); }));
-    if constexpr (!isLoad)
-      operandTypes[0] = *distPayloadByWarpOpOrFailure;

     SmallVector<size_t> newRetIndices;
     gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
@@ -986,40 +993,97 @@ struct MatrixOpDistribution final : public gpu::WarpDistributionPattern {
     ValueRange currentOffsets =
         ValueRange(newOperands).drop_front(offsetsStartIdx);

-    rewriter.setInsertionPointAfter(newWarpOp);
     SmallVector<Value> newOffsets = currentOffsets;
+    rewriter.setInsertionPointAfter(newWarpOp);
+
     if (!matrixOp.getSubgroupBlockIoAttr()) {
-      auto maybeDescOffsets = layout.computeDistributedCoords(
-          rewriter, loc, newWarpOp.getLaneid(), sgPayloadTy.getShape(),
-          xegpu::DistributionLevel::WI);
-      if (failed(maybeDescOffsets))
-        return failure();
-      assert(maybeDescOffsets.value().size() == 1 &&
-             "Expected same number of offset sets as number of accessed "
-             "sub-tensors or sub-memory descriptors.");
-      SmallVector<OpFoldResult> ofrVec = xegpu::addWithRightAligned(
-          rewriter, loc, getAsOpFoldResult(maybeDescOffsets.value()[0]),
-          offsets);
-      newOffsets = llvm::to_vector(llvm::map_range(
-          ofrVec, [&](OpFoldResult ofr) -> Value { return cast<Value>(ofr); }));
+      newOffsets = computeDistributedOffsetsForMatrixOp(
+          rewriter, loc, layout, newWarpOp.getLaneid(), sgPayloadTy.getShape(),
+          currentOffsets);
     }
+    xegpu::LoadMatrixOp newOp = xegpu::LoadMatrixOp::create(
+        rewriter, newWarpOp.getLoc(), *distPayloadByWarpOpOrFailure,
+        newOperands[0], ValueRange(newOffsets), newConstOffsetsAttr,
+        matrixOp.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{});
+    // Resolve the output type and replace all uses.
+    rewriter.replaceAllUsesWith(
+        newWarpOp.getResult(operandIdx),
+        resolveDistributedTy(newOp.getResult(), warpResultTy, rewriter));
+    return success();
+  }
+};

-    if constexpr (isLoad) {
-      xegpu::LoadMatrixOp newOp = xegpu::LoadMatrixOp::create(
-          rewriter, newWarpOp.getLoc(), *distPayloadByWarpOpOrFailure,
-          newOperands[0], ValueRange(newOffsets), newConstOffsetsAttr,
-          matrixOp.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{});
-      // Resolve the output type and replace all uses.
-      rewriter.replaceAllUsesWith(
-          newWarpOp.getResult(operandIdx),
-          resolveDistributedTy(newOp.getResult(), warpResultTy, rewriter));
-    } else {
-      xegpu::StoreMatrixOp::create(
-          rewriter, loc, TypeRange{}, newOperands[0], newOperands[1],
-          ValueRange(newOffsets), newConstOffsetsAttr,
-          matrixOp.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{});
-      rewriter.eraseOp(matrixOp);
+/// Pattern for distributing xegpu::StoreMatrixOp.
+struct StoreMatrixDistribution final : public gpu::WarpDistributionPattern {
+  using gpu::WarpDistributionPattern::WarpDistributionPattern;
+  LogicalResult matchAndRewrite(gpu::WarpExecuteOnLane0Op warpOp,
+                                PatternRewriter &rewriter) const override {
+    gpu::YieldOp yield = warpOp.getTerminator();
+    Operation *lastNode = yield->getPrevNode();
+    auto matrixOp = dyn_cast_or_null<xegpu::StoreMatrixOp>(lastNode);
+    if (!matrixOp)
+      return failure();
+
+    VectorType sgPayloadTy = dyn_cast<VectorType>(matrixOp.getData().getType());
+    if (!sgPayloadTy)
+      return rewriter.notifyMatchFailure(
+          matrixOp, "the matrix op payload must be a vector type");
+
+    auto loc = matrixOp.getLoc();
+    auto offsets = matrixOp.getMixedOffsets();
+    if (offsets.empty())
+      return rewriter.notifyMatchFailure(matrixOp,
+                                         "the store op must have offsets");
+    SmallVector<Value> offsetsAsValues =
+        vector::getAsValues(rewriter, matrixOp.getLoc(), offsets);
+
+    auto layout = matrixOp.getLayoutAttr();
+    if (!layout)
+      return rewriter.notifyMatchFailure(
+          matrixOp, "the matrix operation lacks layout attribute");
+
+    FailureOr<VectorType> distPayloadByWarpOpOrFailure =
+        getDistVecTypeBasedOnLaneLayout(layout, sgPayloadTy);
+    if (failed(distPayloadByWarpOpOrFailure))
+      return rewriter.notifyMatchFailure(
+          matrixOp, "The matrix op payload has no layout.");
+
+    SmallVector<Value> operands = {matrixOp.getData(), matrixOp.getMemDesc()};
+    const unsigned offsetsStartIdx = operands.size();
+    operands.append(offsetsAsValues);
+
+    SmallVector<Type> operandTypes = llvm::to_vector(
+        llvm::map_range(operands, [](Value v) { return v.getType(); }));
+    operandTypes[0] = *distPayloadByWarpOpOrFailure;
+
+    SmallVector<size_t> newRetIndices;
+    gpu::WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+        rewriter, warpOp, operands, operandTypes, newRetIndices);
+    SmallVector<Value> newOperands = llvm::map_to_vector(
+        newRetIndices, [&](size_t idx) { return newWarpOp.getResult(idx); });
+
+    SmallVector<int64_t> newConstOffsets{matrixOp.getConstOffsets()};
+    std::fill(newConstOffsets.begin(), newConstOffsets.end(),
+              ShapedType::kDynamic);
+    DenseI64ArrayAttr newConstOffsetsAttr =
+        rewriter.getDenseI64ArrayAttr(newConstOffsets);
+    ValueRange currentOffsets =
+        ValueRange(newOperands).drop_front(offsetsStartIdx);
+
+    SmallVector<Value> newOffsets = currentOffsets;
+    rewriter.setInsertionPointAfter(newWarpOp);
+
+    if (!matrixOp.getSubgroupBlockIoAttr()) {
+      newOffsets = computeDistributedOffsetsForMatrixOp(
+          rewriter, loc, layout, newWarpOp.getLaneid(), sgPayloadTy.getShape(),
+          currentOffsets);
     }
+
+    xegpu::StoreMatrixOp::create(
+        rewriter, loc, TypeRange{}, newOperands[0], newOperands[1],
+        ValueRange(newOffsets), newConstOffsetsAttr,
+        matrixOp.getSubgroupBlockIoAttr(), xegpu::DistributeLayoutAttr{});
+    rewriter.eraseOp(matrixOp);
     return success();
   }
 };
@@ -1551,16 +1615,15 @@ struct XeGPUSubgroupDistributePass final

 void xegpu::populateXeGPUSubgroupDistributePatterns(
     RewritePatternSet &patterns) {
-  patterns
-      .add<CreateNdDescDistribution, StoreNdDistribution, LoadNdDistribution,
-           DpasDistribution, PrefetchNdDistribution, GpuBarrierDistribution,
-           VectorMultiReductionDistribution, LoadDistribution,
-           StoreDistribution, VectorTransposeDistribution,
-           VectorBitcastDistribution, MatrixOpDistribution<xegpu::LoadMatrixOp>,
-           MatrixOpDistribution<xegpu::StoreMatrixOp>,
-           MemrefExtractAlignedPointerAsIndexDistribution>(
-          patterns.getContext(),
-          /*pattern benefit=*/regularPatternBenefit);
+  patterns.add<CreateNdDescDistribution, StoreNdDistribution,
+               LoadNdDistribution, DpasDistribution, PrefetchNdDistribution,
+               GpuBarrierDistribution, VectorMultiReductionDistribution,
+               LoadDistribution, StoreDistribution, VectorTransposeDistribution,
+               VectorBitcastDistribution, LoadMatrixDistribution,
+               StoreMatrixDistribution,
+               MemrefExtractAlignedPointerAsIndexDistribution>(
+      patterns.getContext(),
+      /*pattern benefit=*/regularPatternBenefit);
   patterns.add<VectorShapeCastDistribution>(
       patterns.getContext(),
       /*pattern benefit=*/highPatternBenefit);
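
Note on the helper introduced above: computeDistributedOffsetsForMatrixOp folds the per-lane offsets into the op's original mixed offsets via xegpu::addWithRightAligned. A simplified scalar model of that right-aligned addition (assumption: plain integers standing in for MLIR Values, with illustrative numbers):

#include <cstdint>
#include <cstdio>
#include <vector>

// Add the shorter offset vector into the trailing dimensions of the
// longer one, the way rank-2 per-lane offsets combine with higher-rank
// original offsets.
static std::vector<int64_t> addRightAligned(std::vector<int64_t> base,
                                            const std::vector<int64_t> &delta) {
  const size_t start = base.size() - delta.size();
  for (size_t i = 0; i < delta.size(); ++i)
    base[start + i] += delta[i];
  return base;
}

int main() {
  std::vector<int64_t> orig = {2, 0, 64}; // original rank-3 offsets
  std::vector<int64_t> lane = {8, 16};    // per-lane rank-2 offsets
  for (int64_t o : addRightAligned(orig, lane))
    std::printf("%lld ", (long long)o); // prints: 2 8 80
  std::printf("\n");
  return 0;
}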

mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp

Lines changed: 3 additions & 3 deletions
@@ -114,7 +114,7 @@ genOffsetsList(ConversionPatternRewriter &rewriter, OpType op,
   // Compute the list of subgroup-relative offsets for sub-tensors or sub-memory
   // descriptors to be accessed, based on the layout information.
   ArrayRef<int64_t> wgShape = op.getDataShape();
-  auto maybeDescOffsets = layout.computeDistributedCoords(
+  auto maybeDescOffsets = layout.computeDistributedOffsets(
       rewriter, loc, sgId, wgShape, xegpu::DistributionLevel::SG);
   if (failed(maybeDescOffsets))
     return failure();
@@ -831,7 +831,7 @@ struct WgToSgArithConstantOp : public OpConversionPattern<arith::ConstantOp> {
     // Get subgroup id
     Value sgId =
         gpu::SubgroupIdOp::create(rewriter, loc, /*upper_bound=*/nullptr);
-    auto sgOffsets = layout.computeDistributedCoords(
+    auto sgOffsets = layout.computeDistributedOffsets(
        rewriter, loc, sgId, wgShape, xegpu::DistributionLevel::SG);
     if (failed(sgOffsets))
       return failure();
@@ -1053,7 +1053,7 @@ struct WgToSgVectorStepOp : public OpConversionPattern<vector::StepOp> {

     Value sgId =
         gpu::SubgroupIdOp::create(rewriter, loc, /*upper_bound=*/nullptr);
-    auto sgOffsets = layout.computeDistributedCoords(
+    auto sgOffsets = layout.computeDistributedOffsets(
        rewriter, loc, sgId, wgShape, xegpu::DistributionLevel::SG);
     if (failed(sgOffsets))
       return failure();
