[MLIR][XeGPU] Add support for vector.multi_reduction in wg to sg pass [1/N] #157554
Conversation
@llvm/pr-subscribers-mlir-gpu @llvm/pr-subscribers-mlir

Author: Nishant Patel (nbpatel)

Changes: This PR adds a pattern for lowering vector.multi_reduction from workgroup to subgroup IR. It currently only supports simple reductions of the form [m, 1] -> [m] or [1, m] -> [m].
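For a concrete picture of the transformation, here is a sketch (shapes and layout attributes chosen to mirror the tests added below, not a fixed interface): a workgroup-level reduction over a unit dimension becomes the same reduction on each subgroup's local fragment.

```mlir
// Workgroup-level IR: a 32x1 grid of subgroups, each owning an 8x1 slice.
%r = vector.multi_reduction <add>, %src, %acc
  {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [8, 1]>, dims = [1]>} [1]
  : vector<256x1xf32> to vector<256xf32>

// Subgroup-level IR after the pass: the reduced dimension has extent 1,
// so each subgroup reduces its own 8x1 fragment and no cross-subgroup
// accumulation is needed.
%r_sg = vector.multi_reduction <add>, %src_sg, %acc_sg [1]
  : vector<8x1xf32> to vector<8xf32>
```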
Full diff: https://github.com/llvm/llvm-project/pull/157554.diff

2 Files Affected:
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
index 5d0f1d18402f2..fab2b8773a6b8 100644
--- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
+++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -757,8 +757,10 @@ struct WgToSgArithConstantOp : public OpConversionPattern<arith::ConstantOp> {
auto sgAttr = DenseElementsAttr::get(newType, singleVal);
auto cstOp =
arith::ConstantOp::create(rewriter, op.getLoc(), newType, sgAttr);
- if (auto newLayout = layout.dropSgLayoutAndData())
- xegpu::setDistributeLayoutAttr(cstOp->getResult(0), newLayout);
+ if (!layout.getLaneLayoutAsInt().empty() ||
+ !layout.getLaneDataAsInt().empty())
+ xegpu::setDistributeLayoutAttr(cstOp->getResult(0),
+ layout.dropSgLayoutAndData());
SmallVector<Value> newConsts(count, cstOp);
rewriter.replaceOpWithMultiple(op, {newConsts});
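This first hunk adjusts when the distributed constant keeps a layout attribute: the subgroup-level fields are always dropped, and a layout is re-attached only if lane-level fields remain. A rough before/after sketch (the attribute values here are illustrative, not taken from this patch):

```mlir
// Workgroup level: constant carrying both sg-level and lane-level fields.
%c = arith.constant {layout_result_0 = #xegpu.layout<
       sg_layout = [2, 2], sg_data = [16, 16],
       lane_layout = [1, 16], lane_data = [1, 1]>}
     dense<1.0> : vector<32x32xf32>

// Subgroup level: sg_layout/sg_data dropped, lane fields preserved.
%c_sg = arith.constant {layout_result_0 = #xegpu.layout<
          lane_layout = [1, 16], lane_data = [1, 1]>}
        dense<1.0> : vector<16x16xf32>
```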
@@ -919,6 +921,59 @@ struct WgToSgStoreMatrixOp : public OpConversionPattern<xegpu::StoreMatrixOp> {
}
};
+// Pattern for lowering vector.multi_reduction op to subgroup level.
+struct WgToSgMultiDimReductionOp
+ : public OpConversionPattern<vector::MultiDimReductionOp> {
+ using OpConversionPattern<vector::MultiDimReductionOp>::OpConversionPattern;
+
+ LogicalResult
+ matchAndRewrite(vector::MultiDimReductionOp op, OneToNOpAdaptor adaptor,
+ ConversionPatternRewriter &rewriter) const override {
+ VectorType srcType = dyn_cast<VectorType>(op.getSource().getType());
+ VectorType dstType = dyn_cast<VectorType>(op.getResult().getType());
+ if (!srcType || !dstType)
+ return failure();
+
+ // Only handle [m,1]->[m] or [1,m]->[m]
+ // TODO: generalize it
+ auto srcShape = srcType.getShape();
+ auto dstShape = dstType.getShape();
+ if (srcShape.size() != 2 || dstShape.size() != 1)
+ return failure();
+
+ if (!((srcShape[1] == 1 && srcShape[0] == dstShape[0]) ||
+ (srcShape[0] == 1 && srcShape[1] == dstShape[0])))
+ return failure();
+
+ xegpu::DistributeLayoutAttr layout =
+ xegpu::getDistributeLayoutAttr(op.getSource());
+ if (!layout || !layout.isForWorkgroup())
+ return failure();
+
+ SmallVector<int64_t> sgShape = getSgShapeAndCount(srcShape, layout).first;
+ VectorType newDstType;
+ if (op.getReductionDims() == ArrayRef<int64_t>({0}))
+ newDstType = VectorType::get({sgShape[1]}, dstType.getElementType());
+ else
+ newDstType = VectorType::get({sgShape[0]}, dstType.getElementType());
+
+ SmallVector<Value> newReductions;
+ for (auto [sgSrc, sgAcc] :
+ llvm::zip(adaptor.getSource(), adaptor.getAcc())) {
+ auto newOp = rewriter.create<vector::MultiDimReductionOp>(
+ op.getLoc(), newDstType, op.getKind(), sgSrc, sgAcc,
+ op.getReductionDims());
+ if (!layout.getLaneLayoutAsInt().empty() ||
+ !layout.getLaneDataAsInt().empty())
+ xegpu::setDistributeLayoutAttr(newOp->getResult(0),
+ layout.dropSgLayoutAndData());
+ newReductions.push_back(newOp.getResult());
+ }
+ rewriter.replaceOpWithMultiple(op, {newReductions});
+ return success();
+ }
+};
+
} // namespace
namespace mlir {
@@ -932,7 +987,8 @@ void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns) {
WgToSgElementwiseOp, WgToSgVectorBroadcastOp, WgToSgConvertLayoutOp,
WgToSgArithConstantOp, WgToSgLoadGatherOpWithOffset,
WgToSgStoreScatterOpWithOffset, WgToSgLoadMatrixOp,
- WgToSgStoreMatrixOp>(patterns.getContext());
+ WgToSgStoreMatrixOp, WgToSgMultiDimReductionOp>(
+ patterns.getContext());
}
} // namespace xegpu
} // namespace mlir
@@ -1077,6 +1133,11 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
return isLegal(xegpu::getDistributeLayoutAttr(op.getResult()));
});
+ target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
+ [=](vector::MultiDimReductionOp op) -> bool {
+ return isLegal(xegpu::getDistributeLayoutAttr(op.getResult()));
+ });
+
target.addDynamicallyLegalOp<xegpu::ConvertLayoutOp>(
[=](xegpu::ConvertLayoutOp op) -> bool {
return isLegal(op.getInputLayout()) && isLegal(op.getTargetLayout());
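Note the shape guard in WgToSgMultiDimReductionOp: only reductions that fold a unit dimension are matched, because each subgroup can then reduce its fragment independently. A hypothetical case like the following folds a non-unit dimension, would require combining partial results across subgroups, and is left unmatched for a later patch in the series:

```mlir
// Not matched: srcShape[1] == 128, so reducing dim 1 requires
// cross-subgroup accumulation (see the TODO in the pattern).
%r = vector.multi_reduction <add>, %src, %acc [1]
  : vector<256x128xf32> to vector<256xf32>
```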
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
index afb2bf876c18f..47e6f4cfd6d08 100644
--- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
+++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
@@ -365,4 +365,32 @@ gpu.module @test_distribution {
xegpu.store_matrix %cst, %mdesc[0, 0] {layout = #xegpu.layout<sg_layout = [2, 4], sg_data = [32, 32]>} : vector<64x128xf32>, !xegpu.mem_desc<64x128xf32>
gpu.return
}
+
+ // CHECK-LABEL: @vector_reduce_dim_0
+ gpu.func @vector_reduce_dim_0(%src: memref<1x128xf32>) {
+ %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32], sg_data = [1, 4]>, dims = [0]>} dense<1.0> : vector<128xf32>
+ %tdesc = xegpu.create_nd_tdesc %src : memref<1x128xf32>
+ -> !xegpu.tensor_desc<1x128xf32, #xegpu.layout<sg_layout = [1, 32], sg_data = [1, 4]>>
+ %load = xegpu.load_nd %tdesc[0, 0]
+ : !xegpu.tensor_desc<1x128xf32, #xegpu.layout<sg_layout = [1, 32], sg_data = [1, 4]>>
+ -> vector<1x128xf32>
+ // CHECK: vector.multi_reduction <add>, {{.*}}, {{.*}} [0] : vector<1x4xf32> to vector<4xf32>
+ %reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32], sg_data = [1, 4]>, dims = [0]>} [0]
+ : vector<1x128xf32> to vector<128xf32>
+ gpu.return
+ }
+
+ // CHECK-LABEL: @vector_reduce_dim_1
+ gpu.func @vector_reduce_dim_1(%src: memref<256x1xf32>) {
+ %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [8, 1]>, dims = [1]>} dense<1.0> : vector<256xf32>
+ %tdesc = xegpu.create_nd_tdesc %src : memref<256x1xf32>
+ -> !xegpu.tensor_desc<256x1xf32, #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 1]>>
+ %load = xegpu.load_nd %tdesc[0, 0]
+ : !xegpu.tensor_desc<256x1xf32, #xegpu.layout<sg_layout = [32, 1], sg_data = [8, 1]>>
+ -> vector<256x1xf32>
+ // CHECK: vector.multi_reduction <add>, {{.*}}, {{.*}} [1] : vector<8x1xf32> to vector<8xf32>
+ %reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [32, 1], sg_data = [8, 1]>, dims = [1]>} [1]
+ : vector<256x1xf32> to vector<256xf32>
+ gpu.return
+ }
}
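To exercise the new tests locally, the file's RUN line follows the usual pattern; the pass flag below assumes the pass's standard registration name, so check the file header if it differs:

```mlir
// RUN: mlir-opt -xegpu-wg-to-sg-distribute %s | FileCheck %s
```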
Please modify the PR title so we know it is for the simplest case. This could be, e.g., a [1/N] tag or an explicit mention of the supported case.
Addressed feedback. Please take a look.
Addressed feedback.
LGTM
Force-pushed from a402217 to 4217fd6.
LGTM
… [1/N] (llvm#157554): This PR adds a pattern for lowering vector.multi_reduction from workgroup to subgroup IR. It currently only supports sg-local reductions.