Address feedback

nbpatel · nbpatel · commit 9be2284ec793 · 2025-09-16T19:11:52.000Z
diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
@@ -1027,22 +1027,28 @@ struct WgToSgVectorShapeCastOp
   }
 };
 
-// Pattern for lowering vector.multi_reduction op to subgroup level.
+/// Pattern for lowering a vector.multi_reduction op to the subgroup level.
+/// Current limitation: only 2D->1D reductions with a single reduction
+/// dimension are supported, and the sg_layout in the reduced dimension must
+/// be 1, so that the reduction is local to each subgroup and no
+/// cross-subgroup communication is needed.
+/// TODO: Add cases to handle more general situations which require SLM access.
 struct WgToSgMultiDimReductionOp
     : public OpConversionPattern<vector::MultiDimReductionOp> {
   using OpConversionPattern<vector::MultiDimReductionOp>::OpConversionPattern;
 
   LogicalResult
   matchAndRewrite(vector::MultiDimReductionOp op, OneToNOpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
-    VectorType srcType = dyn_cast<VectorType>(op.getSource().getType());
+    VectorType srcType = op.getSourceVectorType();
     VectorType dstType = dyn_cast<VectorType>(op.getResult().getType());
-    if (!srcType || !dstType)
+    if (!dstType)
       return failure();
 
-    // TODO: generalize it
-    auto srcShape = srcType.getShape();
-    auto dstShape = dstType.getShape();
+    SmallVector<int64_t> srcShape(srcType.getShape().begin(),
+                                  srcType.getShape().end());
+    SmallVector<int64_t> dstShape(dstType.getShape().begin(),
+                                  dstType.getShape().end());
     if (srcShape.size() != 2 || dstShape.size() != 1)
       return failure();
 
@@ -1051,7 +1057,8 @@ struct WgToSgMultiDimReductionOp
     if (!layout || !layout.isForWorkgroup())
       return failure();
 
-    auto reductionDims = op.getReductionDims();
+    SmallVector<int64_t> reductionDims(op.getReductionDims().begin(),
+                                       op.getReductionDims().end());
     if (reductionDims.size() != 1)
       return failure();
 
@@ -1069,8 +1076,8 @@ struct WgToSgMultiDimReductionOp
     SmallVector<Value> newReductions;
     for (auto [sgSrc, sgAcc] :
          llvm::zip(adaptor.getSource(), adaptor.getAcc())) {
-      auto newOp = rewriter.create<vector::MultiDimReductionOp>(
-          op.getLoc(), newDstType, op.getKind(), sgSrc, sgAcc,
+      auto newOp = vector::MultiDimReductionOp::create(
+          rewriter, op.getLoc(), newDstType, op.getKind(), sgSrc, sgAcc,
           op.getReductionDims());
       if (!layout.getEffectiveLaneLayoutAsInt().empty() ||
           !layout.getEffectiveInstDataAsInt().empty())
diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
@@ -82,4 +82,19 @@ gpu.module @test_distribution {
       : vector<256x128xf16>, vector<128x256xf16> -> vector<256x256xf32>
     gpu.return
   }
+
+  // CHECK-LABEL: vector_reduce_dim_1
+  gpu.func @vector_reduce_dim_1(%src: memref<256x64xf32>) {
+    %cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>, dims = [1]>} dense<1.0> : vector<256xf32>
+    %tdesc = xegpu.create_nd_tdesc %src : memref<256x64xf32>
+      -> !xegpu.tensor_desc<256x64xf32, #xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>>
+    %load =  xegpu.load_nd %tdesc[0, 0]
+      : !xegpu.tensor_desc<256x64xf32, #xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>>
+      -> vector<256x64xf32>
+    // CHECK-COUNT-2: vector.multi_reduction <add>, {{.*}}, {{.*}} [1] : vector<16x64xf32> to vector<16xf32>
+    // CHECK-NOT: vector.multi_reduction
+    %reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>, dims = [1]>} [1]
+      : vector<256x64xf32> to vector<256xf32>
+    gpu.return
+  }
 }