Skip to content
72 changes: 70 additions & 2 deletions mlir/lib/Dialect/XeGPU/Transforms/XeGPUWgToSgDistribute.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1027,6 +1027,69 @@ struct WgToSgVectorShapeCastOp
}
};

/// Pattern for lowering a workgroup-level vector.multi_reduction op to
/// subgroup level.
///
/// Current limitations:
///  * only 2D -> 1D reductions with a single reduction dimension are handled;
///  * the result layout must be an xegpu::SliceAttr, and the sg_layout of its
///    parent must be 1 along the reduced dimension, so that the reduction is
///    local to a subgroup and no cross-subgroup communication is needed.
///
/// The pattern returns failure() (leaving the op untouched) whenever one of
/// these preconditions does not hold.
/// TODO: Add cases to handle more general situations which require SLM access.
struct WgToSgMultiDimReductionOp
    : public OpConversionPattern<vector::MultiDimReductionOp> {
  using OpConversionPattern<vector::MultiDimReductionOp>::OpConversionPattern;

  LogicalResult
  matchAndRewrite(vector::MultiDimReductionOp op, OneToNOpAdaptor adaptor,
                  ConversionPatternRewriter &rewriter) const override {
    VectorType srcType = op.getSourceVectorType();
    // A non-vector (scalar) result is not handled by this pattern.
    auto dstType = dyn_cast<VectorType>(op.getResult().getType());
    if (!dstType)
      return failure();

    ArrayRef<int64_t> dstShape = dstType.getShape();
    if (srcType.getRank() != 2 || dstShape.size() != 1)
      return failure();

    xegpu::DistributeLayoutAttr layout =
        xegpu::getDistributeLayoutAttr(op.getResult());
    if (!layout || !layout.isForWorkgroup())
      return failure();

    ArrayRef<int64_t> reductionDims = op.getReductionDims();
    if (reductionDims.size() != 1)
      return failure();

    // The sg_layout along the reduced dimension is read from the parent of the
    // result's slice layout. Bail out gracefully (instead of asserting inside
    // cast<>) when the result layout is not a slice.
    auto sliceLayout = dyn_cast<xegpu::SliceAttr>(layout);
    if (!sliceLayout)
      return failure();
    SmallVector<int64_t> sgLayout =
        sliceLayout.getParent().getEffectiveSgLayoutAsInt();

    // Check that the sgLayout in the reduced dimension is 1; also guard the
    // index so a parent layout of unexpected rank cannot be read out of
    // bounds.
    int64_t reductionDim = reductionDims.front();
    if (reductionDim < 0 ||
        reductionDim >= static_cast<int64_t>(sgLayout.size()) ||
        sgLayout[reductionDim] != 1)
      return failure();

    // `layout` describes the 1-D result, so the per-subgroup result shape is
    // derived from the result shape rather than from the 2-D source shape.
    SmallVector<int64_t> sgShape = getSgShapeAndCount(dstShape, layout).first;
    VectorType newDstType = VectorType::get(sgShape, dstType.getElementType());

    // Only re-attach a layout to the new ops if lane-level or inst_data
    // information remains after dropping the subgroup-level fields.
    bool keepLayout = !layout.getEffectiveLaneLayoutAsInt().empty() ||
                      !layout.getEffectiveInstDataAsInt().empty();

    // Create one subgroup-level reduction per (source, accumulator) pair
    // provided by the 1:N adaptor.
    SmallVector<Value> newReductions;
    for (auto [sgSrc, sgAcc] :
         llvm::zip(adaptor.getSource(), adaptor.getAcc())) {
      auto newOp = vector::MultiDimReductionOp::create(
          rewriter, op.getLoc(), newDstType, op.getKind(), sgSrc, sgAcc,
          op.getReductionDims());
      if (keepLayout)
        xegpu::setDistributeLayoutAttr(newOp->getResult(0),
                                       layout.dropSgLayoutAndData());
      newReductions.push_back(newOp.getResult());
    }
    rewriter.replaceOpWithMultiple(op, {newReductions});
    return success();
  }
};

} // namespace

namespace mlir {
Expand All @@ -1040,8 +1103,8 @@ void populateXeGPUWgToSgDistributePatterns(RewritePatternSet &patterns) {
WgToSgElementwiseOp, WgToSgVectorBroadcastOp, WgToSgConvertLayoutOp,
WgToSgArithConstantOp, WgToSgLoadGatherOpWithOffset,
WgToSgStoreScatterOpWithOffset, WgToSgLoadMatrixOp,
WgToSgStoreMatrixOp, WgToSgVectorStepOp, WgToSgVectorShapeCastOp>(
patterns.getContext());
WgToSgStoreMatrixOp, WgToSgVectorStepOp, WgToSgVectorShapeCastOp,
WgToSgMultiDimReductionOp>(patterns.getContext());
}
} // namespace xegpu
} // namespace mlir
Expand Down Expand Up @@ -1195,6 +1258,11 @@ void XeGPUWgToSgDistributePass::runOnOperation() {
return isLegal(xegpu::getDistributeLayoutAttr(op.getResult()));
});

// vector.multi_reduction is legal only when isLegal() accepts the layout
// attribute attached to its result; otherwise it has to be converted.
target.addDynamicallyLegalOp<vector::MultiDimReductionOp>(
[=](vector::MultiDimReductionOp op) -> bool {
return isLegal(xegpu::getDistributeLayoutAttr(op.getResult()));
});

target.addDynamicallyLegalOp<xegpu::ConvertLayoutOp>(
[=](xegpu::ConvertLayoutOp op) -> bool {
return isLegal(op.getInputLayout()) && isLegal(op.getTargetLayout());
Expand Down
16 changes: 16 additions & 0 deletions mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops-rr.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -82,4 +82,20 @@ gpu.module @test_distribution {
: vector<256x128xf16>, vector<128x256xf16> -> vector<256x256xf32>
gpu.return
}

// Reduces a 256x64 workgroup vector along dim 1, where sg_layout is 1
// (sg_layout = [8, 1], sg_data = [16, 64]). 8 subgroups x 16 rows cover 128 of
// the 256 rows, so exactly two vector<16x64xf32> -> vector<16xf32> reductions
// are expected, both using the distributed accumulator constant.
// CHECK-LABEL: vector_reduce_dim_1
gpu.func @vector_reduce_dim_1(%src: memref<256x64xf32>) {
// CHECK: %[[CST:.*]] = arith.constant dense<1.000000e+00> : vector<16xf32>
%cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>, dims = [1]>} dense<1.0> : vector<256xf32>
%tdesc = xegpu.create_nd_tdesc %src : memref<256x64xf32>
-> !xegpu.tensor_desc<256x64xf32, #xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>>
%load = xegpu.load_nd %tdesc[0, 0]
: !xegpu.tensor_desc<256x64xf32, #xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>>
-> vector<256x64xf32>
// CHECK-COUNT-2: vector.multi_reduction <add>, {{.*}}, %[[CST]] [1] : vector<16x64xf32> to vector<16xf32>
// CHECK-NOT: vector.multi_reduction
%reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [8, 1], sg_data = [16, 64]>, dims = [1]>} [1]
: vector<256x64xf32> to vector<256xf32>
gpu.return
}
}
28 changes: 28 additions & 0 deletions mlir/test/Dialect/XeGPU/xegpu-wg-to-sg-unify-ops.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -367,6 +367,34 @@ gpu.module @test_distribution {
gpu.return
}

// Reduces a 4x128 workgroup vector along dim 0, where sg_layout is 1
// (sg_layout = [1, 32], sg_data = [4, 4]). The subgroup-level op is expected
// to reduce vector<4x4xf32> to vector<4xf32>.
// CHECK-LABEL: @vector_reduce_dim_0
gpu.func @vector_reduce_dim_0(%src: memref<4x128xf32>) {
%cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>, dims = [0]>} dense<1.0> : vector<128xf32>
%tdesc = xegpu.create_nd_tdesc %src : memref<4x128xf32>
-> !xegpu.tensor_desc<4x128xf32, #xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>>
%load = xegpu.load_nd %tdesc[0, 0]
: !xegpu.tensor_desc<4x128xf32, #xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>>
-> vector<4x128xf32>
// CHECK: vector.multi_reduction <add>, {{.*}}, {{.*}} [0] : vector<4x4xf32> to vector<4xf32>
%reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [1, 32], sg_data = [4, 4]>, dims = [0]>} [0]
: vector<4x128xf32> to vector<128xf32>
gpu.return
}

// Reduces a 256x64 workgroup vector along dim 1, where sg_layout is 1
// (sg_layout = [16, 1], sg_data = [16, 64]). The subgroup-level op is expected
// to reduce vector<16x64xf32> to vector<16xf32>.
// CHECK-LABEL: @vector_reduce_dim_1
gpu.func @vector_reduce_dim_1(%src: memref<256x64xf32>) {
%cst = arith.constant {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>, dims = [1]>} dense<1.0> : vector<256xf32>
%tdesc = xegpu.create_nd_tdesc %src : memref<256x64xf32>
-> !xegpu.tensor_desc<256x64xf32, #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>>
%load = xegpu.load_nd %tdesc[0, 0]
: !xegpu.tensor_desc<256x64xf32, #xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>>
-> vector<256x64xf32>
// CHECK: vector.multi_reduction <add>, {{.*}}, {{.*}} [1] : vector<16x64xf32> to vector<16xf32>
%reduce = vector.multi_reduction <add>, %load, %cst {layout_result_0 = #xegpu.slice<#xegpu.layout<sg_layout = [16, 1], sg_data = [16, 64]>, dims = [1]>} [1]
: vector<256x64xf32> to vector<256xf32>
gpu.return
}

// CHECK-LABEL: vector_step_op
gpu.func @vector_step_op_slice_attr() {
//CHECK: [[sgId:%.+]] = gpu.subgroup_id : index
Expand Down