Commit cc14d65

Authored by Ilya Enkovich
Fix sub-group size used for shuffles. (#4490)
This patch fixes the sub-group size used for shuffles in layout conversion lowering. The shuffle width must always match the number of threads per warp set for the module; otherwise, lowering the shuffle op to the SPIR-V dialect fails.

The patch also adds a guard pattern to check that our layout conversion pattern succeeded in the cases where that is critical. Membar analysis assumes these patterns succeed and optimizes out some barriers accordingly; if the lowering fails and falls back to the generic pattern, the missing barriers can cause incorrect behavior. The guard pattern reveals such a problem in a few variants of `test_reduce_layout` and will help prevent regressions here in the future.

Fixes #4481

Signed-off-by: Ilya Enkovich <[email protected]>
1 parent 74645a2 commit cc14d65
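
The essence of the change is the split between the number of values to shuffle and the shuffle width. The annotated excerpt below is a simplified sketch drawn from the patch itself (the enclosing pattern class and helper declarations are omitted); `op`, `conversion`, `kLane`, `inVals`, and the other locals come from the existing conversion code.

    // Number of values to exchange, still capped by the tensor element count as
    // a workaround for the OutDimSize computation issue in LinearLayout.
    int32_t numElems = std::min((int32_t)op.getType().getNumElements(),
                                conversion.getOutDimSize(kLane));
    // Shuffle width: taken from the module's threads-per-warp setting, which the
    // gpu.shuffle lowering requires; previously the capped value above was used.
    int32_t subGroupSize = triton::gpu::TritonGPUDialect::getThreadsPerWarp(
        op->getParentOfType<ModuleOp>());
    // The helper now receives both values: `numElems` bounds the shuffle loops,
    // while `subGroupSize` determines the shuffle width.
    SmallVector<Value> outVals = performSubGroupShuffle(
        loc, inVals, numElems, subGroupSize, rewriter,
        getNumContiguousRowsForShuffle(srcLayout, dstLayout));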

File tree

1 file changed: 30 additions, 5 deletions

third_party/intel/lib/TritonIntelGPUToLLVM/ConvertLayoutOpToLLVM.cpp

Lines changed: 30 additions & 5 deletions
@@ -92,8 +92,10 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
     // ReduceOp and ConvertLayoutOp, which prevents a segmentation fault.
     // However, this is a temporary solution. Once the OutDimSize computation
     // issue in LinearLayout is resolved, this workaround should be removed.
-    int32_t subGroupSize = std::min((int32_t)op.getType().getNumElements(),
-                                    conversion.getOutDimSize(kLane));
+    int32_t numElems = std::min((int32_t)op.getType().getNumElements(),
+                                conversion.getOutDimSize(kLane));
+    int32_t subGroupSize = triton::gpu::TritonGPUDialect::getThreadsPerWarp(
+        op->getParentOfType<ModuleOp>());
     if (!op->hasAttr("allocation.offset")) {
       op->setAttr("allocation.offset",
                   rewriter.getIntegerAttr(rewriter.getI32Type(), 0));
@@ -130,7 +132,7 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
     });

     SmallVector<Value> outVals = performSubGroupShuffle(
-        loc, inVals, subGroupSize, rewriter,
+        loc, inVals, numElems, subGroupSize, rewriter,
         getNumContiguousRowsForShuffle(srcLayout, dstLayout));

     // TODO: Drop 'BFloat16Type' and 'IntegerType' cases when supported at MLIR
@@ -163,6 +165,7 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion

   SmallVector<Value> performSubGroupShuffle(Location loc,
                                             ArrayRef<Value> inVals,
+                                            int32_t numElems,
                                             int32_t subGroupSize,
                                             ConversionPatternRewriter &rewriter,
                                             int numContiguousRows) const {
@@ -176,7 +179,7 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
       // matrix: Output element `i` will take the `i / 16`th value from the `i %
       // 16`th thread.
       for (Value val : inVals) {
-        for (int32_t i = 0; i < subGroupSize; ++i) {
+        for (int32_t i = 0; i < numElems; ++i) {
           res.push_back(
               rewriter
                   .create<mlir::gpu::ShuffleOp>(loc, val, b.i32_val(i), width,
@@ -188,7 +191,7 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
       // 2. Elements held by a work-item are contiguous rows in the abstract
       // slice matrix: Output element `i` will take the `i % 16`th value from
       // the `i / 16`th thread.
-      for (int32_t i = 0; i < subGroupSize; ++i) {
+      for (int32_t i = 0; i < numElems; ++i) {
         for (Value val : inVals) {
           res.push_back(
               rewriter
@@ -399,6 +402,22 @@ struct ConvertLayoutOpUsingLinearLayoutsConversion
   }
 };

+struct ConvertLayoutOpGuard : public ConvertOpToLLVMPattern<ConvertLayoutOp> {
+  using ConvertOpToLLVMPattern<ConvertLayoutOp>::ConvertOpToLLVMPattern;
+
+  LogicalResult
+  matchAndRewrite(ConvertLayoutOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    auto srcTy = op.getSrc().getType();
+    auto dstTy = op.getType();
+    assert(!intel::cvtIsSubGroupShuffle(srcTy, dstTy) &&
+           "Failed to lower layout conversion through sub-group shuffles");
+    assert(!intel::cvtIsSubGroupTranspose(srcTy, dstTy) &&
+           "Failed to lower layout conversion through sub-group transpose");
+    return failure();
+  }
+};
+
 } // namespace

 } // namespace mlir::triton::gpu
@@ -410,6 +429,12 @@ void mlir::triton::intel::populateConvertLayoutOpToLLVMPatterns(
   // higher benefit.
   patterns.add<gpu::ConvertLayoutOpUsingLinearLayoutsConversion>(
       typeConverter, targetInfo, benefit.getBenefit() + 2);
+  // This guard is to make sure we don't fall back to the generic patterns
+  // for some specific cases. We check that we've lowered all those cases
+  // for which the default allocation analysis for scratch buffer size was
+  // not used. Otherwise, SLM corruption might occur.
+  patterns.add<gpu::ConvertLayoutOpGuard>(typeConverter,
+                                          benefit.getBenefit() + 1);
   mlir::triton::populateConvertLayoutOpToLLVMPatterns(typeConverter, targetInfo,
                                                       patterns, benefit);
 }
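
Note on the guard pattern: both checks are plain `assert`s, so they fire only in assertion-enabled builds; in release builds `ConvertLayoutOpGuard::matchAndRewrite` simply returns `failure()` and conversion falls through to the lower-benefit generic patterns as before. Registering the guard at `benefit.getBenefit() + 1` places it between the specialized linear-layout pattern (`+ 2`) and the generic lowering, so it only sees the ops the specialized pattern declined to handle.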
