Skip to content

Commit c97bd76

Browse files
authored
Limit the minimum repeatCount value to make sure the number of elements of DPAS operand A is enough to be distributed to each logical lane. (#4787)
The number of elements of the matrix for dot operand A may be less than the sub-group size, e.g. for the case: repCount = 1, systolic depth = 8, ops per channel = 1 and threads_per_warp = 16. This is not supported by the IGC scalar backend. For that case we have to raise the minimum repCount to 2, with padding. Signed-off-by: Lu,Chengjun <[email protected]>
1 parent eb3bf93 commit c97bd76

File tree

4 files changed

+42
-1
lines changed

4 files changed

+42
-1
lines changed

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -746,6 +746,24 @@ LogicalResult DotOperandEncodingAttr::verify(
746746
return emitError() << "ttg.dot_op kWidth parameter must match the "
747747
"parent's opsPerChannel";
748748
}
749+
750+
unsigned repeatCount = parentAttr.getRepeatCount();
751+
unsigned systolicDepth = parentAttr.getSystolicDepth();
752+
unsigned threadsPerWarp = parentAttr.getThreadsPerWarp();
753+
// OpsPerChannel: 4 is for i8 type. 2 is for f16/bf16 type. 1 is for
754+
// float32 type. 2 i8 elements are packed into i16. The number of packed
755+
// elements per row for A operand is: 8, 16, 16.
756+
unsigned numPackedElemPerRowForA =
757+
opsPerChannel == 1 ? systolicDepth : systolicDepth * 2;
758+
if (repeatCount * numPackedElemPerRowForA < threadsPerWarp)
759+
return emitError()
760+
<< "The DPAS encoding implies an invalid layout for A "
761+
"operand. The non-uniform matrix A could not be "
762+
"referred in kernel with threadsPerWarp: "
763+
<< threadsPerWarp
764+
<< ". numPackedElemPerRowForA:" << numPackedElemPerRowForA
765+
<< ". RC:" << repeatCount << ", systolicDepth:" << systolicDepth
766+
<< ", opsPerChan:" << opsPerChannel;
749767
} else {
750768
// operand B
751769
if (kWidth != parentAttr.getOpsPerChannel())

test/TritonIntelGPU/tritonintelgpu-invalid.mlir

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,3 +214,14 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
214214
tt.return
215215
}
216216
}
217+
218+
// -----
219+
220+
#dpas = #ttig.dpas<{repeatCount = 1, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 16, warpsPerCTA = [2, 2], repCluster = [1, 1]}>
221+
// expected-error @below {{The DPAS encoding implies an invalid layout for A operand. The non-uniform matrix A could not be referred in kernel}}
222+
#dot_a = #ttg.dot_op<{opIdx=0, parent=#dpas, kWidth=1}>
223+
224+
// -----
225+
226+
// expected-error @below {{threadsPerWarp could not be smaller than the execution size}}
227+
#dpas = #ttig.dpas<{repeatCount = 1, systolicDepth = 8, executionSize = 16, opsPerChan = 1, threadsPerWarp = 8, warpsPerCTA = [2, 2], repCluster = [1, 1]}>

third_party/intel/lib/Dialect/TritonIntelGPU/IR/Dialect.cpp

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -291,7 +291,7 @@ LogicalResult DpasEncodingAttr::verify(
291291
::llvm::function_ref<::mlir::InFlightDiagnostic()> emitError,
292292
unsigned repeatCount, unsigned systolicDepth, unsigned executionSize,
293293
unsigned opsPerChan, ::llvm::ArrayRef<unsigned> warpsPerCTA,
294-
::llvm::ArrayRef<unsigned> repCluster, unsigned sugGroupSize) {
294+
::llvm::ArrayRef<unsigned> repCluster, unsigned subGroupSize) {
295295
if (repeatCount > 8 || repeatCount < 1) {
296296
return emitError() << "repeatCount must be in the range [1, 8], but was:"
297297
<< repeatCount;
@@ -311,6 +311,12 @@ LogicalResult DpasEncodingAttr::verify(
311311
<< repCluster.size();
312312
}
313313

314+
if (subGroupSize < executionSize) {
315+
return emitError() << "threadsPerWarp could not be smaller than the "
316+
"execution size. got subGroupSize:"
317+
<< subGroupSize << ", executionSize:" << executionSize;
318+
}
319+
314320
return success();
315321
}
316322

third_party/intel/lib/TritonIntelGPUTransforms/AccelerateMatmul.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,12 @@ class BlockedToDPAS : public OpRewritePattern<tt::DotOp> {
126126
unsigned repeatCount =
127127
std::min(dpasCap.repeatCount, (unsigned)retShape[rank - 2] /*M*/);
128128
unsigned threadsPerWarp = ttg::TritonGPUDialect::getThreadsPerWarp(mod);
129+
unsigned numElemsPerRowForA =
130+
opsPerChan == 1
131+
? dpasCap.systolicDepth
132+
: dpasCap.systolicDepth * 2; // A is packed to i16 or i32.
133+
unsigned minM = mlir::ceil<unsigned>(threadsPerWarp, numElemsPerRowForA);
134+
repeatCount = std::max(repeatCount, minM);
129135
auto dpasEnc = ttgi::DpasEncodingAttr::get(
130136
oldRetType.getContext(), repeatCount, dpasCap.systolicDepth,
131137
dpasCap.executionSize, opsPerChan, warpsPerTile, repCluster,

0 commit comments

Comments
 (0)