
Commit d187943

Merge commit '86a2ac753befe5286a261ba3b64eb40bdcca5704'
2 parents: 75dcede + 86a2ac7

File tree: 11 files changed, +726 -314 lines

lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 14 additions & 10 deletions
@@ -52,21 +52,25 @@ LogicalResult UpcastMXFPOp::verify() {
         "all dimensions except the last must match between operands");
   }
 
-  auto layoutX = xTy.getEncoding();
-  if (!layoutX || !isa<DotOperandEncodingAttr>(layoutX)) {
+  auto dotEncoding =
+      dyn_cast_or_null<DotOperandEncodingAttr>(xTy.getEncoding());
+  if (!dotEncoding) {
     return emitOpError("Expected a DotOperandEncodingAttr for values");
   }
-  auto layoutScale = scaleTy.getEncoding();
-  if (!layoutScale || !isa<BlockedEncodingAttr>(layoutScale)) {
+
+  auto blockedScale =
+      dyn_cast_or_null<BlockedEncodingAttr>(scaleTy.getEncoding());
+  if (!blockedScale) {
     return emitOpError("Expected a BlockOperandEncoding for scales");
   }
-  auto blockedScale = cast<BlockedEncodingAttr>(layoutScale);
 
-  // Necessary to keep all of the scales of a given block of values in the same
-  // warp
-  auto threadsPerWarp = blockedScale.getThreadsPerWarp();
-  if (threadsPerWarp != ArrayRef<unsigned>({16, 2})) {
-    return emitOpError("Expected threads per warp to be {16, 2}");
+  if (isa<NvidiaMmaEncodingAttr>(dotEncoding.getParent())) {
+    // Necessary to keep all of the scales of a given block of values in the
+    // same warp
+    auto threadsPerWarp = blockedScale.getThreadsPerWarp();
+    if (threadsPerWarp != ArrayRef<unsigned>({16, 2})) {
+      return emitOpError("Expected threads per warp to be {16, 2}");
+    }
   }
 
   return success();
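
The net effect of this hunk is that the {16, 2} threads-per-warp requirement on the scale layout is now enforced only when the dot operand's parent encoding is an NVIDIA MMA encoding; any other parent encoding skips that check. Below is a minimal sketch restating the post-patch logic as a standalone helper, using only the MLIR casting and diagnostic APIs that already appear in the diff; the helper name, free-function form, and header path are illustrative assumptions, not part of the commit.

// Sketch only: condensed restatement of the scale-layout check after this
// patch. The real check lives inside UpcastMXFPOp::verify(); this helper
// and its signature are hypothetical.
#include "mlir/IR/BuiltinTypes.h"                 // RankedTensorType
#include "triton/Dialect/TritonGPU/IR/Dialect.h"  // TritonGPU encodings (path assumed)

using namespace mlir;
using namespace mlir::triton::gpu;

static LogicalResult
checkScaleLayout(RankedTensorType xTy, RankedTensorType scaleTy,
                 llvm::function_ref<InFlightDiagnostic()> emitError) {
  // Values must carry a dot-operand encoding.
  auto dotEncoding =
      dyn_cast_or_null<DotOperandEncodingAttr>(xTy.getEncoding());
  if (!dotEncoding)
    return emitError() << "Expected a DotOperandEncodingAttr for values";

  // Scales must carry a blocked encoding.
  auto blockedScale =
      dyn_cast_or_null<BlockedEncodingAttr>(scaleTy.getEncoding());
  if (!blockedScale)
    return emitError() << "Expected a BlockOperandEncoding for scales";

  // The {16, 2} threads-per-warp constraint is now gated on the parent
  // encoding: it only applies when the dot operand feeds an NVIDIA MMA.
  if (isa<NvidiaMmaEncodingAttr>(dotEncoding.getParent())) {
    if (blockedScale.getThreadsPerWarp() != ArrayRef<unsigned>({16, 2}))
      return emitError() << "Expected threads per warp to be {16, 2}";
  }
  return success();
}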

lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp

Lines changed: 0 additions & 31 deletions
@@ -450,37 +450,6 @@ assignMemoryLayouts(llvm::SmallVector<std::tuple<Operation *, int, Operation *>>
         // If we can't agree on a shared encoding skip pipelinig the load.
         if (incompatible)
           continue;
-
-        // HACK: Triton LLVM codegen has a bug where local_loads from #shared to
-        // #mma layout can lead to invalid code if the loaded shape is smaller
-        // than the mma tile (e.g. loading a 128x1 tensor for an MMAv2 dot with
-        // tile {16,8} is bad because 1 < 8). To work around this, don't
-        // pipeline such loads.
-        //
-        // The codegen bug is caught by an assertion, so if you think you've
-        // fixed it, feel free to delete this code and see if the assert still
-        // fails. :)
-        if (!loadInfo.sharedEncoding) {
-          if (auto dotEnc = dyn_cast<ttg::NvidiaMmaEncodingAttr>(
-                  dot.getResult().getType().getEncoding())) {
-            auto loadTy = cast<RankedTensorType>(op->getResultTypes()[0]);
-            auto mmaInstrShape = dotEnc.getInstrShape();
-            if (loadTy.getRank() < mmaInstrShape.size())
-              continue;
-            bool ok = true;
-            for (int i = 0; i < mmaInstrShape.size(); i++) {
-              if (loadTy.getShape()[loadTy.getRank() - mmaInstrShape.size() +
-                                    i] < mmaInstrShape[i]) {
-                ok = false;
-                break;
-              }
-            }
-            // If this load might trigger the bug, don't do the fallback logic
-            // below, which might allow the load to be pipelined.
-            if (!ok)
-              continue;
-          }
-        }
       }
     } else if (auto loadOp = dyn_cast<tt::LoadOp>(use)) {
       // The use of this loadOp is another loadOp. If the use is not in the
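
For context, the deleted workaround refused to pipeline a load whose trailing dimensions were smaller than the MMA instruction tile of the consuming dot (the 128x1 vs. {16, 8} example in the removed comment). The sketch below pulls that shape test out as a hypothetical helper, using only the quantities visible in the removed code; the function name and signature are invented for illustration.

// Sketch only: the shape test the removed workaround performed, expressed as
// a standalone predicate. Returns true when every trailing dimension of the
// loaded tensor is at least as large as the corresponding MMA instruction
// tile dimension, i.e. when the load would not have hit the old codegen
// assertion.
#include "llvm/ADT/ArrayRef.h"
#include <cstdint>

static bool loadCoversMmaTile(llvm::ArrayRef<int64_t> loadShape,
                              llvm::ArrayRef<unsigned> mmaInstrShape) {
  // A load with lower rank than the instruction shape was also skipped by
  // the old code; treat it as not covering the tile.
  if (loadShape.size() < mmaInstrShape.size())
    return false;
  size_t offset = loadShape.size() - mmaInstrShape.size();
  for (size_t i = 0; i < mmaInstrShape.size(); ++i)
    if (loadShape[offset + i] < static_cast<int64_t>(mmaInstrShape[i]))
      return false;  // e.g. a 128x1 load vs. a {16, 8} MMAv2 tile (1 < 8)
  return true;
}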
