
Commit 9e72047

This change is causing failures in some internal tests. There must still be some miscompile associated with this.
1 parent ca70f08

File tree

2 files changed (+32, -2 lines)

lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp

Lines changed: 31 additions & 0 deletions
@@ -450,6 +450,37 @@ assignMemoryLayouts(llvm::SmallVector<std::tuple<Operation *, int, Operation *>>
         // If we can't agree on a shared encoding skip pipelinig the load.
         if (incompatible)
           continue;
+
+        // HACK: Triton LLVM codegen has a bug where local_loads from #shared to
+        // #mma layout can lead to invalid code if the loaded shape is smaller
+        // than the mma tile (e.g. loading a 128x1 tensor for an MMAv2 dot with
+        // tile {16,8} is bad because 1 < 8). To work around this, don't
+        // pipeline such loads.
+        //
+        // The codegen bug is caught by an assertion, so if you think you've
+        // fixed it, feel free to delete this code and see if the assert still
+        // fails. :)
+        if (!loadInfo.sharedEncoding) {
+          if (auto dotEnc = dyn_cast<ttg::NvidiaMmaEncodingAttr>(
+                  dot.getResult().getType().getEncoding())) {
+            auto loadTy = cast<RankedTensorType>(op->getResultTypes()[0]);
+            auto mmaInstrShape = dotEnc.getInstrShape();
+            if (loadTy.getRank() < mmaInstrShape.size())
+              continue;
+            bool ok = true;
+            for (int i = 0; i < mmaInstrShape.size(); i++) {
+              if (loadTy.getShape()[loadTy.getRank() - mmaInstrShape.size() +
+                                    i] < mmaInstrShape[i]) {
+                ok = false;
+                break;
+              }
+            }
+            // If this load might trigger the bug, don't do the fallback logic
+            // below, which might allow the load to be pipelined.
+            if (!ok)
+              continue;
+          }
+        }
       }
     } else if (auto loadOp = dyn_cast<tt::LoadOp>(use)) {
       // The use of this loadOp is another loadOp. If the use is not in the
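
In isolation, the check this commit restores is: a load feeding an #mma-layout dot is only treated as safe to pipeline if every trailing dimension of the loaded shape is at least as large as the corresponding dimension of the MMA instruction tile. The standalone C++ sketch below illustrates that predicate on the 128x1 case from the comment; it is not the Triton API, and the helper name coversMmaTile is invented for illustration.

#include <cstdint>
#include <cstdio>
#include <vector>

// Illustrative sketch only (not Triton code): true when each trailing
// dimension of the load shape covers the matching MMA instruction-tile
// dimension, i.e. the case the pipeliner treats as safe.
static bool coversMmaTile(const std::vector<int64_t> &loadShape,
                          const std::vector<int64_t> &mmaInstrShape) {
  if (loadShape.size() < mmaInstrShape.size())
    return false; // mirrors the early `continue` on rank mismatch
  size_t offset = loadShape.size() - mmaInstrShape.size();
  for (size_t i = 0; i < mmaInstrShape.size(); ++i)
    if (loadShape[offset + i] < mmaInstrShape[i])
      return false; // e.g. 1 < 8 for a 128x1 load against a {16,8} tile
  return true;
}

int main() {
  // The problematic case from the comment: 128x1 load, MMAv2 tile {16,8}.
  std::printf("128x1  vs {16,8}: %s\n",
              coversMmaTile({128, 1}, {16, 8}) ? "may pipeline" : "skip");
  // A shape that covers the tile stays eligible for pipelining.
  std::printf("128x64 vs {16,8}: %s\n",
              coversMmaTile({128, 64}, {16, 8}) ? "may pipeline" : "skip");
}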

test/TritonGPU/loop-pipeline.mlir

Lines changed: 1 addition & 2 deletions
@@ -1453,8 +1453,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 :
 // -----

 // COMMON-LABEL: @dont_pipeline_128x1
-// AMD-NOT: local_load{{.*}}128x1
-// CHECK: local_load{{.*}}128x1
+// COMMON-NOT: local_load{{.*}}128x1
 #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
 #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}>
 module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} {
