Commit e7ec3fe

[SWP] attempt to remove a workaround for a triton llvm codegen bug (#4774)
Triton LLVM codegen had a bug where local_loads from #shared to #mma layout could lead to invalid code if the loaded shape is smaller than the mma tile; see triton-lang/triton#3561. This change removes the workaround that was added for that bug.

Verified with the test case at https://pastebin.com/xxP3cFmy (test.mlir): running

  triton-opt test.mlir -tritongpu-pipeline=num-stages=3 --convert-scf-to-cf --allocate-shared-memory --convert-triton-gpu-to-llvm

shows no issue. The unit test case added in triton-lang/triton#4798 also shows no issue.
1 parent: 2ef33c6
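For context, the shape condition enforced by the removed workaround can be sketched as the standalone helper below. This is a minimal illustration only, not Triton's actual API: the function name, the std::vector-based signature, and the example values are invented for this sketch. The idea, per the removed comment, is that a load was kept out of the pipeline whenever any trailing dimension of the loaded tensor was smaller than the corresponding MMA instruction-tile dimension, e.g. a 128x1 load against a {16, 8} MMAv2 tile fails because 1 < 8.

#include <cstddef>
#include <cstdint>
#include <vector>

// Minimal sketch (not Triton code): mirrors the shape comparison done by the
// workaround this commit removes. Returns true when every trailing dimension
// of the loaded tensor covers the corresponding MMA instruction-tile
// dimension, i.e. the case the workaround considered safe to pipeline.
static bool loadCoversMmaTile(const std::vector<int64_t> &loadShape,
                              const std::vector<int64_t> &mmaInstrShape) {
  if (loadShape.size() < mmaInstrShape.size())
    return false; // lower-rank loads were likewise not pipelined
  const std::size_t offset = loadShape.size() - mmaInstrShape.size();
  for (std::size_t i = 0; i < mmaInstrShape.size(); ++i)
    if (loadShape[offset + i] < mmaInstrShape[i])
      return false; // e.g. 1 < 8 for a 128x1 load with a {16, 8} tile
  return true;
}

// loadCoversMmaTile({128, 1}, {16, 8}) is false, so the workaround refused to
// pipeline such a load; with the codegen bug fixed, that restriction is gone.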

2 files changed: 2 additions & 32 deletions

lib/Dialect/TritonGPU/Transforms/Pipeliner/MatmulLoopPipeline.cpp

Lines changed: 0 additions & 31 deletions
@@ -441,37 +441,6 @@ assignMemoryLayouts(llvm::SmallVector<std::tuple<Operation *, int, Operation *>>
       } else if (auto dot = dyn_cast<tt::DotOp>(use)) {
         loadInfo.sharedEncoding =
             getSharedEncIfAllUsersAreDotEnc(op->getResult(0)).value_or(nullptr);
-
-        // HACK: Triton LLVM codegen has a bug where local_loads from #shared to
-        // #mma layout can lead to invalid code if the loaded shape is smaller
-        // than the mma tile (e.g. loading a 128x1 tensor for an MMAv2 dot with
-        // tile {16,8} is bad because 1 < 8). To work around this, don't
-        // pipeline such loads.
-        //
-        // The codegen bug is caught by an assertion, so if you think you've
-        // fixed it, feel free to delete this code and see if the assert still
-        // fails. :)
-        if (!loadInfo.sharedEncoding) {
-          if (auto dotEnc = dyn_cast<ttg::NvidiaMmaEncodingAttr>(
-                  dot.getResult().getType().getEncoding())) {
-            auto loadTy = cast<RankedTensorType>(op->getResultTypes()[0]);
-            auto mmaInstrShape = dotEnc.getInstrShape();
-            if (loadTy.getRank() < mmaInstrShape.size())
-              continue;
-            bool ok = true;
-            for (int i = 0; i < mmaInstrShape.size(); i++) {
-              if (loadTy.getShape()[loadTy.getRank() - mmaInstrShape.size() +
-                                    i] < mmaInstrShape[i]) {
-                ok = false;
-                break;
-              }
-            }
-            // If this load might trigger the bug, don't do the fallback logic
-            // below, which might allow the load to be pipelined.
-            if (!ok)
-              continue;
-          }
-        }
       }
     } else if (auto loadOp = dyn_cast<tt::LoadOp>(use)) {
       // The use of this loadOp is another loadOp. If the use is not in the

test/TritonGPU/loop-pipeline.mlir

Lines changed: 2 additions & 1 deletion
@@ -1460,7 +1460,8 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 2 :
 // -----

 // COMMON-LABEL: @dont_pipeline_128x1
-// COMMON-NOT: local_load{{.*}}128x1
+// AMD-NOT: local_load{{.*}}128x1
+// CHECK: local_load{{.*}}128x1
 #blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [0, 1]}>
 #mma = #triton_gpu.nvidia_mma<{versionMajor = 2, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 8]}>
 module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32} {
