Skip to content

Commit 1e0a371

Browse files
[AMD] Restrict merging async_wait in StreamPipeliner (#7577)
Disable merging of async_wait ops when pipelining with num_stages=3. This avoids the incorrect operation order produced by combineRedundantWaitOps.
1 parent 4048f31 commit 1e0a371

File tree

1 file changed

+3
-2
lines changed

1 file changed

+3
-2
lines changed

third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -765,14 +765,15 @@ struct PipelinePass : impl::TritonAMDGPUStreamPipelineBase<PipelinePass> {
765765
useAsyncCopy, waitAtTail);
766766
}
767767

768-
if (useAsyncCopy) {
768+
if (useAsyncCopy && numStages != 3) {
769769
llvm::SmallSetVector<ttg::AsyncWaitOp, 8> waitOps;
770770
moduleOp.walk([&](ttg::AsyncWaitOp waitOp) {
771771
if (auto maybeForOp = dyn_cast<scf::ForOp>(waitOp->getParentOp()))
772772
// FIXME: There's a potential bug in combineRedundantWaitOps(): it
773773
// generates incorrect IR order when numStages==3.
774774
if (tt::getNumStagesOrDefault(maybeForOp, numStages) == 3)
775-
waitOps.insert(waitOp);
775+
return;
776+
waitOps.insert(waitOp);
776777
});
777778
tt::combineRedundantWaitOps(waitOps);
778779
}

0 commit comments

Comments
 (0)