Skip to content

Commit 4ce54b5

Browse files
[AMD][BACKEND] Bugfix to small tile pingpong (#5759)
Do not simply append the dot op right after the memory ops, since doing so can reorder the dot op relative to the ops it depends on. Fixes a test failure in test_matmul.
1 parent e2c09d7 commit 4ce54b5

File tree

2 files changed

+10
-3
lines changed

2 files changed

+10
-3
lines changed

test/TritonGPU/amd/amd-block-pingpong.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@
44
//CHECK: ttg.local_load
55
//CHECK: rocdl.s.setprio 1
66
//CHECK: tt.load
7-
//CHECK: rocdl.sched.barrier 0
7+
//CHECK: rocdl.sched.barrier
88
//CHECK: ttg.local_load
99
//CHECK: rocdl.s.setprio 0
1010
//CHECK: tt.load
11-
//CHECK: rocdl.sched.barrier 0
11+
//CHECK: rocdl.sched.barrier
1212
//CHECK: rocdl.s.setprio 1
1313
//CHECK: tt.dot
1414
//CHECK: rocdl.s.setprio 0

third_party/amd/lib/TritonAMDGPUTransforms/BlockPingpong.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,13 @@ void Pingponger::appendOpWithPrio(OpBuilder &builder, Operation *op,
119119
// high-level operations, inserting `setPrio` also has the same effect as an
120120
// instruction scheduling boundary.
121121
void Pingponger::transformOnePPClusters(OpBuilder &builder, Location loc) {
122+
auto dotLoc = dotOps[0]->getPrevNode();
123+
// sched barrier to prevent memory ops from crossing it, while leaving other
124+
// ops free to be scheduled across the barrier.
125+
auto preDotBar = builder.create<ROCDL::SchedBarrier>(loc, 1);
126+
updateOpInsertion(dotLoc);
127+
appendOp(preDotBar);
128+
122129
// Memory cluster #0
123130
updateOpInsertion(lLoadOps[0]);
124131
appendOp(builder.create<ROCDL::SetPrioOp>(loc, highPriority));
@@ -127,9 +134,9 @@ void Pingponger::transformOnePPClusters(OpBuilder &builder, Location loc) {
127134
appendOp(lLoadOps[1]);
128135
appendOp(builder.create<ROCDL::SetPrioOp>(loc, lowPriority));
129136
appendOp(gLoadOps[1]);
130-
appendOp(builder.create<ROCDL::SchedBarrier>(loc, 0));
131137

132138
// Dot cluster #0
139+
updateOpInsertion(preDotBar);
133140
appendOpWithPrio(builder, dotOps[0], loc);
134141
}
135142

0 commit comments

Comments
 (0)