Skip to content

Commit c8187f6

Browse files
authored
[AMDGPU] Fix Xcnt handling between blocks (#165201)
For blocks with multiple predecessors, there may be `SMEM` and `VMEM` events active at the same time. This patch handles these cases.
1 parent 225f4c6 commit c8187f6

File tree

2 files changed

+199
-5
lines changed

2 files changed

+199
-5
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 24 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1288,18 +1288,38 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
12881288
}
12891289

12901290
void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) {
1291+
// On entry to a block with multiple predecessors, there may
1292+
// be pending SMEM and VMEM events active at the same time.
1293+
// In such cases, only clear one active event at a time.
1294+
auto applyPendingXcntGroup = [this](unsigned E) {
1295+
unsigned LowerBound = getScoreLB(X_CNT);
1296+
applyWaitcnt(X_CNT, 0);
1297+
PendingEvents |= (1 << E);
1298+
setScoreLB(X_CNT, LowerBound);
1299+
};
1300+
12911301
// Wait on XCNT is redundant if we are already waiting for a load to complete.
12921302
// SMEM can return out of order, so only omit XCNT wait if we are waiting till
12931303
// zero.
1294-
if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP))
1295-
return applyWaitcnt(X_CNT, 0);
1304+
if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) {
1305+
if (hasPendingEvent(VMEM_GROUP))
1306+
applyPendingXcntGroup(VMEM_GROUP);
1307+
else
1308+
applyWaitcnt(X_CNT, 0);
1309+
return;
1310+
}
12961311

12971312
// If we have pending store we cannot optimize XCnt because we do not wait for
12981313
// stores. VMEM loads return in order, so if we only have loads XCnt is
12991314
// decremented to the same number as LOADCnt.
13001315
if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) &&
1301-
!hasPendingEvent(STORE_CNT))
1302-
return applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
1316+
!hasPendingEvent(STORE_CNT)) {
1317+
if (hasPendingEvent(SMEM_GROUP))
1318+
applyPendingXcntGroup(SMEM_GROUP);
1319+
else
1320+
applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt));
1321+
return;
1322+
}
13031323

13041324
applyWaitcnt(X_CNT, Wait.XCnt);
13051325
}

llvm/test/CodeGen/AMDGPU/wait-xcnt.mir

Lines changed: 175 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -945,7 +945,6 @@ body: |
945945
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
946946
...
947947

948-
# FIXME: Missing S_WAIT_XCNT before overwriting vgpr0.
949948
---
950949
name: wait_kmcnt_with_outstanding_vmem_2
951950
tracksRegLiveness: true
@@ -971,6 +970,7 @@ body: |
971970
; GCN-NEXT: {{ $}}
972971
; GCN-NEXT: S_WAIT_KMCNT 0
973972
; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
973+
; GCN-NEXT: S_WAIT_XCNT 0
974974
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
975975
bb.0:
976976
liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
@@ -985,6 +985,180 @@ body: |
985985
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
986986
...
987987

988+
---
989+
name: wait_kmcnt_and_wait_loadcnt
990+
tracksRegLiveness: true
991+
machineFunctionInfo:
992+
isEntryFunction: true
993+
body: |
994+
; GCN-LABEL: name: wait_kmcnt_and_wait_loadcnt
995+
; GCN: bb.0:
996+
; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
997+
; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
998+
; GCN-NEXT: {{ $}}
999+
; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
1000+
; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
1001+
; GCN-NEXT: {{ $}}
1002+
; GCN-NEXT: bb.1:
1003+
; GCN-NEXT: successors: %bb.2(0x80000000)
1004+
; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr2
1005+
; GCN-NEXT: {{ $}}
1006+
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
1007+
; GCN-NEXT: {{ $}}
1008+
; GCN-NEXT: bb.2:
1009+
; GCN-NEXT: liveins: $sgpr2
1010+
; GCN-NEXT: {{ $}}
1011+
; GCN-NEXT: S_WAIT_KMCNT 0
1012+
; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
1013+
; GCN-NEXT: S_WAIT_LOADCNT 0
1014+
; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
1015+
bb.0:
1016+
liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
1017+
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
1018+
S_CBRANCH_SCC1 %bb.2, implicit $scc
1019+
bb.1:
1020+
liveins: $vgpr0_vgpr1, $sgpr2
1021+
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
1022+
bb.2:
1023+
liveins: $sgpr2
1024+
$sgpr2 = S_MOV_B32 $sgpr2
1025+
$vgpr2 = V_MOV_B32_e32 0, implicit $exec
1026+
...
1027+
1028+
---
1029+
name: implicit_handling_of_pending_vmem_group
1030+
tracksRegLiveness: true
1031+
machineFunctionInfo:
1032+
isEntryFunction: true
1033+
body: |
1034+
; GCN-LABEL: name: implicit_handling_of_pending_vmem_group
1035+
; GCN: bb.0:
1036+
; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
1037+
; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
1038+
; GCN-NEXT: {{ $}}
1039+
; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
1040+
; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
1041+
; GCN-NEXT: {{ $}}
1042+
; GCN-NEXT: bb.1:
1043+
; GCN-NEXT: successors: %bb.2(0x80000000)
1044+
; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr2
1045+
; GCN-NEXT: {{ $}}
1046+
; GCN-NEXT: $vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
1047+
; GCN-NEXT: {{ $}}
1048+
; GCN-NEXT: bb.2:
1049+
; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2
1050+
; GCN-NEXT: {{ $}}
1051+
; GCN-NEXT: S_WAIT_KMCNT 0
1052+
; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
1053+
; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
1054+
; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec
1055+
; GCN-NEXT: S_WAIT_XCNT 0
1056+
; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0
1057+
bb.0:
1058+
liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
1059+
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
1060+
S_CBRANCH_SCC1 %bb.2, implicit $scc
1061+
bb.1:
1062+
liveins: $vgpr0_vgpr1, $sgpr2
1063+
$vgpr2 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
1064+
bb.2:
1065+
liveins: $sgpr0_sgpr1, $sgpr2
1066+
$sgpr2 = S_MOV_B32 $sgpr2
1067+
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
1068+
$vgpr0 = V_MOV_B32_e32 0, implicit $exec
1069+
$sgpr0 = S_MOV_B32 $sgpr0
1070+
...
1071+
1072+
---
1073+
name: pending_vmem_event_between_block
1074+
tracksRegLiveness: true
1075+
machineFunctionInfo:
1076+
isEntryFunction: true
1077+
body: |
1078+
; GCN-LABEL: name: pending_vmem_event_between_block
1079+
; GCN: bb.0:
1080+
; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
1081+
; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
1082+
; GCN-NEXT: {{ $}}
1083+
; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
1084+
; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
1085+
; GCN-NEXT: {{ $}}
1086+
; GCN-NEXT: bb.1:
1087+
; GCN-NEXT: successors: %bb.2(0x80000000)
1088+
; GCN-NEXT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2
1089+
; GCN-NEXT: {{ $}}
1090+
; GCN-NEXT: $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
1091+
; GCN-NEXT: $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec
1092+
; GCN-NEXT: {{ $}}
1093+
; GCN-NEXT: bb.2:
1094+
; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2
1095+
; GCN-NEXT: {{ $}}
1096+
; GCN-NEXT: S_WAIT_KMCNT 0
1097+
; GCN-NEXT: $sgpr2 = S_MOV_B32 $sgpr2
1098+
; GCN-NEXT: S_WAIT_XCNT 1
1099+
; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
1100+
; GCN-NEXT: S_WAIT_XCNT 0
1101+
; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
1102+
; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0
1103+
bb.0:
1104+
liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
1105+
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
1106+
S_CBRANCH_SCC1 %bb.2, implicit $scc
1107+
bb.1:
1108+
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2
1109+
$vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
1110+
$vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec
1111+
bb.2:
1112+
liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2
1113+
$sgpr2 = S_MOV_B32 $sgpr2
1114+
$vgpr1 = V_MOV_B32_e32 0, implicit $exec
1115+
$vgpr2 = V_MOV_B32_e32 0, implicit $exec
1116+
$sgpr0 = S_MOV_B32 $sgpr0
1117+
...
1118+
1119+
---
1120+
name: flushing_vmem_cnt_on_block_entry
1121+
tracksRegLiveness: true
1122+
machineFunctionInfo:
1123+
isEntryFunction: true
1124+
body: |
1125+
; GCN-LABEL: name: flushing_vmem_cnt_on_block_entry
1126+
; GCN: bb.0:
1127+
; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
1128+
; GCN-NEXT: liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
1129+
; GCN-NEXT: {{ $}}
1130+
; GCN-NEXT: $sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
1131+
; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit $scc
1132+
; GCN-NEXT: {{ $}}
1133+
; GCN-NEXT: bb.1:
1134+
; GCN-NEXT: successors: %bb.2(0x80000000)
1135+
; GCN-NEXT: liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2
1136+
; GCN-NEXT: {{ $}}
1137+
; GCN-NEXT: $vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
1138+
; GCN-NEXT: $vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec
1139+
; GCN-NEXT: {{ $}}
1140+
; GCN-NEXT: bb.2:
1141+
; GCN-NEXT: liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2
1142+
; GCN-NEXT: {{ $}}
1143+
; GCN-NEXT: S_WAIT_XCNT 0
1144+
; GCN-NEXT: $vgpr1 = V_MOV_B32_e32 0, implicit $exec
1145+
; GCN-NEXT: $vgpr2 = V_MOV_B32_e32 0, implicit $exec
1146+
; GCN-NEXT: $sgpr0 = S_MOV_B32 $sgpr0
1147+
bb.0:
1148+
liveins: $vgpr0_vgpr1, $sgpr0_sgpr1, $scc
1149+
$sgpr2 = S_LOAD_DWORD_IMM $sgpr0_sgpr1, 0, 0
1150+
S_CBRANCH_SCC1 %bb.2, implicit $scc
1151+
bb.1:
1152+
liveins: $vgpr0_vgpr1, $vgpr2_vgpr3, $sgpr2
1153+
$vgpr4 = GLOBAL_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec
1154+
$vgpr5 = GLOBAL_LOAD_DWORD $vgpr2_vgpr3, 0, 0, implicit $exec
1155+
bb.2:
1156+
liveins: $sgpr0_sgpr1, $sgpr2, $vgpr2
1157+
$vgpr1 = V_MOV_B32_e32 0, implicit $exec
1158+
$vgpr2 = V_MOV_B32_e32 0, implicit $exec
1159+
$sgpr0 = S_MOV_B32 $sgpr0
1160+
...
1161+
9881162
---
9891163
name: wait_loadcnt_with_outstanding_smem
9901164
tracksRegLiveness: true

0 commit comments

Comments
 (0)