Skip to content

Commit 14727cf

Browse files
committed
[AMDGPU] Disable VALU sinking and hoisting with WWM
Machine LICM can hoist a VALU instruction out of a WWM region. In this case the WQM pass will have to create yet another WWM region around the hoisted instruction, which is not desired. Unfortunately, we cannot tell whether an instruction is inside a WWM region, so this patch disables hoisting if WWM is used in the function. This works around the bug SWDEV-502411.
1 parent fbea21a commit 14727cf

16 files changed

+7764
-7385
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2773,6 +2773,9 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
27732773
case Intrinsic::amdgcn_wwm:
27742774
case Intrinsic::amdgcn_strict_wwm:
27752775
Opcode = AMDGPU::STRICT_WWM;
2776+
CurDAG->getMachineFunction()
2777+
.getInfo<SIMachineFunctionInfo>()
2778+
->setInitWholeWave();
27762779
break;
27772780
case Intrinsic::amdgcn_strict_wqm:
27782781
Opcode = AMDGPU::STRICT_WQM;

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1055,8 +1055,12 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
10551055
case Intrinsic::amdgcn_softwqm:
10561056
return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
10571057
case Intrinsic::amdgcn_strict_wwm:
1058-
case Intrinsic::amdgcn_wwm:
1058+
case Intrinsic::amdgcn_wwm: {
1059+
MachineFunction *MF = I.getParent()->getParent();
1060+
SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
1061+
MFInfo->setInitWholeWave();
10591062
return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1063+
}
10601064
case Intrinsic::amdgcn_strict_wqm:
10611065
return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
10621066
case Intrinsic::amdgcn_writelane:

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,11 @@ static bool resultDependsOnExec(const MachineInstr &MI) {
184184
bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
185185
// Any implicit use of exec by VALU is not a real register read.
186186
return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
187-
isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
187+
isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent()) &&
188+
!MO.getParent()
189+
->getMF()
190+
->getInfo<SIMachineFunctionInfo>()
191+
->hasInitWholeWave();
188192
}
189193

190194
bool SIInstrInfo::isSafeToSink(MachineInstr &MI,

llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll

Lines changed: 700 additions & 680 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll

Lines changed: 3067 additions & 2967 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/cse-convergent.ll

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,31 +8,33 @@ define i32 @test(i32 %val, i32 %cond) {
88
; GCN-NEXT: s_xor_saveexec_b32 s4, -1
99
; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
1010
; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
11+
; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
1112
; GCN-NEXT: s_waitcnt_depctr 0xffe3
1213
; GCN-NEXT: s_mov_b32 exec_lo, s4
1314
; GCN-NEXT: s_or_saveexec_b32 s4, -1
1415
; GCN-NEXT: v_mov_b32_e32 v2, 0
1516
; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v0, s4
16-
; GCN-NEXT: v_mov_b32_dpp v2, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
17-
; GCN-NEXT: s_mov_b32 exec_lo, s4
18-
; GCN-NEXT: v_mov_b32_e32 v5, 0
1917
; GCN-NEXT: v_mov_b32_e32 v4, v2
18+
; GCN-NEXT: v_mov_b32_dpp v4, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
19+
; GCN-NEXT: s_mov_b32 exec_lo, s4
20+
; GCN-NEXT: v_mov_b32_e32 v5, v4
2021
; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1
2122
; GCN-NEXT: s_and_saveexec_b32 s4, vcc_lo
2223
; GCN-NEXT: ; %bb.1: ; %if
2324
; GCN-NEXT: s_or_saveexec_b32 s5, -1
24-
; GCN-NEXT: v_mov_b32_e32 v2, 0
25-
; GCN-NEXT: v_cndmask_b32_e64 v3, 0, v0, s5
26-
; GCN-NEXT: v_mov_b32_dpp v2, v3 row_xmask:1 row_mask:0xf bank_mask:0xf
25+
; GCN-NEXT: v_mov_b32_e32 v3, 0
26+
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, v0, s5
27+
; GCN-NEXT: v_mov_b32_dpp v3, v2 row_xmask:1 row_mask:0xf bank_mask:0xf
2728
; GCN-NEXT: s_mov_b32 exec_lo, s5
28-
; GCN-NEXT: v_mov_b32_e32 v5, v2
29+
; GCN-NEXT: v_mov_b32_e32 v2, v3
2930
; GCN-NEXT: ; %bb.2: ; %end
3031
; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4
31-
; GCN-NEXT: v_add_nc_u32_e32 v0, v4, v5
32+
; GCN-NEXT: v_add_nc_u32_e32 v0, v5, v2
3233
; GCN-NEXT: s_xor_saveexec_b32 s4, -1
33-
; GCN-NEXT: s_clause 0x1
34+
; GCN-NEXT: s_clause 0x2
3435
; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s32
3536
; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:4
37+
; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8
3638
; GCN-NEXT: s_waitcnt_depctr 0xffe3
3739
; GCN-NEXT: s_mov_b32 exec_lo, s4
3840
; GCN-NEXT: s_waitcnt vmcnt(0)

llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll

Lines changed: 1138 additions & 1077 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll

Lines changed: 792 additions & 745 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll

Lines changed: 792 additions & 745 deletions
Large diffs are not rendered by default.

llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll

Lines changed: 1179 additions & 1110 deletions
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)