Skip to content

Commit 0060f29

Browse files
authored
[AMDGPU] Add option to disable VALU sinking and hoisting with WWM (llvm#875)
Enable this via -amdgpu-ignorable-use-considers-wwm. This works around the bug SWDEV-502411. Originally authored by: Stanislav Mekhanoshin <[email protected]>
1 parent 67d3b06 commit 0060f29

14 files changed

+51
-10
lines changed

llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2866,6 +2866,9 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
28662866
case Intrinsic::amdgcn_wwm:
28672867
case Intrinsic::amdgcn_strict_wwm:
28682868
Opcode = AMDGPU::STRICT_WWM;
2869+
CurDAG->getMachineFunction()
2870+
.getInfo<SIMachineFunctionInfo>()
2871+
->setUsesWholeWave();
28692872
break;
28702873
case Intrinsic::amdgcn_strict_wqm:
28712874
Opcode = AMDGPU::STRICT_WQM;

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1137,8 +1137,12 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
11371137
case Intrinsic::amdgcn_softwqm:
11381138
return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
11391139
case Intrinsic::amdgcn_strict_wwm:
1140-
case Intrinsic::amdgcn_wwm:
1140+
case Intrinsic::amdgcn_wwm: {
1141+
MachineFunction *MF = I.getParent()->getParent();
1142+
SIMachineFunctionInfo *MFInfo = MF->getInfo<SIMachineFunctionInfo>();
1143+
MFInfo->setUsesWholeWave();
11411144
return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WWM);
1145+
}
11421146
case Intrinsic::amdgcn_strict_wqm:
11431147
return constrainCopyLikeIntrin(I, AMDGPU::STRICT_WQM);
11441148
case Intrinsic::amdgcn_writelane:

llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
6868
bool WaveLimiter = false;
6969

7070
bool HasInitWholeWave = false;
71+
bool UsesWholeWave = false;
7172

7273
public:
7374
AMDGPUMachineFunction(const Function &F, const AMDGPUSubtarget &ST);
@@ -114,6 +115,9 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
114115
bool hasInitWholeWave() const { return HasInitWholeWave; }
115116
void setInitWholeWave() { HasInitWholeWave = true; }
116117

118+
bool usesWholeWave() const { return HasInitWholeWave || UsesWholeWave; }
119+
void setUsesWholeWave() { UsesWholeWave = true; }
120+
117121
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV) {
118122
return allocateLDSGlobal(DL, GV, DynLDSAlign);
119123
}

llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1942,6 +1942,8 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
19421942

19431943
if (YamlMFI.HasInitWholeWave)
19441944
MFI->setInitWholeWave();
1945+
if (YamlMFI.UsesWholeWave)
1946+
MFI->setUsesWholeWave();
19451947

19461948
return false;
19471949
}

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,11 @@ static cl::opt<bool> Fix16BitCopies(
6060
cl::init(true),
6161
cl::ReallyHidden);
6262

63+
static cl::opt<bool> IgnorableUseConsidersWWM(
64+
"amdgpu-ignorable-use-considers-wwm",
65+
cl::desc("Disable ignore use semantics for functions with WWM usage"),
66+
cl::init(false), cl::ReallyHidden);
67+
6368
SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
6469
: AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
6570
RI(ST), ST(ST) {
@@ -184,8 +189,13 @@ static bool resultDependsOnExec(const MachineInstr &MI) {
184189

185190
bool SIInstrInfo::isIgnorableUse(const MachineOperand &MO) const {
186191
// Any implicit use of exec by VALU is not a real register read.
192+
bool FuncUsesWWM = MO.getParent()
193+
->getMF()
194+
->getInfo<SIMachineFunctionInfo>()
195+
->usesWholeWave();
187196
return MO.getReg() == AMDGPU::EXEC && MO.isImplicit() &&
188-
isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent());
197+
isVALU(*MO.getParent()) && !resultDependsOnExec(*MO.getParent()) &&
198+
!(IgnorableUseConsidersWWM && FuncUsesWWM);
189199
}
190200

191201
bool SIInstrInfo::isSafeToSink(MachineInstr &MI,

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -713,7 +713,8 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
713713
ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
714714
PSInputAddr(MFI.getPSInputAddr()), PSInputEnable(MFI.getPSInputEnable()),
715715
MaxMemoryClusterDWords(MFI.getMaxMemoryClusterDWords()),
716-
Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()) {
716+
Mode(MFI.getMode()), HasInitWholeWave(MFI.hasInitWholeWave()),
717+
UsesWholeWave(MFI.usesWholeWave()) {
717718
for (Register Reg : MFI.getSGPRSpillPhysVGPRs())
718719
SpillPhysVGPRS.push_back(regToString(Reg, TRI));
719720

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,7 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
298298
StringValue LongBranchReservedReg;
299299

300300
bool HasInitWholeWave = false;
301+
bool UsesWholeWave = false;
301302

302303
SIMachineFunctionInfo() = default;
303304
SIMachineFunctionInfo(const llvm::SIMachineFunctionInfo &,
@@ -350,6 +351,7 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
350351
YamlIO.mapOptional("longBranchReservedReg", MFI.LongBranchReservedReg,
351352
StringValue());
352353
YamlIO.mapOptional("hasInitWholeWave", MFI.HasInitWholeWave, false);
354+
YamlIO.mapOptional("usesWholeWave", MFI.UsesWholeWave, false);
353355
}
354356
};
355357

llvm/test/CodeGen/AMDGPU/licm-wwm.mir

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,31 @@
11
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
2-
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -run-pass=early-machinelicm,si-wqm -o - %s | FileCheck -check-prefix=GCN %s
3-
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=early-machinelicm,si-wqm -o - %s | FileCheck -check-prefix=GCN %s
2+
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -run-pass=early-machinelicm,si-wqm -amdgpu-ignorable-use-considers-wwm=1 -o - %s | FileCheck -check-prefix=GCN %s
3+
# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -passes=early-machinelicm,si-wqm -amdgpu-ignorable-use-considers-wwm=1 -o - %s | FileCheck -check-prefix=GCN %s
44

55
# Machine LICM may hoist an instruction from a WWM region, which will force the SI-WQM pass
66
# to create a second WWM region. This is an unwanted hoisting.
7+
# Make sure this does not happen when -amdgpu-ignorable-use-considers-wwm is enabled.
78

89
---
910
name: licm_move_wwm
1011
tracksRegLiveness: true
12+
machineFunctionInfo:
13+
usesWholeWave: true
14+
1115
body: |
1216
; GCN-LABEL: name: licm_move_wwm
1317
; GCN: bb.0:
1418
; GCN-NEXT: successors: %bb.1(0x80000000)
1519
; GCN-NEXT: {{ $}}
16-
; GCN-NEXT: [[ENTER_STRICT_WWM:%[0-9]+]]:sreg_32 = ENTER_STRICT_WWM -1, implicit-def $exec, implicit-def $scc, implicit $exec
17-
; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
18-
; GCN-NEXT: $exec_lo = EXIT_STRICT_WWM [[ENTER_STRICT_WWM]]
1920
; GCN-NEXT: S_BRANCH %bb.1
2021
; GCN-NEXT: {{ $}}
2122
; GCN-NEXT: bb.1:
2223
; GCN-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
2324
; GCN-NEXT: {{ $}}
24-
; GCN-NEXT: [[ENTER_STRICT_WWM1:%[0-9]+]]:sreg_32 = ENTER_STRICT_WWM -1, implicit-def $exec, implicit-def $scc, implicit $exec
25+
; GCN-NEXT: [[ENTER_STRICT_WWM:%[0-9]+]]:sreg_32 = ENTER_STRICT_WWM -1, implicit-def $exec, implicit-def $scc, implicit $exec
26+
; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
2527
; GCN-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[V_MOV_B32_e32_]], implicit $exec
26-
; GCN-NEXT: $exec_lo = EXIT_STRICT_WWM [[ENTER_STRICT_WWM1]]
28+
; GCN-NEXT: $exec_lo = EXIT_STRICT_WWM [[ENTER_STRICT_WWM]]
2729
; GCN-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[V_READFIRSTLANE_B32_]]
2830
; GCN-NEXT: $exec_lo = S_OR_B32 $exec_lo, [[COPY]], implicit-def $scc
2931
; GCN-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec

llvm/test/CodeGen/MIR/AMDGPU/long-branch-reg-all-sgpr-used.ll

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
4545
; CHECK-NEXT: longBranchReservedReg: ''
4646
; CHECK-NEXT: hasInitWholeWave: false
47+
; CHECK-NEXT: usesWholeWave: false
4748
; CHECK-NEXT: body:
4849
define amdgpu_kernel void @long_branch_used_all_sgprs(ptr addrspace(1) %arg, i32 %cnd) #0 {
4950
entry:
@@ -311,6 +312,7 @@
311312
; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
312313
; CHECK-NEXT: longBranchReservedReg: ''
313314
; CHECK-NEXT: hasInitWholeWave: false
315+
; CHECK-NEXT: usesWholeWave: false
314316
; CHECK-NEXT: body:
315317
define amdgpu_kernel void @long_branch_high_num_sgprs_used(ptr addrspace(1) %arg, i32 %cnd) #0 {
316318
entry:

llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-after-pei.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@
4444
; AFTER-PEI-NEXT: sgprForEXECCopy: ''
4545
; AFTER-PEI-NEXT: longBranchReservedReg: ''
4646
; AFTER-PEI-NEXT: hasInitWholeWave: false
47+
; AFTER-PEI-NEXT: usesWholeWave: false
4748
; AFTER-PEI-NEXT: body:
4849
define amdgpu_kernel void @scavenge_fi(ptr addrspace(1) %out, i32 %in) #0 {
4950
%wide.sgpr0 = call <32 x i32> asm sideeffect "; def $0", "=s" () #0

0 commit comments

Comments
 (0)