AMDGPU: Track AGPR pressure #150288
base: main
Conversation
Force-pushed from e973d74 to d4fd1c1
jrbyrnes left a comment:
Needs lit tests.
Force-pushed from b4396ad to 41a5fb4
@llvm/pr-subscribers-backend-amdgpu

Author: Nicholas Baron (Nicholas-Baron)

Changes

Patch is 12.94 MiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/150288.diff

157 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index 254b75b784e75..fab44ef942aa3 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -86,6 +86,8 @@ void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
VGPRExcessLimit =
Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
+ AGPRExcessLimit =
+ Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::AGPR_32RegClass);
SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
// Set the initial TargetOccupnacy to the maximum occupancy that we can
@@ -98,6 +100,9 @@ void GCNSchedStrategy::initialize(ScheduleDAGMI *DAG) {
SGPRCriticalLimit =
std::min(ST.getMaxNumSGPRs(TargetOccupancy, true), SGPRExcessLimit);
+ AGPRCriticalLimit =
+ std::min(ST.getMaxNumAGPRs(TargetOccupancy), AGPRExcessLimit);
+
if (!KnownExcessRP) {
VGPRCriticalLimit = std::min(
ST.getMaxNumVGPRs(TargetOccupancy, MFI.getDynamicVGPRBlockSize()),
@@ -201,7 +206,8 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
const RegPressureTracker &RPTracker,
const SIRegisterInfo *SRI,
unsigned SGPRPressure,
- unsigned VGPRPressure, bool IsBottomUp) {
+ unsigned VGPRPressure,
+ unsigned AGPRPressure, bool IsBottomUp) {
Cand.SU = SU;
Cand.AtTop = AtTop;
@@ -230,6 +236,7 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
Pressure.resize(4, 0);
Pressure[AMDGPU::RegisterPressureSets::SReg_32] = SGPRPressure;
Pressure[AMDGPU::RegisterPressureSets::VGPR_32] = VGPRPressure;
+ Pressure[AMDGPU::RegisterPressureSets::AGPR_32] = AGPRPressure;
for (const auto &Diff : DAG->getPressureDiff(SU)) {
if (!Diff.isValid())
@@ -247,7 +254,9 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
if (Pressure[AMDGPU::RegisterPressureSets::SReg_32] !=
CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] ||
Pressure[AMDGPU::RegisterPressureSets::VGPR_32] !=
- CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32]) {
+ CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] ||
+ Pressure[AMDGPU::RegisterPressureSets::AGPR_32] !=
+ CheckPressure[AMDGPU::RegisterPressureSets::AGPR_32]) {
errs() << "Register Pressure is inaccurate when calculated through "
"PressureDiff\n"
<< "SGPR got " << Pressure[AMDGPU::RegisterPressureSets::SReg_32]
@@ -255,7 +264,10 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
<< CheckPressure[AMDGPU::RegisterPressureSets::SReg_32] << "\n"
<< "VGPR got " << Pressure[AMDGPU::RegisterPressureSets::VGPR_32]
<< ", expected "
- << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n";
+ << CheckPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n"
+ << "AGPR got " << Pressure[AMDGPU::RegisterPressureSets::AGPR_32]
+ << ", expected "
+ << CheckPressure[AMDGPU::RegisterPressureSets::AGPR_32] << "\n";
report_fatal_error("inaccurate register pressure calculation");
}
#endif
@@ -263,6 +275,7 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
+ unsigned NewAGPRPressure = Pressure[AMDGPU::RegisterPressureSets::AGPR_32];
// If two instructions increase the pressure of different register sets
// by the same amount, the generic scheduler will prefer to schedule the
@@ -272,9 +285,11 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
// only for VGPRs or only for SGPRs.
// FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
- const unsigned MaxVGPRPressureInc = 16;
+ static constexpr unsigned MaxVGPRPressureInc = 16;
bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
- bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
+ bool ShouldTrackAGPRs = !ShouldTrackVGPRs && AGPRPressure >= AGPRExcessLimit;
+ bool ShouldTrackSGPRs =
+ !ShouldTrackVGPRs && !ShouldTrackAGPRs && SGPRPressure >= SGPRExcessLimit;
// FIXME: We have to enter REG-EXCESS before we reach the actual threshold
// to increase the likelihood we don't go over the limits. We should improve
@@ -291,6 +306,12 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
}
+ if (ShouldTrackAGPRs && NewAGPRPressure >= AGPRPressure) {
+ HasHighPressure = true;
+ Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::AGPR_32);
+ Cand.RPDelta.Excess.setUnitInc(NewAGPRPressure - AGPRExcessLimit);
+ }
+
if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
HasHighPressure = true;
Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
@@ -304,13 +325,19 @@ void GCNSchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
+ int AGPRDelta = NewAGPRPressure - AGPRCriticalLimit;
- if (SGPRDelta >= 0 || VGPRDelta >= 0) {
+ if (SGPRDelta >= 0 || VGPRDelta >= 0 || AGPRDelta >= 0) {
HasHighPressure = true;
- if (SGPRDelta > VGPRDelta) {
+ // Prioritize reducing the VGPRDelta if both are >= 0
+ if (SGPRDelta > VGPRDelta && SGPRDelta > AGPRDelta) {
Cand.RPDelta.CriticalMax =
PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
+ } else if (AGPRDelta > VGPRDelta) {
+ Cand.RPDelta.CriticalMax =
+ PressureChange(AMDGPU::RegisterPressureSets::AGPR_32);
+ Cand.RPDelta.CriticalMax.setUnitInc(AGPRDelta);
} else {
Cand.RPDelta.CriticalMax =
PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
@@ -330,16 +357,19 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
unsigned SGPRPressure = 0;
unsigned VGPRPressure = 0;
+ unsigned AGPRPressure = 0;
if (DAG->isTrackingPressure()) {
if (!GCNTrackers) {
SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
+ AGPRPressure = Pressure[AMDGPU::RegisterPressureSets::AGPR_32];
} else {
GCNRPTracker *T = IsBottomUp
? static_cast<GCNRPTracker *>(&UpwardTracker)
: static_cast<GCNRPTracker *>(&DownwardTracker);
SGPRPressure = T->getPressure().getSGPRNum();
VGPRPressure = T->getPressure().getArchVGPRNum();
+ AGPRPressure = T->getPressure().getAGPRNum();
}
}
ReadyQueue &Q = Zone.Available;
@@ -347,7 +377,7 @@ void GCNSchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
SchedCandidate TryCand(ZonePolicy);
initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI, SGPRPressure,
- VGPRPressure, IsBottomUp);
+ VGPRPressure, AGPRPressure, IsBottomUp);
// Pass SchedBoundary only when comparing nodes from the same boundary.
SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
tryCandidate(Cand, TryCand, ZoneArg);
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index 790370ff8ab4d..8b2137bcd14da 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -53,7 +53,8 @@ class GCNSchedStrategy : public GenericScheduler {
void initCandidate(SchedCandidate &Cand, SUnit *SU, bool AtTop,
const RegPressureTracker &RPTracker,
const SIRegisterInfo *SRI, unsigned SGPRPressure,
- unsigned VGPRPressure, bool IsBottomUp);
+ unsigned VGPRPressure, unsigned AGPRPressure,
+ bool IsBottomUp);
std::vector<unsigned> Pressure;
@@ -63,6 +64,8 @@ class GCNSchedStrategy : public GenericScheduler {
unsigned VGPRExcessLimit;
+ unsigned AGPRExcessLimit;
+
unsigned TargetOccupancy;
MachineFunction *MF;
@@ -103,6 +106,8 @@ class GCNSchedStrategy : public GenericScheduler {
unsigned VGPRCriticalLimit;
+ unsigned AGPRCriticalLimit;
+
unsigned SGPRLimitBias = 0;
unsigned VGPRLimitBias = 0;
@@ -183,8 +188,7 @@ class ScheduleMetrics {
};
inline raw_ostream &operator<<(raw_ostream &OS, const ScheduleMetrics &Sm) {
- dbgs() << "\n Schedule Metric (scaled by "
- << ScheduleMetrics::ScaleFactor
+ dbgs() << "\n Schedule Metric (scaled by " << ScheduleMetrics::ScaleFactor
<< " ) is: " << Sm.getMetric() << " [ " << Sm.getBubbles() << "/"
<< Sm.getLength() << " ]\n";
return OS;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 4475c8d1d1602..c9fa3894408e9 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -1722,8 +1722,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
/// unit requirement.
unsigned getMaxNumVGPRs(const Function &F) const;
- unsigned getMaxNumAGPRs(const Function &F) const {
- return getMaxNumVGPRs(F);
+ unsigned getMaxNumAGPRs(unsigned WavesPerEU) const {
+ return AMDGPU::IsaInfo::getMaxNumAGPRs(this, WavesPerEU);
}
/// Return a pair of maximum numbers of VGPRs and AGPRs that meet the number
@@ -1744,13 +1744,9 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool supportsWave64() const { return !hasGFX1250Insts(); }
- bool isWave32() const {
- return getWavefrontSize() == 32;
- }
+ bool isWave32() const { return getWavefrontSize() == 32; }
- bool isWave64() const {
- return getWavefrontSize() == 64;
- }
+ bool isWave64() const { return getWavefrontSize() == 64; }
/// Returns if the wavesize of this subtarget is known reliable. This is false
/// only for the a default target-cpu that does not have an explicit
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 0a0b02c18c1db..d78106694f2e8 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1494,6 +1494,22 @@ unsigned getMaxNumVGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU,
return std::min(MaxNumVGPRs, AddressableNumVGPRs);
}
+unsigned getMaxNumAGPRs(const MCSubtargetInfo *STI, unsigned int WavesPerEU) {
+ if (!STI->getFeatureBits().test(FeatureMAIInsts))
+ return 0;
+
+ assert(WavesPerEU != 0);
+
+ assert(!STI->getFeatureBits().test(FeatureDynamicVGPR));
+
+ unsigned MaxNumAGPRs =
+ alignTo(getTotalNumVGPRs(STI) / WavesPerEU, getVGPRAllocGranule(STI, 0));
+ unsigned AddressableNumAGPRs = getAddressableNumArchVGPRs(STI);
+ return std::min(MaxNumAGPRs, AddressableNumAGPRs);
+}
+
+unsigned getAddressableNumAGPRs(const MCSubtargetInfo *STI) { return 256; }
+
unsigned getEncodedNumVGPRBlocks(const MCSubtargetInfo *STI, unsigned NumVGPRs,
std::optional<bool> EnableWavefrontSize32) {
return getGranulatedNumRegisterBlocks(
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 23ea3ba0c8385..ecf7faac89ce5 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -353,6 +353,13 @@ unsigned getNumWavesPerEUWithNumVGPRs(unsigned NumVGPRs, unsigned Granule,
unsigned MaxWaves,
unsigned TotalNumVGPRs);
+/// \returns Maximum number of AGPRs that meets given number of waves per
+/// execution unit requirement for given subtarget \p STI.
+unsigned getMaxNumAGPRs(const MCSubtargetInfo *STI, unsigned WavesPerEU);
+
+/// \returns Addressable number of AGPRs for a given subtarget \p STI.
+unsigned getAddressableNumAGPRs(const MCSubtargetInfo *STI);
+
/// \returns Occupancy for a given \p SGPRs usage, \p MaxWaves possible, and \p
/// Gen.
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs, unsigned MaxWaves,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
index b67080bd4798d..7c58791281562 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
@@ -149,55 +149,55 @@ define void @add_v5i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
; GFX8-LABEL: add_v5i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 2, v0
+; GFX8-NEXT: v_add_u32_e64 v19, s[16:17], 2, v0
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v0
+; GFX8-NEXT: v_add_u32_e64 v8, s[4:5], 6, v0
+; GFX8-NEXT: v_add_u32_e64 v10, s[6:7], 8, v0
+; GFX8-NEXT: v_add_u32_e64 v12, s[8:9], 2, v2
+; GFX8-NEXT: v_add_u32_e64 v14, s[10:11], 4, v2
+; GFX8-NEXT: v_add_u32_e64 v18, s[14:15], 8, v2
+; GFX8-NEXT: v_addc_u32_e64 v20, s[16:17], 0, v1, s[16:17]
+; GFX8-NEXT: v_add_u32_e64 v16, s[12:13], 6, v2
; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, 6, v0
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_ushort v12, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 8, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_ushort v13, v[6:7]
-; GFX8-NEXT: flat_load_ushort v14, v[8:9]
-; GFX8-NEXT: flat_load_ushort v15, v[10:11]
-; GFX8-NEXT: flat_load_ushort v16, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v2
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v2
-; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 6, v2
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, 8, v2
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc
-; GFX8-NEXT: flat_load_ushort v17, v[2:3]
-; GFX8-NEXT: flat_load_ushort v18, v[0:1]
-; GFX8-NEXT: flat_load_ushort v19, v[6:7]
-; GFX8-NEXT: flat_load_ushort v20, v[8:9]
+; GFX8-NEXT: v_addc_u32_e64 v9, vcc, 0, v1, s[4:5]
+; GFX8-NEXT: v_addc_u32_e64 v11, vcc, 0, v1, s[6:7]
+; GFX8-NEXT: flat_load_ushort v21, v[0:1]
+; GFX8-NEXT: flat_load_ushort v20, v[19:20]
+; GFX8-NEXT: flat_load_ushort v22, v[6:7]
+; GFX8-NEXT: v_addc_u32_e64 v13, s[6:7], 0, v3, s[8:9]
+; GFX8-NEXT: v_addc_u32_e64 v15, s[6:7], 0, v3, s[10:11]
+; GFX8-NEXT: v_addc_u32_e64 v19, s[6:7], 0, v3, s[14:15]
+; GFX8-NEXT: flat_load_ushort v23, v[8:9]
; GFX8-NEXT: flat_load_ushort v10, v[10:11]
+; GFX8-NEXT: v_addc_u32_e64 v17, s[6:7], 0, v3, s[12:13]
+; GFX8-NEXT: flat_load_ushort v11, v[2:3]
+; GFX8-NEXT: flat_load_ushort v12, v[12:13]
+; GFX8-NEXT: flat_load_ushort v13, v[14:15]
+; GFX8-NEXT: flat_load_ushort v14, v[16:17]
+; GFX8-NEXT: flat_load_ushort v15, v[18:19]
; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v4
+; GFX8-NEXT: v_add_u32_e64 v6, s[4:5], 4, v4
+; GFX8-NEXT: v_add_u32_e64 v8, s[6:7], 6, v4
+; GFX8-NEXT: v_add_u32_e64 v2, s[8:9], 8, v4
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
-; GFX8-NEXT: v_add_u32_e32 v2, vcc, 4, v4
-; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 6, v4
-; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v5, vcc
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 8, v4
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v5, vcc
+; GFX8-NEXT: v_addc_u32_e64 v7, vcc, 0, v5, s[4:5]
+; GFX8-NEXT: v_addc_u32_e64 v9, vcc, 0, v5, s[6:7]
+; GFX8-NEXT: v_addc_u32_e64 v3, vcc, 0, v5, s[8:9]
; GFX8-NEXT: s_waitcnt vmcnt(4)
-; GFX8-NEXT: v_add_u16_e32 v11, v12, v17
+; GFX8-NEXT: v_add_u16_e32 v11, v21, v11
; GFX8-NEXT: s_waitcnt vmcnt(3)
-; GFX8-NEXT: v_add_u16_e32 v12, v13, v18
+; GFX8-NEXT: v_add_u16_e32 v12, v20, v12
; GFX8-NEXT: s_waitcnt vmcnt(2)
-; GFX8-NEXT: v_add_u16_e32 v13, v14, v19
+; GFX8-NEXT: v_add_u16_e32 v13, v22, v13
; GFX8-NEXT: s_waitcnt vmcnt(1)
-; GFX8-NEXT: v_add_u16_e32 v14, v15, v20
+; GFX8-NEXT: v_add_u16_e32 v14, v23, v14
; GFX8-NEXT: s_waitcnt vmcnt(0)
-; GFX8-NEXT: v_add_u16_e32 v10, v16, v10
+; GFX8-NEXT: v_add_u16_e32 v10, v10, v15
; GFX8-NEXT: flat_store_short v[4:5], v11
; GFX8-NEXT: flat_store_short v[0:1], v12
-; GFX8-NEXT: flat_store_short v[2:3], v13
-; GFX8-NEXT: flat_store_short v[6:7], v14
-; GFX8-NEXT: flat_store_short v[8:9], v10
+; GFX8-NEXT: flat_store_short v[6:7], v13
+; GFX8-NEXT: flat_store_short v[8:9], v14
+; GFX8-NEXT: flat_store_short v[2:3], v10
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
@@ -341,77 +341,77 @@ define void @addv_7i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addrs
; GFX8-LABEL: addv_7i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 2, v0
+; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v0
+; GFX8-NEXT: v_add_u32_e64 v8, s[4:5], 6, v0
+; GFX8-NEXT: v_add_u32_e64 v10, s[6:7], 8, v0
+; GFX8-NEXT: v_add_u32_e64 v12, s[8:9], 10, v0
+; GFX8-NEXT: v_add_u32_e64 v14, s[10:11], 12, v0
+; GFX8-NEXT: v_add_u32_e64 v19, s[16:17], 2, v0
+; GFX8-NEXT: v_add_u32_e64 v16, s[12:13], 2, v2
+; GFX8-NEXT: v_add_u32_e64 v18, s[14:15], 4, v2
+; GFX8-NEXT: v_addc_u32_e64 v20, s[16:17], 0, v1, s[16:17]
; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 4, v0
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v10, vcc, 6, v0
-; GFX8-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v12, vcc, 8, v0
-; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v1, vcc
-; GFX8-NEXT: v_add_u32_e32 v14, vcc, 10, v0
-; GFX8-NEXT: v_addc_u32_e32 v15, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_ushort v16, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 12, v0
-; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT: flat_load_ushort v17, v[6:7]
-; GFX8-NEXT: flat_load_ushort v18, v[8:9]
-; GFX8-NEXT: flat_load_ushort v19, v[10:11]
-; GFX8-NEXT: flat_load_ushort v20, v[12:13]
-; GFX8-NEXT: flat_load_ushort v21, v[14:15]
-; GFX8-NEXT: flat_load_ushort v22, v[0:1]
-; GFX8-NEXT: v_add_u32_e32 v0, vcc, 2, v2
+; GFX8-NEXT: v_addc_u32_e64 v9, vcc, 0, v1, s[4:5]
+; GFX8-NEXT: v_addc_u32_e64 v11, vcc, 0, v1, s[6:7]
+; GFX8-NEXT: v_addc_u32_e64 v13, vcc, 0, v1, s[8:9]
+; GFX8-NEXT: v_addc_u32_e64 v15, vcc, 0, v1, s[10:11]
+; GFX8-NEXT: flat_load_ushort v21, v[0:1]
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 6, v2
+; GFX8-NEXT: flat_load_ushort v20, v[19:20]
+; GFX8-NEXT: flat_load_ushort v22, v[6:7]
+; GFX8-NEXT: v_addc_u32_e64 v17, s[8:9], 0, v3, s[12:13]
+; GFX8-NEXT: v_addc_u32_e64 v19, s[10:11], 0, v3, s[14:15]
+; GFX8-NEXT: v_add_u32_e64 v6, s[4:5], 8, v2
+; GFX8-NEXT: flat_load_ushort v23, v[8:9]
+; GFX8-NEXT: flat_load_ushort v24, v[10:11]
+; GFX8-NEXT: v_add_u32_e64 v8, s[6:7], 10, v2
+; GFX8-NEXT: flat_load_ushort v12, v[12:13]
+; GFX8-NEXT: flat_load_ushort v13, v[14:15]
+; GFX8-NEXT: v_add_u32_e64 v10, s[8:9], 12, v2
+; GFX8-NEXT: flat_load_ushort v14, v[16:17]
+; GFX8-NEXT: flat_load_ushort v15, v[18:19]
; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
-; GFX8-NEXT: v_add_u32_e32 v6, vcc, 4, v2
-; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc
-; GFX8-NEXT: v_add_u32_e32 v8, vcc, 6, v2
-; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v3, vcc
-; GFX8-NEXT: v_add_u...
[truncated]
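As a rough illustration of what the new getMaxNumAGPRs helper computes, the following standalone C++ sketch mirrors its arithmetic. The subtarget numbers used here (a 512-register file shared by VGPRs and AGPRs, an allocation granule of 8) are assumptions chosen for the example, not values taken from the patch; only the formula follows the helper above.

#include <algorithm>
#include <cstdio>

// Round Value up to the nearest multiple of Align (what llvm::alignTo does).
static unsigned alignTo(unsigned Value, unsigned Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  // Assumed, illustrative subtarget parameters (not taken from the patch).
  const unsigned TotalNumVGPRs = 512;       // register file shared by VGPRs and AGPRs
  const unsigned VGPRAllocGranule = 8;      // allocation granularity
  const unsigned AddressableNumAGPRs = 256; // matches getAddressableNumAGPRs above

  // Same arithmetic as getMaxNumAGPRs: divide the file across waves, round up
  // to the allocation granule, and clamp to the addressable AGPR count.
  for (unsigned WavesPerEU = 1; WavesPerEU <= 8; ++WavesPerEU) {
    unsigned MaxNumAGPRs =
        std::min(alignTo(TotalNumVGPRs / WavesPerEU, VGPRAllocGranule),
                 AddressableNumAGPRs);
    std::printf("WavesPerEU=%u -> MaxNumAGPRs=%u\n", WavesPerEU, MaxNumAGPRs);
  }
  return 0;
}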
@llvm/pr-subscribers-llvm-globalisel

Author: Nicholas Baron (Nicholas-Baron)
Reuse VGPR helpers for AGPR calculations, since in the common case of unified register files they are somewhat interchangeable.
Track AGPRs only if we are not tracking VGPRs. Prioritize AGPR pressure over SGPR pressure.
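Condensed from the GCNSchedStrategy change above, and purely to illustrate that ordering (the boolean names are the ones the patch uses in initCandidate; nothing new is added here):

// Excess-pressure tracking priority described above: VGPRs first, AGPRs only
// when VGPRs are not near their limit, SGPRs only when neither vector file is
// being tracked.
bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
bool ShouldTrackAGPRs = !ShouldTrackVGPRs && AGPRPressure >= AGPRExcessLimit;
bool ShouldTrackSGPRs =
    !ShouldTrackVGPRs && !ShouldTrackAGPRs && SGPRPressure >= SGPRExcessLimit;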
Force-pushed from 5ad6b8c to 500b198
; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v32
; GFX6-NEXT: v_min_i32_e32 v32, 0, v1
; GFX6-NEXT: v_sub_i32_e32 v32, vcc, v31, v32
; GFX6-NEXT: v_sub_i32_e32 v32, vcc, v31, v33
Why are there so many test changes on targets that don't have AGPRs, and in functions that don't use AGPRs?
arsenm left a comment:
Should fix this affecting targets and functions without AGPRs.
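On targets without AGPRs, AGPRExcessLimit is presumably zero, so AGPRPressure >= AGPRExcessLimit holds trivially and can change candidate selection even though no AGPRs exist. One possible way to address this, sketched here under the assumption that gating on GCNSubtarget::hasMAIInsts() is acceptable (this is not part of the patch, and the author may prefer a different fix):

// Hypothetical guard, not in this patch: only consider AGPR pressure on
// subtargets that actually have AGPRs, so scheduling for targets without MAI
// instructions (e.g. the GFX6/GFX8 tests above) is left unchanged.
bool TargetHasAGPRs = ST.hasMAIInsts();
bool ShouldTrackAGPRs =
    TargetHasAGPRs && !ShouldTrackVGPRs && AGPRPressure >= AGPRExcessLimit;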
No description provided.