Skip to content

Commit 6a43bce

Browse files
committed
Scoring system for rematerializations
1 parent 9e0c0a0 commit 6a43bce

33 files changed

+2371
-1490
lines changed

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 471 additions & 269 deletions
Large diffs are not rendered by default.

llvm/lib/Target/AMDGPU/GCNSchedStrategy.h

Lines changed: 191 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
#include "llvm/ADT/MapVector.h"
1919
#include "llvm/CodeGen/MachineInstr.h"
2020
#include "llvm/CodeGen/MachineScheduler.h"
21+
#include <cstdint>
22+
#include <limits>
2123

2224
namespace llvm {
2325

@@ -297,6 +299,8 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
297299

298300
std::unique_ptr<GCNSchedStage> createSchedStage(GCNSchedStageID SchedStageID);
299301

302+
void deleteMI(unsigned RegionIdx, MachineInstr *MI);
303+
300304
public:
301305
GCNScheduleDAGMILive(MachineSchedContext *C,
302306
std::unique_ptr<MachineSchedStrategy> S);
@@ -432,70 +436,209 @@ class ClusteredLowOccStage : public GCNSchedStage {
432436
};
433437

434438
/// Attempts to reduce function spilling or, if there is no spilling, to
435-
/// increase function occupancy by one with respect to ArchVGPR usage by sinking
436-
/// rematerializable instructions to their use. When the stage
437-
/// estimates reducing spilling or increasing occupancy is possible, as few
438-
/// instructions as possible are rematerialized to reduce potential negative
439+
/// increase function occupancy by one with respect to register usage by sinking
440+
/// rematerializable instructions to their use. When the stage estimates that
441+
/// reducing spilling or increasing occupancy is possible, it tries to
442+
/// rematerialize as few registers as possible to reduce potential negative
439443
/// effects on function latency.
440444
class PreRARematStage : public GCNSchedStage {
441445
private:
442-
/// Useful information about a rematerializable instruction.
443-
struct RematInstruction {
444-
/// Single use of the rematerializable instruction's defined register,
445-
/// located in a different block.
446+
/// Groups information about a rematerializable register.
447+
struct RematReg {
448+
/// Single MI defining the rematerializable register.
449+
MachineInstr *DefMI;
450+
/// Single user of the rematerializable register.
446451
MachineInstr *UseMI;
447-
/// Rematerialized version of \p DefMI, set in
448-
/// PreRARematStage::rematerialize. Used for reverting rematerializations.
449-
MachineInstr *RematMI;
450-
/// Set of regions in which the rematerializable instruction's defined
451-
/// register is a live-in.
452-
SmallDenseSet<unsigned, 4> LiveInRegions;
453-
454-
RematInstruction(MachineInstr *UseMI) : UseMI(UseMI) {}
452+
/// Using region.
453+
unsigned UseRegion;
454+
/// Regions in which the register is live-in/live-out/live anywhere.
455+
BitVector LiveIn, LiveOut, Live;
456+
/// The rematerializable register's lane bitmask.
457+
LaneBitmask Mask;
458+
/// Frequency of region defining/using the register. 0 when unknown.
459+
unsigned DefFrequency, UseFrequency;
460+
461+
RematReg(MachineInstr *DefMI, MachineInstr *UseMI,
462+
GCNScheduleDAGMILive &DAG,
463+
const DenseMap<MachineInstr *, unsigned> &MIRegion,
464+
ArrayRef<uint64_t> RegionFreq);
465+
466+
/// Returns whether the regions at which the register is live intersect
467+
/// with the \p Target regions.
468+
bool intersectWithTarget(BitVector Target) const {
469+
Target &= Live;
470+
return Target.any();
471+
}
472+
473+
/// Returns whether it is always beneficial to rematerialize this register.
474+
bool isAlwaysBeneficial() const {
475+
// When the using region is executed a single time, we know
476+
// rematerializing will be beneficial whatever the defining region's
477+
// frequency.
478+
if (UseFrequency == 1)
479+
return true;
480+
// When there is uncertainty on the defining or using frequency, we err on
481+
// the conservative side and do not consider the rematerialization always
482+
// beneficial.
483+
if (!DefFrequency || !UseFrequency)
484+
return false;
485+
return UseFrequency <= DefFrequency;
486+
}
487+
488+
/// Determines whether rematerializing the register is guaranteed to reduce
489+
/// pressure in the region.
490+
bool isBeneficialRegion(unsigned I) const {
491+
assert(I < Live.size() && "region index out of range");
492+
return LiveIn[I] && LiveOut[I] && I != UseRegion;
493+
}
494+
495+
/// Determines whether rematerializing the register can but is not
496+
/// guaranteed to reduce pressure in the region.
497+
bool isMaybeBeneficialRegion(unsigned I) const {
498+
assert(I < Live.size() && "region index out of range");
499+
return Live[I] && !isBeneficialRegion(I);
500+
}
501+
502+
/// Updates internal structures following a MI rematerialization. Part of
503+
/// the stage instead of the DAG because it makes assumptions that are
504+
/// specific to the rematerialization process.
505+
MachineInstr *insertMI(unsigned RegionIdx,
506+
MachineBasicBlock::iterator InsertPos,
507+
GCNScheduleDAGMILive &DAG) const;
508+
509+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
510+
void print(const DenseMap<MachineInstr *, unsigned> &MIRegion) const;
511+
#endif
512+
};
513+
514+
/// A scored rematerializable register. Higher scores indicate more beneficial
515+
/// rematerializations. Non-positive scores indicate the rematerialization is
516+
/// not helpful to reduce RP in target regions.
517+
struct ScoredRemat {
518+
/// The rematerializable register under consideration.
519+
const RematReg *Remat;
520+
521+
/// This only initializes state-independent characteristics of \p Remat, not
522+
/// the actual score.
523+
ScoredRemat(const RematReg *Remat, const GCNSubtarget &ST,
524+
const TargetInstrInfo &TII);
525+
526+
/// Updates the rematerialization's score w.r.t. the current \p RPTargets.
527+
/// \p RegionFreq indicates the frequency of each region.
528+
void update(const BitVector &TargetRegions, ArrayRef<GCNRPTarget> RPTargets,
529+
ArrayRef<uint64_t> RegionFreq, bool ReduceSpill);
530+
531+
int getScore() const { return Score; }
532+
533+
bool operator<(const ScoredRemat &O) const { return Score < O.Score; }
534+
bool operator==(const ScoredRemat &O) const { return Score == O.Score; }
535+
536+
private:
537+
/// Estimated save/restore latency costs for spilling a register to stack.
538+
/// FIXME: These numbers are very arbitrary. Need a good rationale for them,
539+
/// which I don't know where to get from.
540+
static constexpr int SaveCost = 100, RestoreCost = 100;
541+
/// Per-region contribution weights to RP score depending on whether RP is
542+
/// guaranteed or only likely to be reduced in the region. Only their
543+
/// relative value w.r.t. one another matters.
544+
static constexpr int WeightRP = 10, WeightRPMaybe = 5;
545+
546+
/// Latency gain induced by rematerializing the instruction. Does not
547+
/// include estimated spilling cost of *not* rematerializing (save/restore
548+
/// to/from stack).
549+
std::optional<int> InstrLatencyGain = std::nullopt;
550+
551+
using ScoreTy = int32_t;
552+
/// Overall rematerialization score. Scoring components are mapped to bit
553+
/// ranges in the overall score.
554+
///
555+
/// [31:1] : estimated RP reduction score
556+
/// [0] : known latency gain
557+
ScoreTy Score;
558+
559+
void resetScore() { Score = 0; }
560+
561+
void setUselessRemat() { Score = std::numeric_limits<ScoreTy>::min(); }
562+
563+
void setKnownLatencyGain() { Score |= 1; }
564+
565+
void setRPScore(unsigned RPScore) {
566+
Score |= static_cast<ScoreTy>(RPScore) << 1;
567+
}
455568
};
456569

457-
/// Maps all MIs to their parent region. MI terminators are considered to be
458-
/// outside the region they delimitate, and as such are not stored in the map.
570+
/// Maps all MIs (except lone terminators, which are not part of any region)
571+
/// to their parent region. Non-lone terminators are considered part of the
572+
/// region they delimitate.
459573
DenseMap<MachineInstr *, unsigned> MIRegion;
460574
/// Parent MBB to each region, in region order.
461575
SmallVector<MachineBasicBlock *> RegionBB;
462-
/// Collects instructions to rematerialize.
463-
MapVector<MachineInstr *, RematInstruction> Rematerializations;
464-
/// Collects regions whose live-ins or register pressure will change due to
465-
/// rematerializations.
466-
DenseMap<unsigned, GCNRegPressure> ImpactedRegions;
467-
/// In case we need to rollback rematerializations, save lane masks for all
468-
/// rematerialized registers in all regions in which they are live-ins.
469-
DenseMap<std::pair<unsigned, Register>, LaneBitmask> RegMasks;
470-
/// After successful stage initialization, indicates which regions should be
471-
/// rescheduled.
472-
BitVector RescheduleRegions;
473-
/// The target occupancy the stage is trying to achieve. Empty when the
576+
577+
/// Register pressure targets for all regions.
578+
SmallVector<GCNRPTarget> RPTargets;
579+
/// Regions which are above the stage's RP target.
580+
BitVector TargetRegions;
581+
/// The target occupancy the stage is trying to achieve. Empty when the
474582
/// objective is spilling reduction.
475583
std::optional<unsigned> TargetOcc;
476584
/// Achieved occupancy *only* through rematerializations (pre-rescheduling).
477-
/// Smaller than or equal to the target occupancy.
585+
/// Smaller than or equal to the target occupancy, when it is defined.
478586
unsigned AchievedOcc;
479587

480-
/// Returns whether remat can reduce spilling or increase function occupancy
481-
/// by 1 through rematerialization. If it can do one, collects instructions in
482-
/// PreRARematStage::Rematerializations and sets the target occupancy in
483-
/// PreRARematStage::TargetOccupancy.
484-
bool canIncreaseOccupancyOrReduceSpill();
588+
/// List of rematerializable registers.
589+
SmallVector<RematReg, 16> RematRegs;
590+
591+
using RollbackReg = std::pair<MachineInstr *, const RematReg *>;
592+
/// List of rematerializations to rollback if rematerialization does not end
593+
/// up being beneficial. Each element pairs the MI created during
594+
/// rematerialization to the original rematerializable register.
595+
SmallVector<RollbackReg> Rollbackable;
596+
597+
/// After successful stage initialization, indicates which regions should be
598+
/// rescheduled.
599+
BitVector RescheduleRegions;
600+
601+
/// Determines the stage's objective (increasing occupancy or reducing
602+
/// spilling, set in \ref TargetOcc). Defines \ref RPTargets in all regions to
603+
/// achieve that objective and marks those that don't achieve it in \ref
604+
/// TargetRegions.
605+
void setObjective();
606+
607+
/// Unsets target regions in \p Regions whose RP target has been reached.
608+
void unsetSatisifedRPTargets(const BitVector &Regions);
609+
610+
/// Fully recomputes RP from the DAG in \p Regions. Among those regions, sets
611+
/// again all \ref TargetRegions that were optimistically marked as satisfied
612+
/// but are actually not, and returns whether there were any such regions.
613+
bool updateAndVerifyRPTargets(const BitVector &Regions);
614+
615+
/// Collects all rematerializable registers and appends them to \ref
616+
/// RematRegs. \p RegionFreq contains the frequency of each region, 0
617+
/// indicating an unknown frequency. Returns whether any rematerializable
618+
/// register was found.
619+
bool collectRematRegs(ArrayRef<uint64_t> RegionFreq);
620+
621+
/// Rematerializes \p Remat. This removes the rematerialized register from
622+
/// live-in/out lists in the DAG and updates RP targets in all affected
623+
/// regions, which are also marked in \ref RescheduleRegions. Regions in which
624+
/// RP savings are not guaranteed are set in \p RecomputeRP. Returns the newly
625+
/// created MI.
626+
MachineInstr *rematerialize(const RematReg &Remat, BitVector &RecomputeRP);
627+
628+
/// Rolls back rematerialization \p Rollback.
629+
void rollback(const RollbackReg &Rollback) const;
485630

486631
/// Whether the MI is rematerializable
487632
bool isReMaterializable(const MachineInstr &MI);
488633

489-
/// Rematerializes all instructions in PreRARematStage::Rematerializations
490-
/// and stores the achieved occupancy after remat in
491-
/// PreRARematStage::AchievedOcc.
492-
void rematerialize();
493-
494634
/// If remat alone did not increase occupancy to the target one, rollbacks all
495635
/// rematerializations and resets live-ins/RP in all regions impacted by the
496636
/// stage to their pre-stage values.
497637
void finalizeGCNSchedStage() override;
498638

639+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
640+
void printTargetRegions(bool PrintAll = false) const;
641+
#endif
499642
public:
500643
bool initGCNSchedStage() override;
501644

@@ -504,7 +647,13 @@ class PreRARematStage : public GCNSchedStage {
504647
bool shouldRevertScheduling(unsigned WavesAfter) override;
505648

506649
PreRARematStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
507-
: GCNSchedStage(StageID, DAG), RescheduleRegions(DAG.Regions.size()) {}
650+
: GCNSchedStage(StageID, DAG), TargetRegions(DAG.Regions.size()),
651+
RescheduleRegions(DAG.Regions.size()) {
652+
const unsigned NumRegions = DAG.Regions.size();
653+
RPTargets.reserve(NumRegions);
654+
RegionBB.reserve(NumRegions);
655+
MIRegion.reserve(MF.getInstructionCount());
656+
}
508657
};
509658

510659
class ILPInitialScheduleStage : public GCNSchedStage {

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -885,7 +885,6 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
885885
; SI-NEXT: s_mov_b64 s[0:1], exec
886886
; SI-NEXT: s_wqm_b64 exec, exec
887887
; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
888-
; SI-NEXT: s_mov_b32 s4, 0
889888
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
890889
; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc
891890
; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
@@ -894,10 +893,11 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
894893
; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
895894
; SI-NEXT: s_cbranch_scc0 .LBB7_9
896895
; SI-NEXT: ; %bb.2: ; %.demote0
897-
; SI-NEXT: s_wqm_b64 s[6:7], s[0:1]
898-
; SI-NEXT: s_and_b64 exec, exec, s[6:7]
896+
; SI-NEXT: s_wqm_b64 s[4:5], s[0:1]
897+
; SI-NEXT: s_and_b64 exec, exec, s[4:5]
899898
; SI-NEXT: .LBB7_3: ; %.continue0.preheader
900899
; SI-NEXT: s_or_b64 exec, exec, s[2:3]
900+
; SI-NEXT: s_mov_b32 s4, 0
901901
; SI-NEXT: s_mov_b64 s[2:3], 0
902902
; SI-NEXT: v_mov_b32_e32 v0, s4
903903
; SI-NEXT: s_branch .LBB7_5
@@ -951,7 +951,6 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
951951
; GFX9-NEXT: s_mov_b64 s[0:1], exec
952952
; GFX9-NEXT: s_wqm_b64 exec, exec
953953
; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
954-
; GFX9-NEXT: s_mov_b32 s4, 0
955954
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
956955
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
957956
; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
@@ -960,10 +959,11 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
960959
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
961960
; GFX9-NEXT: s_cbranch_scc0 .LBB7_9
962961
; GFX9-NEXT: ; %bb.2: ; %.demote0
963-
; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1]
964-
; GFX9-NEXT: s_and_b64 exec, exec, s[6:7]
962+
; GFX9-NEXT: s_wqm_b64 s[4:5], s[0:1]
963+
; GFX9-NEXT: s_and_b64 exec, exec, s[4:5]
965964
; GFX9-NEXT: .LBB7_3: ; %.continue0.preheader
966965
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
966+
; GFX9-NEXT: s_mov_b32 s4, 0
967967
; GFX9-NEXT: s_mov_b64 s[2:3], 0
968968
; GFX9-NEXT: v_mov_b32_e32 v0, s4
969969
; GFX9-NEXT: s_branch .LBB7_5
@@ -1080,7 +1080,6 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
10801080
; GFX10-64-NEXT: s_mov_b64 s[0:1], exec
10811081
; GFX10-64-NEXT: s_wqm_b64 exec, exec
10821082
; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0
1083-
; GFX10-64-NEXT: s_mov_b32 s4, 0
10841083
; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
10851084
; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc
10861085
; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
@@ -1089,11 +1088,12 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
10891088
; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
10901089
; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_9
10911090
; GFX10-64-NEXT: ; %bb.2: ; %.demote0
1092-
; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1]
1093-
; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7]
1091+
; GFX10-64-NEXT: s_wqm_b64 s[4:5], s[0:1]
1092+
; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5]
10941093
; GFX10-64-NEXT: .LBB7_3: ; %.continue0.preheader
10951094
; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3]
1096-
; GFX10-64-NEXT: v_mov_b32_e32 v0, s4
1095+
; GFX10-64-NEXT: s_mov_b32 s2, 0
1096+
; GFX10-64-NEXT: v_mov_b32_e32 v0, s2
10971097
; GFX10-64-NEXT: s_mov_b64 s[2:3], 0
10981098
; GFX10-64-NEXT: s_branch .LBB7_5
10991099
; GFX10-64-NEXT: .LBB7_4: ; %.continue1

0 commit comments

Comments
 (0)