Skip to content

Commit 5137519

Browse files
committed
Refactor scoring system + Remove always benef/latency calc
- Change components of scoring system. - Remove rematerialization of always beneficial registers. - Remove latency calculation in scoring. - Other small changes.
1 parent 846238b commit 5137519

30 files changed

+1292
-1297
lines changed

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 180 additions & 202 deletions
Large diffs are not rendered by default.

llvm/lib/Target/AMDGPU/GCNSchedStrategy.h

Lines changed: 87 additions & 97 deletions
Original file line numberDiff line numberDiff line change
@@ -287,14 +287,6 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
287287
// Compute and cache live-ins and pressure for all regions in block.
288288
void computeBlockPressure(unsigned RegionIdx, const MachineBasicBlock *MBB);
289289

290-
/// If necessary, updates a region's boundaries following insertion ( \p NewMI
291-
/// != nullptr) or removal ( \p NewMI == nullptr) of a \p MI in the region.
292-
/// For an MI removal, this must be called before the MI is actually erased
293-
/// from its parent MBB.
294-
void updateRegionBoundaries(RegionBoundaries &RegionBounds,
295-
MachineBasicBlock::iterator MI,
296-
MachineInstr *NewMI);
297-
298290
void runSchedStages();
299291

300292
std::unique_ptr<GCNSchedStage> createSchedStage(GCNSchedStageID SchedStageID);
@@ -462,88 +454,66 @@ class PreRARematStage : public GCNSchedStage {
462454
MachineInstr *DefMI;
463455
/// Single user of the rematerializable register.
464456
MachineInstr *UseMI;
465-
/// Using region.
466-
unsigned UseRegion;
467457
/// Regions in which the register is live-in/live-out/live anywhere.
468458
BitVector LiveIn, LiveOut, Live;
469459
/// The rematerializable register's lane bitmask.
470460
LaneBitmask Mask;
471-
/// Frequency of region defining/using the register. 0 when unknown.
472-
unsigned DefFrequency, UseFrequency;
461+
/// Defining and using regions.
462+
unsigned DefRegion, UseRegion;
463+
/// Frequency of defining/using regions. 0 when unknown.
464+
uint64_t DefFrequency, UseFrequency;
473465

474466
RematReg(MachineInstr *DefMI, MachineInstr *UseMI,
475467
GCNScheduleDAGMILive &DAG,
476468
const DenseMap<MachineInstr *, unsigned> &MIRegion,
477469
ArrayRef<uint64_t> RegionFreq);
478470

479-
/// Returns whether the regions at which the register is live intersects
480-
/// with the \p Target regions.
481-
bool intersectWithTarget(BitVector Target) const {
482-
Target &= Live;
483-
return Target.any();
484-
}
471+
/// Returns the rematerializable register. Do not call after deleting the
472+
/// original defining instruction.
473+
Register getReg() const { return DefMI->getOperand(0).getReg(); }
485474

486-
/// Returns whether is is always beneficial to rematerialize this register.
487-
/// These are rematerializations that never move instructions into higher
488-
/// frequency regions and at least shorten live intervals, so they are
489-
/// always useful irrespective of RP targets.
490-
bool isAlwaysBeneficial() const {
491-
// When the using region is executed a single time, we know
492-
// rematerializing will be beneficial whatever the defining region's
493-
// frequency.
494-
if (UseFrequency == 1)
495-
return true;
496-
// When there is uncertainty on the defining or using frequency, we err on
497-
// the conservative side and do not consider the rematerialization always
498-
// beneficial.
499-
if (!DefFrequency || !UseFrequency)
500-
return false;
501-
return UseFrequency <= DefFrequency;
502-
}
475+
/// Determines whether this rematerialization may be beneficial in at least
476+
/// one target region.
477+
bool maybeBeneficial(const BitVector &TargetRegions,
478+
ArrayRef<GCNRPTarget> RPTargets) const;
503479

504-
/// Determines whether rematerializing the register is guaranteed to reduce
505-
/// pressure in the region.
506-
bool isBeneficialRegion(unsigned I) const {
480+
/// Determines if the register is both unused and live-through in region \p
481+
/// I. This guarantees that rematerializing it will reduce RP in the region.
482+
bool isUnusedLiveThrough(unsigned I) const {
507483
assert(I < Live.size() && "region index out of range");
508484
return LiveIn[I] && LiveOut[I] && I != UseRegion;
509485
}
510486

511-
/// Determines whether rematerializing the register can but is not
512-
/// guaranteed to reduce pressure in the region.
513-
bool isMaybeBeneficialRegion(unsigned I) const {
514-
assert(I < Live.size() && "region index out of range");
515-
return Live[I] && !isBeneficialRegion(I);
516-
}
517-
518487
/// Updates internal structures following a MI rematerialization. Part of
519488
/// the stage instead of the DAG because it makes assumptions that are
520489
/// specific to the rematerialization process.
521-
MachineInstr *insertMI(unsigned RegionIdx,
522-
MachineBasicBlock::iterator InsertPos,
523-
GCNScheduleDAGMILive &DAG) const;
490+
void insertMI(unsigned RegionIdx, MachineInstr *RematMI,
491+
GCNScheduleDAGMILive &DAG) const;
524492

525493
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
526-
void print(const DenseMap<MachineInstr *, unsigned> &MIRegion) const;
494+
void print() const;
527495
#endif
528496
};
529497

530498
/// A scored rematerializable register. Higher scores indicate more beneficial
531-
/// rematerializations. Non-positive scores indicate the rematerialization is
499+
/// rematerializations. A null score indicates the rematerialization is
532500
/// not helpful to reduce RP in target regions.
533501
struct ScoredRemat {
534502
/// The rematerializable register under consideration.
535503
const RematReg *Remat;
536504

537505
/// This only initializes state-independent characteristics of \p Remat, not
538506
/// the actual score.
539-
ScoredRemat(const RematReg *Remat, const GCNScheduleDAGMILive &DAG);
507+
ScoredRemat(const RematReg *Remat, uint64_t MinFreq, uint64_t MaxFreq,
508+
const GCNScheduleDAGMILive &DAG);
540509

541510
/// Updates the rematerialization's score w.r.t. the current \p RPTargets.
542511
/// \p RegionFreq indicates the frequency of each region.
543512
void update(const BitVector &TargetRegions, ArrayRef<GCNRPTarget> RPTargets,
544513
ArrayRef<uint64_t> RegionFreq, bool ReduceSpill);
545514

546-
int getScore() const { return Score; }
515+
/// Returns whether the current score is null.
516+
bool hasNullScore() const { return !Score; }
547517

548518
bool operator<(const ScoredRemat &O) const {
549519
// Break ties using pointer to rematerializable register. Since
@@ -554,49 +524,68 @@ class PreRARematStage : public GCNSchedStage {
554524
return Score < O.Score;
555525
}
556526

527+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
528+
void print() const;
529+
#endif
530+
557531
private:
558-
/// Per-region contribution weights to RP score depending on whether RP is
559-
/// guaranteed or only likely to be reduced in the region. Only their
560-
/// relative value w.r.t. one another matter.
561-
static constexpr int WeightRP = 2, WeightRPMaybe = 1;
532+
/// Bitwidths for score components.
533+
static constexpr unsigned MaxFreqWidth = 32, FreqDiffWidth = 16,
534+
RegionImpactWidth = 16;
562535

563536
/// Number of 32-bit registers this rematerialization covers.
564537
const unsigned NumRegs;
565-
/// Latency gain induced by rematerializing the register over spilling its
566-
/// defining instruction.
567-
const int RematLatencyGainOverSpill;
568-
569-
/// Whether we can estimate the latency gain of rematerialazing over
570-
/// spilling; this requires knowing defining/using region frequencies.
571-
bool hasUnknownLatencyGain() const {
572-
return !Remat->DefFrequency || !Remat->UseFrequency;
573-
}
538+
/// Frequency difference between defining and using regions, normalized to
539+
/// the maximum possible difference and rescaled to the representable range
540+
/// in the score.
541+
const uint64_t FreqDiff;
574542

575-
using ScoreTy = int32_t;
543+
using ScoreTy = uint64_t;
576544
/// Overall rematerialization score. Scoring components are mapped to bit
577545
/// ranges in the overall score.
578546
///
579-
/// [31:1] : estimated RP reduction score
580-
/// [0] : known latency gain
581-
ScoreTy Score;
547+
/// [63:32] : maximum frequency in benefiting target region (spilling only)
548+
/// [31:16] : frequency difference between defining and using region
549+
/// [15: 0] : number of benefiting regions times register size
550+
ScoreTy Score = 0;
582551

583-
void resetScore() { Score = 0; }
552+
void setNullScore() { Score = 0; }
584553

585-
void setUselessRemat() { Score = std::numeric_limits<ScoreTy>::min(); }
554+
void setMaxFreqScore(ScoreTy MaxFreq) {
555+
MaxFreq = std::min(
556+
static_cast<ScoreTy>(std::numeric_limits<uint32_t>::max()), MaxFreq);
557+
Score |= MaxFreq << (FreqDiffWidth + RegionImpactWidth);
558+
}
586559

587-
void setKnownLatencyGain() { Score |= 1; }
560+
void setFreqDiffScore(ScoreTy FreqDiff) {
561+
FreqDiff = std::min(
562+
static_cast<ScoreTy>(std::numeric_limits<uint16_t>::max()), FreqDiff);
563+
Score |= FreqDiff << RegionImpactWidth;
564+
}
588565

589-
void setRPScore(ScoreTy RPScore) { Score |= RPScore << 1; }
566+
void setRegionImpactScore(ScoreTy RegionImpact) {
567+
RegionImpact =
568+
std::min(static_cast<ScoreTy>(std::numeric_limits<uint16_t>::max()),
569+
RegionImpact);
570+
Score |= RegionImpact;
571+
}
590572

591573
unsigned getNumRegs(const GCNScheduleDAGMILive &DAG) const;
592574

593-
unsigned getLatencyGain(const GCNScheduleDAGMILive &DAG) const;
575+
uint64_t getFreqDiff(uint64_t MinFreq, uint64_t MaxFreq) const;
576+
};
577+
578+
/// Holds enough information to rollback a rematerialization decision post
579+
/// re-scheduling.
580+
struct RollbackInfo {
581+
/// The rematerializable register under consideration.
582+
const RematReg *Remat;
583+
/// The rematerialized MI replacing the original defining MI.
584+
MachineInstr *RematMI;
585+
586+
RollbackInfo(const RematReg *Remat) : Remat(Remat) {}
594587
};
595588

596-
/// Maps all MIs (except lone terminators, which are not part of any region)
597-
/// to their parent region. Non-lone terminators are considered part of the
598-
/// region they delimitate.
599-
DenseMap<MachineInstr *, unsigned> MIRegion;
600589
/// Parent MBB to each region, in region order.
601590
SmallVector<MachineBasicBlock *> RegionBB;
602591

@@ -613,22 +602,18 @@ class PreRARematStage : public GCNSchedStage {
613602

614603
/// List of rematerializable registers.
615604
SmallVector<RematReg, 16> RematRegs;
616-
617-
using RollbackReg = std::pair<MachineInstr *, const RematReg *>;
618605
/// List of rematerializations to rollback if rematerialization does not end
619-
/// up being beneficial. Each element pairs the MI created during
620-
/// rematerialization to the original rematerializable register.
621-
SmallVector<RollbackReg> Rollbackable;
622-
606+
/// up being beneficial.
607+
SmallVector<RollbackInfo> Rollbacks;
623608
/// After successful stage initialization, indicates which regions should be
624609
/// rescheduled.
625610
BitVector RescheduleRegions;
626611

627612
/// Determines the stage's objective (increasing occupancy or reducing
628613
/// spilling, set in \ref TargetOcc). Defines \ref RPTargets in all regions to
629614
/// achieve that objective and mark those that don't achieve it in \ref
630-
/// TargetRegions.
631-
void setObjective();
615+
/// TargetRegions. Returns whether there is any target region.
616+
bool setObjective();
632617

633618
/// Unsets target regions in \p Regions whose RP target has been reached.
634619
void unsetSatisifedRPTargets(const BitVector &Regions);
@@ -639,20 +624,26 @@ class PreRARematStage : public GCNSchedStage {
639624
bool updateAndVerifyRPTargets(const BitVector &Regions);
640625

641626
/// Collects all rematerializable registers and appends them to \ref
642-
/// RematRegs. \p RegionFreq contains the frequency of each region, 0
643-
/// indicating an unknown frequency. Returns whether any rematerializable
644-
/// register was found.
645-
bool collectRematRegs(ArrayRef<uint64_t> RegionFreq);
627+
/// RematRegs. \p MIRegion maps MIs to their region and \p RegionFreq contains
628+
/// the frequency of each region, 0 indicating an unknown frequency. Returns
629+
/// whether any rematerializable register was found.
630+
bool collectRematRegs(const DenseMap<MachineInstr *, unsigned> &MIRegion,
631+
ArrayRef<uint64_t> RegionFreq);
646632

647633
/// Rematerializes \p Remat. This removes the rematerialized register from
648634
/// live-in/out lists in the DAG and updates RP targets in all affected
649635
/// regions, which are also marked in \ref RescheduleRegions. Regions in which
650-
/// RP savings are not guaranteed are set in \p RecomputeRP. Returns the newly
651-
/// created MI.
652-
MachineInstr *rematerialize(const RematReg &Remat, BitVector &RecomputeRP);
653-
654-
/// Rollbacks rematerialization \p Rollback.
655-
void rollback(const RollbackReg &Rollback) const;
636+
/// RP savings are not guaranteed are set in \p RecomputeRP. When \p Rollback
637+
/// is non-null, fills it with required information to be able to rollback the
638+
/// rematerialization post-rescheduling.
639+
void rematerialize(const RematReg &Remat, BitVector &RecomputeRP,
640+
RollbackInfo *Rollback);
641+
642+
/// Rolls back the rematerialization decision represented by \p Rollback. This
643+
/// updates live-in/out lists in the DAG but does not update cached register
644+
/// pressures. Regions in which RP may be impacted are marked in \p
645+
/// RecomputeRP.
646+
void rollback(const RollbackInfo &Rollback, BitVector &RecomputeRP) const;
656647

657648
/// Whether the MI is rematerializable
658649
bool isReMaterializable(const MachineInstr &MI);
@@ -678,7 +669,6 @@ class PreRARematStage : public GCNSchedStage {
678669
const unsigned NumRegions = DAG.Regions.size();
679670
RPTargets.reserve(NumRegions);
680671
RegionBB.reserve(NumRegions);
681-
MIRegion.reserve(MF.getInstructionCount());
682672
}
683673
};
684674

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.wqm.demote.ll

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -885,6 +885,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
885885
; SI-NEXT: s_mov_b64 s[0:1], exec
886886
; SI-NEXT: s_wqm_b64 exec, exec
887887
; SI-NEXT: v_cvt_i32_f32_e32 v0, v0
888+
; SI-NEXT: s_mov_b32 s4, 0
888889
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
889890
; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc
890891
; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
@@ -893,11 +894,10 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
893894
; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
894895
; SI-NEXT: s_cbranch_scc0 .LBB7_9
895896
; SI-NEXT: ; %bb.2: ; %.demote0
896-
; SI-NEXT: s_wqm_b64 s[4:5], s[0:1]
897-
; SI-NEXT: s_and_b64 exec, exec, s[4:5]
897+
; SI-NEXT: s_wqm_b64 s[6:7], s[0:1]
898+
; SI-NEXT: s_and_b64 exec, exec, s[6:7]
898899
; SI-NEXT: .LBB7_3: ; %.continue0.preheader
899900
; SI-NEXT: s_or_b64 exec, exec, s[2:3]
900-
; SI-NEXT: s_mov_b32 s4, 0
901901
; SI-NEXT: s_mov_b64 s[2:3], 0
902902
; SI-NEXT: v_mov_b32_e32 v0, s4
903903
; SI-NEXT: s_branch .LBB7_5
@@ -951,6 +951,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
951951
; GFX9-NEXT: s_mov_b64 s[0:1], exec
952952
; GFX9-NEXT: s_wqm_b64 exec, exec
953953
; GFX9-NEXT: v_cvt_i32_f32_e32 v0, v0
954+
; GFX9-NEXT: s_mov_b32 s4, 0
954955
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
955956
; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
956957
; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
@@ -959,11 +960,10 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
959960
; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
960961
; GFX9-NEXT: s_cbranch_scc0 .LBB7_9
961962
; GFX9-NEXT: ; %bb.2: ; %.demote0
962-
; GFX9-NEXT: s_wqm_b64 s[4:5], s[0:1]
963-
; GFX9-NEXT: s_and_b64 exec, exec, s[4:5]
963+
; GFX9-NEXT: s_wqm_b64 s[6:7], s[0:1]
964+
; GFX9-NEXT: s_and_b64 exec, exec, s[6:7]
964965
; GFX9-NEXT: .LBB7_3: ; %.continue0.preheader
965966
; GFX9-NEXT: s_or_b64 exec, exec, s[2:3]
966-
; GFX9-NEXT: s_mov_b32 s4, 0
967967
; GFX9-NEXT: s_mov_b64 s[2:3], 0
968968
; GFX9-NEXT: v_mov_b32_e32 v0, s4
969969
; GFX9-NEXT: s_branch .LBB7_5
@@ -1080,6 +1080,7 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
10801080
; GFX10-64-NEXT: s_mov_b64 s[0:1], exec
10811081
; GFX10-64-NEXT: s_wqm_b64 exec, exec
10821082
; GFX10-64-NEXT: v_cvt_i32_f32_e32 v0, v0
1083+
; GFX10-64-NEXT: s_mov_b32 s4, 0
10831084
; GFX10-64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
10841085
; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc
10851086
; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3]
@@ -1088,12 +1089,11 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index
10881089
; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec
10891090
; GFX10-64-NEXT: s_cbranch_scc0 .LBB7_9
10901091
; GFX10-64-NEXT: ; %bb.2: ; %.demote0
1091-
; GFX10-64-NEXT: s_wqm_b64 s[4:5], s[0:1]
1092-
; GFX10-64-NEXT: s_and_b64 exec, exec, s[4:5]
1092+
; GFX10-64-NEXT: s_wqm_b64 s[6:7], s[0:1]
1093+
; GFX10-64-NEXT: s_and_b64 exec, exec, s[6:7]
10931094
; GFX10-64-NEXT: .LBB7_3: ; %.continue0.preheader
10941095
; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3]
1095-
; GFX10-64-NEXT: s_mov_b32 s2, 0
1096-
; GFX10-64-NEXT: v_mov_b32_e32 v0, s2
1096+
; GFX10-64-NEXT: v_mov_b32_e32 v0, s4
10971097
; GFX10-64-NEXT: s_mov_b64 s[2:3], 0
10981098
; GFX10-64-NEXT: s_branch .LBB7_5
10991099
; GFX10-64-NEXT: .LBB7_4: ; %.continue1

0 commit comments

Comments
 (0)