797 changes: 504 additions & 293 deletions llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Large diffs are not rendered by default.

249 changes: 200 additions & 49 deletions llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -18,6 +18,8 @@
#include "llvm/ADT/MapVector.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include <cstdint>
#include <limits>

namespace llvm {

@@ -300,18 +302,12 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
// Compute and cache live-ins and pressure for all regions in block.
void computeBlockPressure(unsigned RegionIdx, const MachineBasicBlock *MBB);

/// If necessary, updates a region's boundaries following insertion ( \p NewMI
/// != nullptr) or removal ( \p NewMI == nullptr) of a \p MI in the region.
/// For an MI removal, this must be called before the MI is actually erased
/// from its parent MBB.
void updateRegionBoundaries(RegionBoundaries &RegionBounds,
MachineBasicBlock::iterator MI,
MachineInstr *NewMI);

void runSchedStages();

std::unique_ptr<GCNSchedStage> createSchedStage(GCNSchedStageID SchedStageID);

void deleteMI(unsigned RegionIdx, MachineInstr *MI);

public:
GCNScheduleDAGMILive(MachineSchedContext *C,
std::unique_ptr<MachineSchedStrategy> S);
@@ -447,65 +443,215 @@ class ClusteredLowOccStage : public GCNSchedStage {
};

/// Attempts to reduce function spilling or, if there is no spilling, to
/// increase function occupancy by one with respect to ArchVGPR usage by sinking
/// rematerializable instructions to their use. When the stage
/// estimates reducing spilling or increasing occupancy is possible, as few
/// instructions as possible are rematerialized to reduce potential negative
/// increase function occupancy by one with respect to register usage by sinking
/// rematerializable instructions to their use. When the stage estimates that
/// reducing spilling or increasing occupancy is possible, it tries to
/// rematerialize as few registers as possible to reduce potential negative
/// effects on function latency.
///
/// The stage only supports rematerializing registers that meet all of the
/// following constraints.
/// 1. The register is virtual and has a single defining instruction.
/// 2. The single defining instruction is either deemed rematerializable by the
/// target-independent logic, or if not, has no non-constant and
/// non-ignorable physical register use.
/// 3. The register has no virtual register use whose live range would be
/// extended by the rematerialization.
/// 4. The register has a single non-debug user in a different region from its
/// defining region.
/// 5. The register neither uses nor is used by another register that is going
/// to be rematerialized.
class PreRARematStage : public GCNSchedStage {
private:
/// Useful information about a rematerializable instruction.
struct RematInstruction {
/// Single use of the rematerializable instruction's defined register,
/// located in a different block.
/// A rematerializable register.
struct RematReg {
/// Single MI defining the rematerializable register.
MachineInstr *DefMI;
/// Single user of the rematerializable register.
MachineInstr *UseMI;
/// Rematerialized version of \p DefMI, set in
/// PreRARematStage::rematerialize. Used for reverting rematerializations.
/// Regions in which the register is live-in/live-out/live anywhere.
BitVector LiveIn, LiveOut, Live;
/// The rematerializable register's lane bitmask.
LaneBitmask Mask;
/// Defining and using regions.
unsigned DefRegion, UseRegion;

RematReg(MachineInstr *DefMI, MachineInstr *UseMI,
GCNScheduleDAGMILive &DAG,
const DenseMap<MachineInstr *, unsigned> &MIRegion);

/// Returns the rematerializable register. Do not call after deleting the
/// original defining instruction.
Register getReg() const { return DefMI->getOperand(0).getReg(); }

/// Determines whether this rematerialization may be beneficial in at least
/// one target region.
bool maybeBeneficial(const BitVector &TargetRegions,
ArrayRef<GCNRPTarget> RPTargets) const;

/// Determines if the register is both unused and live-through in region \p
/// I. This guarantees that rematerializing it will reduce RP in the region.
bool isUnusedLiveThrough(unsigned I) const {
assert(I < Live.size() && "region index out of range");
return LiveIn[I] && LiveOut[I] && I != UseRegion;
}

/// Updates internal structures following an MI rematerialization. Part of
/// the stage instead of the DAG because it makes assumptions that are
/// specific to the rematerialization process.
void insertMI(unsigned RegionIdx, MachineInstr *RematMI,
GCNScheduleDAGMILive &DAG) const;
};

/// A scored rematerialization candidate. Higher scores indicate more
/// beneficial rematerializations. A null score indicates that the
/// rematerialization does not help reduce RP in target regions.
struct ScoredRemat {
/// The rematerializable register under consideration.
const RematReg *Remat;

/// Execution frequency information required by scoring heuristics.
struct FreqInfo {
/// Per-region execution frequencies, normalized to minimum observed
/// frequency. 0 when unknown.
SmallVector<uint64_t> Regions;
/// Maximum observed frequency, normalized to minimum observed frequency.
uint64_t MaxFreq = 0;

FreqInfo(MachineFunction &MF, const GCNScheduleDAGMILive &DAG);
};

/// This only initializes state-independent characteristics of \p Remat, not
/// the actual score.
ScoredRemat(const RematReg *Remat, const FreqInfo &Freq,
const GCNScheduleDAGMILive &DAG);

/// Updates the rematerialization's score w.r.t. the current \p RPTargets.
/// \p Freq provides per-region execution frequency information.
void update(const BitVector &TargetRegions, ArrayRef<GCNRPTarget> RPTargets,
const FreqInfo &Freq, bool ReduceSpill);

/// Returns whether the current score is null, indicating the
/// rematerialization is useless.
bool hasNullScore() const { return !MaxFreq && !RegionImpact; }

/// For each pair of candidates, the most important scoring component with
/// non-equal values determines the result of the comparison (higher is
/// better).
bool operator<(const ScoredRemat &O) const {
if (hasNullScore())
return true;
if (O.hasNullScore())
return false;
if (MaxFreq != O.MaxFreq)
return MaxFreq < O.MaxFreq;
if (FreqDiff != O.FreqDiff)
return FreqDiff < O.FreqDiff;
if (RegionImpact != O.RegionImpact)
return RegionImpact < O.RegionImpact;
// Break ties using pointer to rematerializable register.
return Remat > O.Remat;
Review comment (Contributor): Should it be Remat < O.Remat?

Reply (Contributor, author): I made the comparison in this direction because, all other things being equal, it favors rematerializing registers with slightly longer live ranges in their defining regions.

Registers are collected in instruction order within each region, so registers defined "early" in a region have lower addresses than those defined "late" in the same region. The former also have longer live ranges in the region, since they start earlier and extend to the live-outs. If the defining region has excess RP, this can lead to more optimal rematerializations overall in very specific cases (our remat unit tests actually hit that very specific case quite often due to their regular/artificial nature).
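To make the tie-break concrete, here is a small self-contained sketch (editor's illustration, not code from this patch; RematRegStub, ScoredRematStub, and the sort-then-take-the-back selection are invented stand-ins). With every other scoring component equal, the Remat > O.Remat comparison ranks the candidate whose register was collected first, and therefore defined earlier in its region, as the better one:

#include <algorithm>
#include <cstdio>
#include <vector>

struct RematRegStub {}; // stand-in for RematReg; real scoring state omitted

struct ScoredRematStub {
  const RematRegStub *Remat;
  // All other scoring components are assumed equal, so only the pointer
  // tie-break decides the ordering: a higher address compares as "less",
  // i.e. worse.
  bool operator<(const ScoredRematStub &O) const { return Remat > O.Remat; }
};

int main() {
  // Candidates are collected in instruction order, so the register defined
  // earlier in the region sits at the lower address within this array.
  RematRegStub Regs[2]; // Regs[0]: early def, Regs[1]: late def
  std::vector<ScoredRematStub> Cands = {{&Regs[1]}, {&Regs[0]}};
  std::sort(Cands.begin(), Cands.end()); // best candidate ends up last
  std::printf("best candidate: %s def\n",
              Cands.back().Remat == &Regs[0] ? "early" : "late");
  // Prints "best candidate: early def", matching the author's explanation.
}

In the patch this path is only reached when all scoring components tie, so it acts purely as a deterministic tie-break with the bias the author describes.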

}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
Printable print() const;
#endif

private:
/// Number of 32-bit registers this rematerialization covers.
const unsigned NumRegs;

// The three members below are the scoring components, top to bottom from
// most important to least important when comparing candidates.

/// Frequency of the impacted target region with the highest known frequency.
/// This only matters when the stage is trying to reduce spilling; it is
/// always 0 otherwise.
uint64_t MaxFreq;
/// Frequency difference between defining and using regions. Negative values
/// indicate we are rematerializing to higher frequency regions; positive
/// values indicate the contrary.
const int64_t FreqDiff;
/// Expected number of target regions impacted by the rematerialization,
/// scaled by the size of the register being rematerialized.
unsigned RegionImpact;

unsigned getNumRegs(const GCNScheduleDAGMILive &DAG) const;

int64_t getFreqDiff(const FreqInfo &Freq) const;
};

/// Holds enough information to roll back a rematerialization decision after
/// re-scheduling.
struct RollbackInfo {
/// The rematerializable register under consideration.
const RematReg *Remat;
/// The rematerialized MI replacing the original defining MI.
MachineInstr *RematMI;
/// Set of regions in which the rematerializable instruction's defined
/// register is a live-in.
SmallDenseSet<unsigned, 4> LiveInRegions;

RematInstruction(MachineInstr *UseMI) : UseMI(UseMI) {}
RollbackInfo(const RematReg *Remat) : Remat(Remat) {}
};

/// Maps all MIs to their parent region. MI terminators are considered to be
/// outside the region they delimit, and as such are not stored in the map.
DenseMap<MachineInstr *, unsigned> MIRegion;
/// Parent MBB to each region, in region order.
SmallVector<MachineBasicBlock *> RegionBB;
/// Collects instructions to rematerialize.
MapVector<MachineInstr *, RematInstruction> Rematerializations;
/// Collects regions whose live-ins or register pressure will change due to
/// rematerializations.
DenseMap<unsigned, GCNRegPressure> ImpactedRegions;
/// In case we need to rollback rematerializations, save lane masks for all
/// rematerialized registers in all regions in which they are live-ins.
DenseMap<std::pair<unsigned, Register>, LaneBitmask> RegMasks;
/// After successful stage initialization, indicates which regions should be
/// rescheduled.
BitVector RescheduleRegions;
/// The target occupancy the stage is trying to achieve. Empty when the

/// Register pressure targets for all regions.
SmallVector<GCNRPTarget> RPTargets;
/// Regions which are above the stage's RP target.
BitVector TargetRegions;
/// The target occupancy the stage is trying to achieve. Empty when the
/// objective is spilling reduction.
std::optional<unsigned> TargetOcc;
/// Achieved occupancy *only* through rematerializations (pre-rescheduling).
/// Smaller than or equal to the target occupancy.
/// Smaller than or equal to the target occupancy, when it is defined.
unsigned AchievedOcc;

/// Returns whether remat can reduce spilling or increase function occupancy
/// by 1 through rematerialization. If it can do one, collects instructions in
/// PreRARematStage::Rematerializations and sets the target occupancy in
/// PreRARematStage::TargetOccupancy.
bool canIncreaseOccupancyOrReduceSpill();
/// List of rematerializable registers.
SmallVector<RematReg, 16> RematRegs;
/// List of rematerializations to roll back if rematerialization does not end
/// up being beneficial.
SmallVector<RollbackInfo> Rollbacks;
/// After successful stage initialization, indicates which regions should be
/// rescheduled.
BitVector RescheduleRegions;

/// Determines the stage's objective (increasing occupancy or reducing
/// spilling, set in \ref TargetOcc). Defines \ref RPTargets in all regions to
/// achieve that objective and marks those that don't achieve it in \ref
/// TargetRegions. Returns whether there is any target region.
bool setObjective();

/// Unsets target regions in \p Regions whose RP target has been reached.
void unsetSatisifedRPTargets(const BitVector &Regions);

/// Fully recomputes RP from the DAG in \p Regions. Among those regions, sets
/// again all \ref TargetRegions that were optimistically marked as satisfied
/// but are actually not, and returns whether there were any such regions.
bool updateAndVerifyRPTargets(const BitVector &Regions);

/// Collects all rematerializable registers and appends them to \ref
/// RematRegs. \p MIRegion maps MIs to their region. Returns whether any
/// rematerializable register was found.
bool collectRematRegs(const DenseMap<MachineInstr *, unsigned> &MIRegion);

/// Rematerializes \p Remat. This removes the rematerialized register from
/// live-in/out lists in the DAG and updates RP targets in all affected
/// regions, which are also marked in \ref RescheduleRegions. Regions in which
/// RP savings are not guaranteed are set in \p RecomputeRP. When \p Rollback
/// is non-null, fills it with the information required to roll back the
/// rematerialization post-rescheduling.
void rematerialize(const RematReg &Remat, BitVector &RecomputeRP,
RollbackInfo *Rollback);

/// Rolls back the rematerialization decision represented by \p Rollback. This
/// updates live-in/out lists in the DAG but does not update cached register
/// pressures. Regions in which RP may be impacted are marked in \ref
/// RecomputeRP.
void rollback(const RollbackInfo &Rollback, BitVector &RecomputeRP) const;

/// Whether the MI is rematerializable
bool isReMaterializable(const MachineInstr &MI);

/// Rematerializes all instructions in PreRARematStage::Rematerializations
/// and stores the achieved occupancy after remat in
/// PreRARematStage::AchievedOcc.
void rematerialize();

/// If remat alone did not increase occupancy to the target one, rolls back all
/// rematerializations and resets live-ins/RP in all regions impacted by the
/// stage to their pre-stage values.
@@ -519,7 +665,12 @@ class PreRARematStage : public GCNSchedStage {
bool shouldRevertScheduling(unsigned WavesAfter) override;

PreRARematStage(GCNSchedStageID StageID, GCNScheduleDAGMILive &DAG)
: GCNSchedStage(StageID, DAG), RescheduleRegions(DAG.Regions.size()) {}
: GCNSchedStage(StageID, DAG), TargetRegions(DAG.Regions.size()),
RescheduleRegions(DAG.Regions.size()) {
const unsigned NumRegions = DAG.Regions.size();
RPTargets.reserve(NumRegions);
RegionBB.reserve(NumRegions);
}
};

class ILPInitialScheduleStage : public GCNSchedStage {
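Before the test-file changes below, a short numeric illustration of the frequency bookkeeping that the FreqInfo and FreqDiff comments in the header above describe (editor's sketch; the exact normalization and the def-minus-use formula for FreqDiff are assumptions inferred from those comments, not code taken from the patch):

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // Raw per-region block frequencies, as a frequency analysis might report
  // them (values are made up for the example).
  std::vector<uint64_t> Raw = {600, 150, 300};

  // FreqInfo normalizes every frequency to the minimum observed one and keeps
  // the maximum around (assumed reading of the doc comments above).
  uint64_t Min = *std::min_element(Raw.begin(), Raw.end());
  std::vector<uint64_t> Regions;
  for (uint64_t F : Raw)
    Regions.push_back(F / Min);                                        // {4, 1, 2}
  uint64_t MaxFreq = *std::max_element(Regions.begin(), Regions.end()); // 4

  // FreqDiff compares defining and using regions; a negative value means the
  // remat would move the computation into a more frequently executed region.
  unsigned DefRegion = 1, UseRegion = 0;
  int64_t FreqDiff = static_cast<int64_t>(Regions[DefRegion]) -
                     static_cast<int64_t>(Regions[UseRegion]); // 1 - 4 = -3
  std::printf("MaxFreq=%llu FreqDiff=%lld\n",
              (unsigned long long)MaxFreq, (long long)FreqDiff);
}

Under this reading, MaxFreq only matters for the spill-reduction objective, and a negative FreqDiff flags rematerializations that would move work into a more frequently executed region.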
24 changes: 12 additions & 12 deletions llvm/test/CodeGen/AMDGPU/buffer-fat-pointers-memcpy.ll
@@ -419,18 +419,18 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
; GISEL-GFX942-NEXT: s_load_dword s11, s[4:5], 0x34
; GISEL-GFX942-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x44
; GISEL-GFX942-NEXT: s_mov_b32 s16, 0
; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, 0x2000
; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s16
; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-GFX942-NEXT: s_mov_b32 s8, s1
; GISEL-GFX942-NEXT: s_mov_b32 s9, s2
; GISEL-GFX942-NEXT: s_mov_b32 s10, s3
; GISEL-GFX942-NEXT: s_mov_b32 s4, s13
; GISEL-GFX942-NEXT: s_mov_b32 s5, s14
; GISEL-GFX942-NEXT: s_mov_b32 s6, s15
; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16
; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, 0x2000
; GISEL-GFX942-NEXT: .LBB0_1: ; %load-store-loop
; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s0, v1
; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s0, v0
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v62, s[8:11], 0 offen offset:16
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:32
@@ -447,9 +447,9 @@ define amdgpu_kernel void @memcpy_known(ptr addrspace(7) %src, ptr addrspace(7)
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v62, s[8:11], 0 offen offset:208
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v62, s[8:11], 0 offen offset:224
; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v62, s[8:11], 0 offen offset:240
; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1
; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1
; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v0
; GISEL-GFX942-NEXT: v_add_u32_e32 v0, 0x100, v0
; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen
@@ -945,18 +945,18 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; GISEL-GFX942-NEXT: s_load_dword s11, s[4:5], 0x34
; GISEL-GFX942-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x44
; GISEL-GFX942-NEXT: s_mov_b32 s16, 0
; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, 0x100
; GISEL-GFX942-NEXT: v_mov_b32_e32 v0, s16
; GISEL-GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GISEL-GFX942-NEXT: s_mov_b32 s8, s1
; GISEL-GFX942-NEXT: s_mov_b32 s9, s2
; GISEL-GFX942-NEXT: s_mov_b32 s10, s3
; GISEL-GFX942-NEXT: s_mov_b32 s4, s13
; GISEL-GFX942-NEXT: s_mov_b32 s5, s14
; GISEL-GFX942-NEXT: s_mov_b32 s6, s15
; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, s16
; GISEL-GFX942-NEXT: v_mov_b32_e32 v1, 0x100
; GISEL-GFX942-NEXT: .LBB1_1: ; %load-store-loop
; GISEL-GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s0, v1
; GISEL-GFX942-NEXT: v_add_u32_e32 v62, s0, v0
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[2:5], v62, s[8:11], 0 offen
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[6:9], v62, s[8:11], 0 offen offset:16
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[10:13], v62, s[8:11], 0 offen offset:32
@@ -973,9 +973,9 @@ define amdgpu_kernel void @memcpy_known_medium(ptr addrspace(7) %src, ptr addrsp
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[54:57], v62, s[8:11], 0 offen offset:208
; GISEL-GFX942-NEXT: buffer_load_dwordx4 v[58:61], v62, s[8:11], 0 offen offset:224
; GISEL-GFX942-NEXT: buffer_load_dwordx4 a[0:3], v62, s[8:11], 0 offen offset:240
; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v1
; GISEL-GFX942-NEXT: v_add_u32_e32 v1, 0x100, v1
; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v1, v0
; GISEL-GFX942-NEXT: v_add_u32_e32 v63, s12, v0
; GISEL-GFX942-NEXT: v_add_u32_e32 v0, 0x100, v0
; GISEL-GFX942-NEXT: v_cmp_lt_u32_e32 vcc, v0, v1
; GISEL-GFX942-NEXT: s_waitcnt vmcnt(0)
; GISEL-GFX942-NEXT: scratch_store_dwordx4 off, a[0:3], off ; 16-byte Folded Spill
; GISEL-GFX942-NEXT: buffer_store_dwordx4 v[2:5], v63, s[4:7], 0 offen