1616// ===----------------------------------------------------------------------===//
1717
1818#include " AMDGPU.h"
19+ #include " SILowerI1Copies.h"
20+ #include " llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
1921#include " llvm/CodeGen/MachineFunctionPass.h"
22+ #include " llvm/CodeGen/MachineUniformityAnalysis.h"
23+ #include " llvm/InitializePasses.h"
2024
2125#define DEBUG_TYPE " amdgpu-global-isel-divergence-lowering"
2226
@@ -42,14 +46,146 @@ class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass {
4246
4347 void getAnalysisUsage (AnalysisUsage &AU) const override {
4448 AU.setPreservesCFG ();
49+ AU.addRequired <MachineDominatorTree>();
50+ AU.addRequired <MachinePostDominatorTree>();
51+ AU.addRequired <MachineUniformityAnalysisPass>();
4552 MachineFunctionPass::getAnalysisUsage (AU);
4653 }
4754};
4855
56+ class DivergenceLoweringHelper : public PhiLoweringHelper {
57+ public:
58+ DivergenceLoweringHelper (MachineFunction *MF, MachineDominatorTree *DT,
59+ MachinePostDominatorTree *PDT,
60+ MachineUniformityInfo *MUI);
61+
62+ private:
63+ MachineUniformityInfo *MUI = nullptr ;
64+ MachineIRBuilder B;
65+ Register buildRegCopyToLaneMask (Register Reg);
66+
67+ public:
68+ void markAsLaneMask (Register DstReg) const override ;
69+ void getCandidatesForLowering (
70+ SmallVectorImpl<MachineInstr *> &Vreg1Phis) const override ;
71+ void collectIncomingValuesFromPhi (
72+ const MachineInstr *MI,
73+ SmallVectorImpl<Incoming> &Incomings) const override ;
74+ void replaceDstReg (Register NewReg, Register OldReg,
75+ MachineBasicBlock *MBB) override ;
76+ void buildMergeLaneMasks (MachineBasicBlock &MBB,
77+ MachineBasicBlock::iterator I, const DebugLoc &DL,
78+ Register DstReg, Register PrevReg,
79+ Register CurReg) override ;
80+ void constrainAsLaneMask (Incoming &In) override ;
81+ };
82+
83+ DivergenceLoweringHelper::DivergenceLoweringHelper (
84+ MachineFunction *MF, MachineDominatorTree *DT,
85+ MachinePostDominatorTree *PDT, MachineUniformityInfo *MUI)
86+ : PhiLoweringHelper(MF, DT, PDT), MUI(MUI), B(*MF) {}
87+
88+ // _(s1) -> SReg_32/64(s1)
89+ void DivergenceLoweringHelper::markAsLaneMask (Register DstReg) const {
90+ assert (MRI->getType (DstReg) == LLT::scalar (1 ));
91+
92+ if (MRI->getRegClassOrNull (DstReg)) {
93+ if (MRI->constrainRegClass (DstReg, ST->getBoolRC ()))
94+ return ;
95+ llvm_unreachable (" Failed to constrain register class" );
96+ }
97+
98+ MRI->setRegClass (DstReg, ST->getBoolRC ());
99+ }
100+
101+ void DivergenceLoweringHelper::getCandidatesForLowering (
102+ SmallVectorImpl<MachineInstr *> &Vreg1Phis) const {
103+ LLT S1 = LLT::scalar (1 );
104+
105+ // Add divergent i1 phis to the list
106+ for (MachineBasicBlock &MBB : *MF) {
107+ for (MachineInstr &MI : MBB.phis ()) {
108+ Register Dst = MI.getOperand (0 ).getReg ();
109+ if (MRI->getType (Dst) == S1 && MUI->isDivergent (Dst))
110+ Vreg1Phis.push_back (&MI);
111+ }
112+ }
113+ }
114+
115+ void DivergenceLoweringHelper::collectIncomingValuesFromPhi (
116+ const MachineInstr *MI, SmallVectorImpl<Incoming> &Incomings) const {
117+ for (unsigned i = 1 ; i < MI->getNumOperands (); i += 2 ) {
118+ Incomings.emplace_back (MI->getOperand (i).getReg (),
119+ MI->getOperand (i + 1 ).getMBB (), Register ());
120+ }
121+ }
122+
123+ void DivergenceLoweringHelper::replaceDstReg (Register NewReg, Register OldReg,
124+ MachineBasicBlock *MBB) {
125+ BuildMI (*MBB, MBB->getFirstNonPHI (), {}, TII->get (AMDGPU::COPY), OldReg)
126+ .addReg (NewReg);
127+ }
128+
129+ // Copy Reg to new lane mask register, insert a copy after instruction that
130+ // defines Reg while skipping phis if needed.
131+ Register DivergenceLoweringHelper::buildRegCopyToLaneMask (Register Reg) {
132+ Register LaneMask = createLaneMaskReg (MRI, LaneMaskRegAttrs);
133+ MachineInstr *Instr = MRI->getVRegDef (Reg);
134+ MachineBasicBlock *MBB = Instr->getParent ();
135+ B.setInsertPt (*MBB, MBB->SkipPHIsAndLabels (std::next (Instr->getIterator ())));
136+ B.buildCopy (LaneMask, Reg);
137+ return LaneMask;
138+ }
139+
140+ // bb.previous
141+ // %PrevReg = ...
142+ //
143+ // bb.current
144+ // %CurReg = ...
145+ //
146+ // %DstReg - not defined
147+ //
148+ // -> (wave32 example, new registers have sreg_32 reg class and S1 LLT)
149+ //
150+ // bb.previous
151+ // %PrevReg = ...
152+ // %PrevRegCopy:sreg_32(s1) = COPY %PrevReg
153+ //
154+ // bb.current
155+ // %CurReg = ...
156+ // %CurRegCopy:sreg_32(s1) = COPY %CurReg
157+ // ...
158+ // %PrevMaskedReg:sreg_32(s1) = ANDN2 %PrevRegCopy, ExecReg - active lanes 0
159+ // %CurMaskedReg:sreg_32(s1) = AND %ExecReg, CurRegCopy - inactive lanes to 0
160+ // %DstReg:sreg_32(s1) = OR %PrevMaskedReg, CurMaskedReg
161+ //
162+ // DstReg = for active lanes rewrite bit in PrevReg with bit from CurReg
163+ void DivergenceLoweringHelper::buildMergeLaneMasks (
164+ MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL,
165+ Register DstReg, Register PrevReg, Register CurReg) {
166+ // DstReg = (PrevReg & !EXEC) | (CurReg & EXEC)
167+ // TODO: check if inputs are constants or results of a compare.
168+
169+ Register PrevRegCopy = buildRegCopyToLaneMask (PrevReg);
170+ Register CurRegCopy = buildRegCopyToLaneMask (CurReg);
171+ Register PrevMaskedReg = createLaneMaskReg (MRI, LaneMaskRegAttrs);
172+ Register CurMaskedReg = createLaneMaskReg (MRI, LaneMaskRegAttrs);
173+
174+ B.setInsertPt (MBB, I);
175+ B.buildInstr (AndN2Op, {PrevMaskedReg}, {PrevRegCopy, ExecReg});
176+ B.buildInstr (AndOp, {CurMaskedReg}, {ExecReg, CurRegCopy});
177+ B.buildInstr (OrOp, {DstReg}, {PrevMaskedReg, CurMaskedReg});
178+ }
179+
180+ void DivergenceLoweringHelper::constrainAsLaneMask (Incoming &In) { return ; }
181+
49182} // End anonymous namespace.
50183
51184INITIALIZE_PASS_BEGIN (AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
52185 " AMDGPU GlobalISel divergence lowering" , false , false )
186+ INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
187+ INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
188+ INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass)
53189INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE,
54190 " AMDGPU GlobalISel divergence lowering" , false , false )
55191
@@ -64,5 +200,12 @@ FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() {
64200
65201bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction (
66202 MachineFunction &MF) {
67- return false ;
203+ MachineDominatorTree &DT = getAnalysis<MachineDominatorTree>();
204+ MachinePostDominatorTree &PDT = getAnalysis<MachinePostDominatorTree>();
205+ MachineUniformityInfo &MUI =
206+ getAnalysis<MachineUniformityAnalysisPass>().getUniformityInfo ();
207+
208+ DivergenceLoweringHelper Helper (&MF, &DT, &PDT, &MUI);
209+
210+ return Helper.lowerPhis ();
68211}
0 commit comments