@@ -57,6 +57,226 @@ void RegBankLegalizeHelper::findRuleAndApplyMapping(MachineInstr &MI) {
5757 lower (MI, Mapping, WaterfallSgprs);
5858}
5959
60+ bool RegBankLegalizeHelper::executeInWaterfallLoop (
61+ MachineIRBuilder &B, iterator_range<MachineBasicBlock::iterator> Range,
62+ SmallSet<Register, 4 > &SGPROperandRegs) {
63+ // Track use registers which have already been expanded with a readfirstlane
64+ // sequence. This may have multiple uses if moving a sequence.
65+ DenseMap<Register, Register> WaterfalledRegMap;
66+
67+ MachineBasicBlock &MBB = B.getMBB ();
68+ MachineFunction &MF = B.getMF ();
69+
70+ const SIRegisterInfo *TRI = ST.getRegisterInfo ();
71+ const TargetRegisterClass *WaveRC = TRI->getWaveMaskRegClass ();
72+ unsigned MovExecOpc, MovExecTermOpc, XorTermOpc, AndSaveExecOpc, ExecReg;
73+ if (ST.isWave32 ()) {
74+ MovExecOpc = AMDGPU::S_MOV_B32;
75+ MovExecTermOpc = AMDGPU::S_MOV_B32_term;
76+ XorTermOpc = AMDGPU::S_XOR_B32_term;
77+ AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B32;
78+ ExecReg = AMDGPU::EXEC_LO;
79+ } else {
80+ MovExecOpc = AMDGPU::S_MOV_B64;
81+ MovExecTermOpc = AMDGPU::S_MOV_B64_term;
82+ XorTermOpc = AMDGPU::S_XOR_B64_term;
83+ AndSaveExecOpc = AMDGPU::S_AND_SAVEEXEC_B64;
84+ ExecReg = AMDGPU::EXEC;
85+ }
86+
87+ #ifndef NDEBUG
88+ const int OrigRangeSize = std::distance (Range.begin (), Range.end ());
89+ #endif
90+
91+ MachineRegisterInfo &MRI = *B.getMRI ();
92+ Register SaveExecReg = MRI.createVirtualRegister (WaveRC);
93+ Register InitSaveExecReg = MRI.createVirtualRegister (WaveRC);
94+
95+ // Don't bother using generic instructions/registers for the exec mask.
96+ B.buildInstr (TargetOpcode::IMPLICIT_DEF).addDef (InitSaveExecReg);
97+
98+ Register SavedExec = MRI.createVirtualRegister (WaveRC);
99+
100+ // To insert the loop we need to split the block. Move everything before
101+ // this point to a new block, and insert a new empty block before this
102+ // instruction.
103+ MachineBasicBlock *LoopBB = MF.CreateMachineBasicBlock ();
104+ MachineBasicBlock *BodyBB = MF.CreateMachineBasicBlock ();
105+ MachineBasicBlock *RestoreExecBB = MF.CreateMachineBasicBlock ();
106+ MachineBasicBlock *RemainderBB = MF.CreateMachineBasicBlock ();
107+ MachineFunction::iterator MBBI (MBB);
108+ ++MBBI;
109+ MF.insert (MBBI, LoopBB);
110+ MF.insert (MBBI, BodyBB);
111+ MF.insert (MBBI, RestoreExecBB);
112+ MF.insert (MBBI, RemainderBB);
113+
114+ LoopBB->addSuccessor (BodyBB);
115+ BodyBB->addSuccessor (RestoreExecBB);
116+ BodyBB->addSuccessor (LoopBB);
117+
118+ // Move the rest of the block into a new block.
119+ RemainderBB->transferSuccessorsAndUpdatePHIs (&MBB);
120+ RemainderBB->splice (RemainderBB->begin (), &MBB, Range.end (), MBB.end ());
121+
122+ MBB.addSuccessor (LoopBB);
123+ RestoreExecBB->addSuccessor (RemainderBB);
124+
125+ B.setInsertPt (*LoopBB, LoopBB->end ());
126+
127+ // +-MBB:------------+
128+ // | ... |
129+ // | %0 = G_INST_1 |
130+ // | %Dst = MI %Vgpr |
131+ // | %1 = G_INST_2 |
132+ // | ... |
133+ // +-----------------+
134+ // ->
135+ // +-MBB-------------------------------+
136+ // | ... |
137+ // | %0 = G_INST_1 |
138+ // | %SaveExecReg = S_MOV_B32 $exec_lo |
139+ // +----------------|------------------+
140+ // | /------------------------------|
141+ // V V |
142+ // +-LoopBB---------------------------------------------------------------+ |
143+ // | %CurrentLaneReg:sgpr(s32) = READFIRSTLANE %Vgpr | |
144+ // | instead of executing for each lane, see if other lanes had | |
145+ // | same value for %Vgpr and execute for them also. | |
146+ // | %CondReg:vcc(s1) = G_ICMP eq %CurrentLaneReg, %Vgpr | |
147+ // | %CondRegLM:sreg_32 = ballot %CondReg // copy vcc to sreg32 lane mask | |
148+ // | %SavedExec = S_AND_SAVEEXEC_B32 %CondRegLM | |
149+ // | exec is active for lanes with the same "CurrentLane value" in Vgpr | |
150+ // +----------------|-----------------------------------------------------+ |
151+ // V |
152+ // +-BodyBB------------------------------------------------------------+ |
153+ // | %Dst = MI %CurrentLaneReg:sgpr(s32) | |
154+ // | executed only for active lanes and written to Dst | |
155+ // | $exec = S_XOR_B32 $exec, %SavedExec | |
156+ // | set active lanes to 0 in SavedExec, lanes that did not write to | |
157+ // | Dst yet, and set this as new exec (for READFIRSTLANE and ICMP) | |
158+ // | SI_WATERFALL_LOOP LoopBB |-----|
159+ // +----------------|--------------------------------------------------+
160+ // V
161+ // +-RestoreExecBB--------------------------+
162+ // | $exec_lo = S_MOV_B32_term %SaveExecReg |
163+ // +----------------|-----------------------+
164+ // V
165+ // +-RemainderBB:----------------------+
166+ // | %1 = G_INST_2 |
167+ // | ... |
168+ // +---------------------------------- +
169+
170+ // Move the instruction into the loop body. Note we moved everything after
171+ // Range.end() already into a new block, so Range.end() is no longer valid.
172+ BodyBB->splice (BodyBB->end (), &MBB, Range.begin (), MBB.end ());
173+
174+ // Figure out the iterator range after splicing the instructions.
175+ MachineBasicBlock::iterator NewBegin = Range.begin ()->getIterator ();
176+ auto NewEnd = BodyBB->end ();
177+ assert (std::distance (NewBegin, NewEnd) == OrigRangeSize);
178+
179+ B.setMBB (*LoopBB);
180+ Register CondReg;
181+
182+ for (MachineInstr &MI : make_range (NewBegin, NewEnd)) {
183+ for (MachineOperand &Op : MI.all_uses ()) {
184+ Register OldReg = Op.getReg ();
185+ if (!SGPROperandRegs.count (OldReg))
186+ continue ;
187+
188+ // See if we already processed this register in another instruction in
189+ // the sequence.
190+ auto OldVal = WaterfalledRegMap.find (OldReg);
191+ if (OldVal != WaterfalledRegMap.end ()) {
192+ Op.setReg (OldVal->second );
193+ continue ;
194+ }
195+
196+ Register OpReg = Op.getReg ();
197+ LLT OpTy = MRI.getType (OpReg);
198+
199+ // TODO: support for agpr
200+ assert (MRI.getRegBank (OpReg) == VgprRB);
201+ Register CurrentLaneReg = MRI.createVirtualRegister ({SgprRB, OpTy});
202+ buildReadFirstLane (B, CurrentLaneReg, OpReg, RBI);
203+
204+ // Build the comparison(s), CurrentLaneReg == OpReg.
205+ unsigned OpSize = OpTy.getSizeInBits ();
206+ bool Is64 = OpSize % 64 == 0 ;
207+ unsigned PartSize = Is64 ? 64 : 32 ;
208+ LLT PartTy = LLT::scalar (PartSize);
209+ unsigned NumParts = OpSize / PartSize;
210+ SmallVector<Register, 8 > OpParts;
211+ SmallVector<Register, 8 > CurrentLaneParts;
212+
213+ if (NumParts == 1 ) {
214+ OpParts.push_back (OpReg);
215+ CurrentLaneParts.push_back (CurrentLaneReg);
216+ } else {
217+ auto UnmergeOp = B.buildUnmerge ({VgprRB, PartTy}, OpReg);
218+ auto UnmergeCurrLane = B.buildUnmerge ({SgprRB, PartTy}, CurrentLaneReg);
219+ for (unsigned i = 0 ; i < NumParts; ++i) {
220+ OpParts.push_back (UnmergeOp.getReg (i));
221+ CurrentLaneParts.push_back (UnmergeCurrLane.getReg (i));
222+ }
223+ }
224+
225+ for (unsigned i = 0 ; i < NumParts; ++i) {
226+ Register CmpReg = MRI.createVirtualRegister (VccRB_S1);
227+ B.buildICmp (CmpInst::ICMP_EQ, CmpReg, CurrentLaneParts[i], OpParts[i]);
228+
229+ if (!CondReg) {
230+ CondReg = CmpReg;
231+ } else {
232+ CondReg = B.buildAnd (VccRB_S1, CondReg, CmpReg).getReg (0 );
233+ }
234+ }
235+
236+ Op.setReg (CurrentLaneReg);
237+
238+ // Make sure we don't re-process this register again.
239+ WaterfalledRegMap.insert (std::pair (OldReg, Op.getReg ()));
240+ }
241+ }
242+
243+ // Copy vcc to sgpr32/64, ballot becomes a no-op during instruction selection.
244+ Register CondRegLM =
245+ MRI.createVirtualRegister ({WaveRC, LLT::scalar (ST.isWave32 () ? 32 : 64 )});
246+ B.buildIntrinsic (Intrinsic::amdgcn_ballot, CondRegLM).addReg (CondReg);
247+
248+ // Update EXEC, save the original EXEC value to SavedExec.
249+ B.buildInstr (AndSaveExecOpc)
250+ .addDef (SavedExec)
251+ .addReg (CondRegLM, RegState::Kill);
252+ MRI.setSimpleHint (SavedExec, CondRegLM);
253+
254+ B.setInsertPt (*BodyBB, BodyBB->end ());
255+
256+ // Update EXEC, switch all done bits to 0 and all todo bits to 1.
257+ B.buildInstr (XorTermOpc).addDef (ExecReg).addReg (ExecReg).addReg (SavedExec);
258+
259+ // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
260+ // s_cbranch_scc0?
261+
262+ // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
263+ B.buildInstr (AMDGPU::SI_WATERFALL_LOOP).addMBB (LoopBB);
264+
265+ // Save the EXEC mask before the loop.
266+ B.setInsertPt (MBB, MBB.end ());
267+ B.buildInstr (MovExecOpc).addDef (SaveExecReg).addReg (ExecReg);
268+
269+ // Restore the EXEC mask after the loop.
270+ B.setInsertPt (*RestoreExecBB, RestoreExecBB->begin ());
271+ B.buildInstr (MovExecTermOpc).addDef (ExecReg).addReg (SaveExecReg);
272+
273+ // Set the insert point after the original instruction, so any new
274+ // instructions will be in the remainder.
275+ B.setInsertPt (*RemainderBB, RemainderBB->begin ());
276+
277+ return true ;
278+ }
279+
60280void RegBankLegalizeHelper::splitLoad (MachineInstr &MI,
61281 ArrayRef<LLT> LLTBreakdown, LLT MergeTy) {
62282 MachineFunction &MF = B.getMF ();
@@ -395,7 +615,7 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
395615
396616 switch (Mapping.LoweringMethod ) {
397617 case DoNotLower:
398- return ;
618+ break ;
399619 case VccExtToSel:
400620 return lowerVccExtToSel (MI);
401621 case UniExtToSel: {
@@ -531,7 +751,10 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI,
531751 }
532752 }
533753
534- // TODO: executeInWaterfallLoop(... WaterfallSgprs)
754+ if (!WaterfallSgprs.empty ()) {
755+ MachineBasicBlock::iterator I = MI.getIterator ();
756+ executeInWaterfallLoop (B, make_range (I, std::next (I)), WaterfallSgprs);
757+ }
535758}
536759
537760LLT RegBankLegalizeHelper::getTyFromID (RegBankLLTMappingApplyID ID) {
@@ -543,6 +766,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
543766 case Vgpr16:
544767 return LLT::scalar (16 );
545768 case Sgpr32:
769+ case Sgpr32_W:
546770 case Sgpr32Trunc:
547771 case Sgpr32AExt:
548772 case Sgpr32AExtBoolInReg:
@@ -578,6 +802,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) {
578802 case VgprV2S32:
579803 return LLT::fixed_vector (2 , 32 );
580804 case SgprV4S32:
805+ case SgprV4S32_W:
581806 case VgprV4S32:
582807 case UniInVgprV4S32:
583808 return LLT::fixed_vector (4 , 32 );
@@ -645,6 +870,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
645870 return VccRB;
646871 case Sgpr16:
647872 case Sgpr32:
873+ case Sgpr32_W:
648874 case Sgpr64:
649875 case SgprP1:
650876 case SgprP3:
@@ -653,6 +879,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) {
653879 case SgprV2S16:
654880 case SgprV2S32:
655881 case SgprV4S32:
882+ case SgprV4S32_W:
656883 case SgprB32:
657884 case SgprB64:
658885 case SgprB96:
@@ -894,6 +1121,15 @@ void RegBankLegalizeHelper::applyMappingSrc(
8941121 }
8951122 break ;
8961123 }
1124+ // sgpr waterfall, scalars and vectors
1125+ case Sgpr32_W:
1126+ case SgprV4S32_W: {
1127+ assert (Ty == getTyFromID (MethodIDs[i]));
1128+ if (RB != SgprRB) {
1129+ SgprWaterfallOperandRegs.insert (Reg);
1130+ }
1131+ break ;
1132+ }
8971133 // sgpr and vgpr scalars with extend
8981134 case Sgpr32AExt: {
8991135 // Note: this ext allows S1, and it is meant to be combined away.
0 commit comments