@@ -37,6 +37,8 @@ class LoongArchMergeBaseOffsetOpt : public MachineFunctionPass {
3737 bool detectFoldable (MachineInstr &Hi20, MachineInstr *&Lo12,
3838 MachineInstr *&Lo20, MachineInstr *&Hi12,
3939 MachineInstr *&Last);
40+ bool detectFoldable (MachineInstr &Hi20, MachineInstr *&Add,
41+ MachineInstr *&Lo12);
4042
4143 bool detectAndFoldOffset (MachineInstr &Hi20, MachineInstr &Lo12,
4244 MachineInstr *&Lo20, MachineInstr *&Hi12,
@@ -176,7 +178,80 @@ bool LoongArchMergeBaseOffsetOpt::detectFoldable(MachineInstr &Hi20,
176178 return true ;
177179}
178180
179- // Update the offset in Hi20, Lo12, Lo20 and Hi12 instructions.
181+ // Detect the pattern:
182+ //
183+ // (small/medium):
184+ // lu12i.w vreg1, %le_hi20_r(s)
185+ // add.w/d vreg2, vreg1, r2, %le_add_r(s)
186+ // addi.w/d vreg3, vreg2, %le_lo12_r(s)
187+
188+ // The pattern is only accepted if:
189+ // 1) The first instruction has only one use, which is the PseudoAddTPRel.
190+ // The second instruction has only one use, which is the ADDI. The
191+ // second instruction's last operand is the tp register.
192+ // 2) The address operands have the appropriate type, reflecting the
193+ // lowering of a thread_local global address using the pattern.
194+ // 3) The offset value in the ThreadLocal Global Address is 0.
195+ bool LoongArchMergeBaseOffsetOpt::detectFoldable (MachineInstr &Hi20,
196+ MachineInstr *&Add,
197+ MachineInstr *&Lo12) {
198+ if (Hi20.getOpcode () != LoongArch::LU12I_W)
199+ return false ;
200+
201+ auto isGlobalOrCPI = [](const MachineOperand &Op) {
202+ return Op.isGlobal () || Op.isCPI ();
203+ };
204+
205+ const MachineOperand &Hi20Op1 = Hi20.getOperand (1 );
206+ if (LoongArchII::getDirectFlags (Hi20Op1) != LoongArchII::MO_LE_HI_R ||
207+ !isGlobalOrCPI (Hi20Op1) || Hi20Op1.getOffset () != 0 )
208+ return false ;
209+
210+ Register HiDestReg = Hi20.getOperand (0 ).getReg ();
211+ if (!MRI->hasOneUse (HiDestReg))
212+ return false ;
213+
214+ Add = &*MRI->use_instr_begin (HiDestReg);
215+ if ((ST->is64Bit () && Add->getOpcode () != LoongArch::PseudoAddTPRel_D) ||
216+ (!ST->is64Bit () && Add->getOpcode () != LoongArch::PseudoAddTPRel_W))
217+ return false ;
218+
219+ if (Add->getOperand (2 ).getReg () != LoongArch::R2)
220+ return false ;
221+
222+ const MachineOperand &AddOp3 = Add->getOperand (3 );
223+ if (LoongArchII::getDirectFlags (AddOp3) != LoongArchII::MO_LE_ADD_R ||
224+ !(isGlobalOrCPI (AddOp3) || AddOp3.isMCSymbol ()) ||
225+ AddOp3.getOffset () != 0 )
226+ return false ;
227+
228+ Register AddDestReg = Add->getOperand (0 ).getReg ();
229+ if (!MRI->hasOneUse (AddDestReg))
230+ return false ;
231+
232+ Lo12 = &*MRI->use_instr_begin (AddDestReg);
233+ if ((ST->is64Bit () && Lo12->getOpcode () != LoongArch::ADDI_D) ||
234+ (!ST->is64Bit () && Lo12->getOpcode () != LoongArch::ADDI_W))
235+ return false ;
236+
237+ const MachineOperand &Lo12Op2 = Lo12->getOperand (2 );
238+ if (LoongArchII::getDirectFlags (Lo12Op2) != LoongArchII::MO_LE_LO_R ||
239+ !(isGlobalOrCPI (Lo12Op2) || Lo12Op2.isMCSymbol ()) ||
240+ Lo12Op2.getOffset () != 0 )
241+ return false ;
242+
243+ if (Hi20Op1.isGlobal ()) {
244+ LLVM_DEBUG (dbgs () << " Found lowered global address: "
245+ << *Hi20Op1.getGlobal () << " \n " );
246+ } else if (Hi20Op1.isCPI ()) {
247+ LLVM_DEBUG (dbgs () << " Found lowered constant pool: " << Hi20Op1.getIndex ()
248+ << " \n " );
249+ }
250+
251+ return true ;
252+ }
253+
254+ // Update the offset in Hi20, (Add), Lo12, (Lo20 and Hi12) instructions.
180255// Delete the tail instruction and update all the uses to use the
181256// output from Last.
182257void LoongArchMergeBaseOffsetOpt::foldOffset (
@@ -190,31 +265,49 @@ void LoongArchMergeBaseOffsetOpt::foldOffset(
190265 Lo20->getOperand (2 ).setOffset (Offset);
191266 Hi12->getOperand (2 ).setOffset (Offset);
192267 }
268+
269+ // For tls-le, the offset of the second instruction (PseudoAddTPRel)
270+ // should also be updated.
271+ MachineInstr *Add = &*MRI->use_instr_begin (Hi20.getOperand (0 ).getReg ());
272+ if (Hi20.getOpcode () == LoongArch::LU12I_W)
273+ Add->getOperand (3 ).setOffset (Offset);
274+
193275 // Delete the tail instruction.
194276 MachineInstr *Def = Last ? Last : &Lo12;
195277 MRI->constrainRegClass (Def->getOperand (0 ).getReg (),
196278 MRI->getRegClass (Tail.getOperand (0 ).getReg ()));
197279 MRI->replaceRegWith (Tail.getOperand (0 ).getReg (), Def->getOperand (0 ).getReg ());
198280 Tail.eraseFromParent ();
281+
199282 LLVM_DEBUG (dbgs () << " Merged offset " << Offset << " into base.\n "
200- << " " << Hi20 << " " << Lo12;);
283+ << " " << Hi20;);
284+ if (Hi20.getOpcode () == LoongArch::LU12I_W) {
285+ LLVM_DEBUG (dbgs () << " " << *Add;);
286+ }
287+ LLVM_DEBUG (dbgs () << " " << Lo12;);
201288 if (Lo20 && Hi12) {
202289 LLVM_DEBUG (dbgs () << " " << *Lo20 << " " << *Hi12;);
203290 }
204291}
205292
206293// Detect patterns for large offsets that are passed into an ADD instruction.
207- // If the pattern is found, updates the offset in Hi20, Lo12, Lo20 and Hi12
208- // instructions and deletes TailAdd and the instructions that produced the
209- // offset.
294+ // If the pattern is found, updates the offset in Hi20, (Add), Lo12,
295+ // (Lo20 and Hi12) instructions and deletes TailAdd and the instructions that
296+ // produced the offset.
210297//
211298// (The instructions marked with "!" are not necessarily present)
212299//
213300// Base address lowering is of the form:
214- // Hi20: pcalau12i vreg1, %pc_hi20(s)
215- // +- Lo12: addi.d vreg2, vreg1, %pc_lo12(s)
216- // | Lo20: lu32i.d vreg2, %pc64_lo20(s) !
217- // +- Hi12: lu52i.d vreg2, vreg2, %pc64_hi12(s) !
301+ // 1) pcala:
302+ // Hi20: pcalau12i vreg1, %pc_hi20(s)
303+ // +--- Lo12: addi.d vreg2, vreg1, %pc_lo12(s)
304+ // | Lo20: lu32i.d vreg2, %pc64_lo20(s) !
305+ // +--- Hi12: lu52i.d vreg2, vreg2, %pc64_hi12(s) !
306+ // |
307+ // | 2) tls-le:
308+ // | Hi20: lu12i.w vreg1, %le_hi20_r(s)
309+ // | Add: add.w/d vreg1, vreg1, r2, %le_add_r(s)
310+ // +--- Lo12: addi.w/d vreg2, vreg1, %le_lo12_r(s)
218311// |
219312// | The large offset can be one of the forms:
220313// |
@@ -334,7 +427,8 @@ bool LoongArchMergeBaseOffsetOpt::detectAndFoldOffset(MachineInstr &Hi20,
334427
335428 // Look for arithmetic instructions we can get an offset from.
336429 // We might be able to remove the arithmetic instructions by folding the
337- // offset into the PCALAU12I+(ADDI/ADDI+LU32I+LU52I).
430+ // offset into the PCALAU12I+(ADDI/ADDI+LU32I+LU52I) or
431+ // LU12I_W+PseudoAddTPRel+ADDI.
338432 if (!MRI->hasOneUse (DestReg))
339433 return false ;
340434
@@ -454,6 +548,7 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
454548 // If all the uses are memory ops with the same offset, we can transform:
455549 //
456550 // 1. (small/medium):
551+ // 1.1. pcala
457552 // pcalau12i vreg1, %pc_hi20(s)
458553 // addi.d vreg2, vreg1, %pc_lo12(s)
459554 // ld.w vreg3, 8(vreg2)
@@ -463,6 +558,18 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
463558 // pcalau12i vreg1, %pc_hi20(s+8)
464559 // ld.w vreg3, vreg1, %pc_lo12(s+8)(vreg1)
465560 //
561+ // 1.2. tls-le
562+ // lu12i.w vreg1, %le_hi20_r(s)
563+ // add.w/d vreg2, vreg1, r2, %le_add_r(s)
564+ // addi.w/d vreg3, vreg2, %le_lo12_r(s)
565+ // ld.w vreg4, 8(vreg3)
566+ //
567+ // =>
568+ //
569+ // lu12i.w vreg1, %le_hi20_r(s+8)
570+ // add.w/d vreg2, vreg1, r2, %le_add_r(s+8)
571+ // ld.w vreg4, vreg2, %le_lo12_r(s+8)(vreg2)
572+ //
466573 // 2. (large):
467574 // pcalau12i vreg1, %pc_hi20(s)
468575 // addi.d vreg2, $zero, %pc_lo12(s)
@@ -598,7 +705,8 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
598705 return false ;
599706
600707 // If optimized by this pass successfully, MO_RELAX bitmask target-flag should
601- // be removed from the code sequence.
708+ // be removed from the pcala code sequence. Code sequence of tls-le can still
709+ // be relaxed after being optimized.
602710 //
603711 // For example:
604712 // pcalau12i $a0, %pc_hi20(symbol)
@@ -614,15 +722,20 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
614722 // optimized, it cannot be relaxed any more. So MO_RELAX flag should not be
615723 // carried by them.
616724 Hi20.getOperand (1 ).setOffset (NewOffset);
617- Hi20.getOperand (1 ).setTargetFlags (
618- LoongArchII::getDirectFlags (Hi20.getOperand (1 )));
619725 MachineOperand &ImmOp = Lo12.getOperand (2 );
620726 ImmOp.setOffset (NewOffset);
621- ImmOp.setTargetFlags (LoongArchII::getDirectFlags (ImmOp));
622727 if (Lo20 && Hi12) {
623728 Lo20->getOperand (2 ).setOffset (NewOffset);
624729 Hi12->getOperand (2 ).setOffset (NewOffset);
625730 }
731+ if (Hi20.getOpcode () == LoongArch::PCALAU12I) {
732+ Hi20.getOperand (1 ).setTargetFlags (
733+ LoongArchII::getDirectFlags (Hi20.getOperand (1 )));
734+ ImmOp.setTargetFlags (LoongArchII::getDirectFlags (ImmOp));
735+ } else if (Hi20.getOpcode () == LoongArch::LU12I_W) {
736+ MachineInstr *Add = &*MRI->use_instr_begin (Hi20.getOperand (0 ).getReg ());
737+ Add->getOperand (3 ).setOffset (NewOffset);
738+ }
626739
627740 // Update the immediate in the load/store instructions to add the offset.
628741 const LoongArchInstrInfo &TII = *ST->getInstrInfo ();
@@ -673,7 +786,14 @@ bool LoongArchMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi20,
673786 return true ;
674787 }
675788
676- MRI->replaceRegWith (Lo12.getOperand (0 ).getReg (), Hi20.getOperand (0 ).getReg ());
789+ if (Hi20.getOpcode () == LoongArch::PCALAU12I) {
790+ MRI->replaceRegWith (Lo12.getOperand (0 ).getReg (),
791+ Hi20.getOperand (0 ).getReg ());
792+ } else if (Hi20.getOpcode () == LoongArch::LU12I_W) {
793+ MachineInstr *Add = &*MRI->use_instr_begin (Hi20.getOperand (0 ).getReg ());
794+ MRI->replaceRegWith (Lo12.getOperand (0 ).getReg (),
795+ Add->getOperand (0 ).getReg ());
796+ }
677797 Lo12.eraseFromParent ();
678798 return true ;
679799}
@@ -693,8 +813,21 @@ bool LoongArchMergeBaseOffsetOpt::runOnMachineFunction(MachineFunction &Fn) {
693813 MachineInstr *Lo20 = nullptr ;
694814 MachineInstr *Hi12 = nullptr ;
695815 MachineInstr *Last = nullptr ;
696- if (!detectFoldable (Hi20, Lo12, Lo20, Hi12, Last))
816+ if (Hi20.getOpcode () == LoongArch::PCALAU12I) {
817+ // Detect foldable pcala code sequence in small/medium/large code model.
818+ if (!detectFoldable (Hi20, Lo12, Lo20, Hi12, Last))
819+ continue ;
820+ } else if (Hi20.getOpcode () == LoongArch::LU12I_W) {
821+ MachineInstr *Add = nullptr ;
822+ // Detect foldable tls-le code sequence in small/medium code model.
823+ if (!detectFoldable (Hi20, Add, Lo12))
824+ continue ;
825+ } else {
697826 continue ;
827+ }
828+ // For tls-le, the second instruction (PseudoAddTPRel) is not passed, so
829+ // that the existing hooks can be reused; the last three parameters should
830+ // always be nullptr.
698831 MadeChange |= detectAndFoldOffset (Hi20, *Lo12, Lo20, Hi12, Last);
699832 MadeChange |= foldIntoMemoryOps (Hi20, *Lo12, Lo20, Hi12, Last);
700833 }
0 commit comments