@@ -2517,21 +2517,17 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
   createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf,
                        unsigned CodePointerSize) const override {
     unsigned int I = 0;
-    InstructionListType Instrs(10);
+    InstructionListType Instrs(6);
 
     createPushRegisters(Instrs[I++], AArch64::X0, AArch64::X1);
-    getSystemFlag(Instrs[I++], AArch64::X1);
     InstructionListType Addr = materializeAddress(Target, Ctx, AArch64::X0);
     assert(Addr.size() == 2 && "Invalid Addr size");
     std::copy(Addr.begin(), Addr.end(), Instrs.begin() + I);
     I += Addr.size();
-    storeReg(Instrs[I++], AArch64::X2, AArch64::SP);
-    InstructionListType Insts = createIncMemory(AArch64::X0, AArch64::X2);
+    InstructionListType Insts = createIncMemory(AArch64::X0, AArch64::X1);
     assert(Insts.size() == 2 && "Invalid Insts size");
     std::copy(Insts.begin(), Insts.end(), Instrs.begin() + I);
     I += Insts.size();
-    loadReg(Instrs[I++], AArch64::X2, AArch64::SP);
-    setSystemFlag(Instrs[I++], AArch64::X1);
     createPopRegisters(Instrs[I++], AArch64::X0, AArch64::X1);
     return Instrs;
   }
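The slimmer sequence drops exactly four instructions: the NZCV flag save/restore (`getSystemFlag`/`setSystemFlag`) and the X2 spill/reload around the increment, with X1 now doubling as the scratch register. That accounts for the preallocation shrinking from 10 to 6 slots. A minimal sketch of the slot accounting, using string placeholders for the real MCInsts (the placeholder mnemonics are illustrative assumptions, not taken from BOLT):

```cpp
#include <cassert>
#include <cstddef>
#include <vector>

// Sketch of the trimmed createInstrIncMemory layout: 1 push + 2 address
// materialization + 2 increment + 1 pop = 6 slots, no flag or X2 traffic.
int main() {
  std::vector<const char *> Instrs(6);
  std::size_t I = 0;
  Instrs[I++] = "push {x0, x1}";        // createPushRegisters
  Instrs[I++] = "materializeAddress#0"; // Target address -> X0 (2 insts,
  Instrs[I++] = "materializeAddress#1"; //   per the Addr.size() == 2 assert)
  Instrs[I++] = "createIncMemory#0";    // increment [X0], scratch X1 (2 insts,
  Instrs[I++] = "createIncMemory#1";    //   per the Insts.size() == 2 assert)
  Instrs[I++] = "pop {x0, x1}";         // createPopRegisters
  assert(I == Instrs.size());           // every slot filled; none left over
  return 0;
}
```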
@@ -2620,6 +2616,122 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
   getInstructionSize(const MCInst &Inst) const override {
     return 4;
   }
+
+  std::optional<uint64_t>
+  extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const override {
+    // Match MOVZ instructions (both X and W register variants) with no shift.
+    if ((Inst.getOpcode() == AArch64::MOVZXi ||
+         Inst.getOpcode() == AArch64::MOVZWi) &&
+        Inst.getOperand(2).getImm() == 0 &&
+        getAliases(TargetReg)[Inst.getOperand(0).getReg()])
+      return Inst.getOperand(1).getImm();
+    return std::nullopt;
+  }
+
+  std::optional<uint64_t>
+  findMemcpySizeInBytes(const BinaryBasicBlock &BB,
+                        BinaryBasicBlock::iterator CallInst) const override {
+    MCPhysReg SizeReg = getIntArgRegister(2);
+    if (SizeReg == getNoRegister())
+      return std::nullopt;
+
+    BitVector WrittenRegs(RegInfo->getNumRegs());
+    const BitVector &SizeRegAliases = getAliases(SizeReg);
+
+    for (auto InstIt = BB.begin(); InstIt != CallInst; ++InstIt) {
+      const MCInst &Inst = *InstIt;
+      WrittenRegs.reset();
+      getWrittenRegs(Inst, WrittenRegs);
+
+      if (WrittenRegs.anyCommon(SizeRegAliases))
+        return extractMoveImmediate(Inst, SizeReg);
+    }
+    return std::nullopt;
+  }
+
+  InstructionListType
+  createInlineMemcpy(bool ReturnEnd,
+                     std::optional<uint64_t> KnownSize) const override {
+    assert(KnownSize.has_value() &&
+           "AArch64 memcpy inlining requires known size");
+    InstructionListType Code;
+    uint64_t Size = *KnownSize;
+
+    generateSizeSpecificMemcpy(Code, Size);
+
+    // If _memcpy8, adjust X0 to return dest+size instead of dest.
+    if (ReturnEnd)
+      Code.emplace_back(MCInstBuilder(AArch64::ADDXri)
+                            .addReg(AArch64::X0)
+                            .addReg(AArch64::X0)
+                            .addImm(Size)
+                            .addImm(0));
+    return Code;
+  }
+
+  InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code,
+                                                 uint64_t Size) const {
+    auto AddLoadStorePair = [&](unsigned LoadOpc, unsigned StoreOpc,
+                                unsigned Reg, unsigned Offset = 0) {
+      Code.emplace_back(MCInstBuilder(LoadOpc)
+                            .addReg(Reg)
+                            .addReg(AArch64::X1)
+                            .addImm(Offset));
+      Code.emplace_back(MCInstBuilder(StoreOpc)
+                            .addReg(Reg)
+                            .addReg(AArch64::X0)
+                            .addImm(Offset));
+    };
+
+    // Generate optimal instruction sequences based on exact size.
+    switch (Size) {
+    case 1:
+      AddLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9);
+      break;
+    case 2:
+      AddLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9);
+      break;
+    case 4:
+      AddLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W9);
+      break;
+    case 8:
+      AddLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X9);
+      break;
+    case 16:
+      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q16);
+      break;
+    case 32:
+      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q16, 0);
+      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q17, 1);
+      break;
+
+    default:
+      // For sizes up to 64 bytes, greedily use the largest possible loads.
+      // Caller should have already filtered out sizes > 64 bytes.
+      assert(Size <= 64 &&
+             "Size should be <= 64 bytes for AArch64 memcpy inlining");
+
+      uint64_t Remaining = Size;
+      uint64_t Offset = 0;
+
+      const std::array<std::tuple<uint64_t, unsigned, unsigned, unsigned>, 5>
+          LoadStoreOps = {
+              {{16, AArch64::LDRQui, AArch64::STRQui, AArch64::Q16},
+               {8, AArch64::LDRXui, AArch64::STRXui, AArch64::X9},
+               {4, AArch64::LDRWui, AArch64::STRWui, AArch64::W9},
+               {2, AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9},
+               {1, AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9}}};
+
+      for (const auto &[OpSize, LoadOp, StoreOp, TempReg] : LoadStoreOps)
+        while (Remaining >= OpSize) {
+          AddLoadStorePair(LoadOp, StoreOp, TempReg, Offset / OpSize);
+          Remaining -= OpSize;
+          Offset += OpSize;
+        }
+      break;
+    }
+    return Code;
+  }
 };
 
 } // end anonymous namespace
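`findMemcpySizeInBytes` walks the block forward to the call and only succeeds when the first writer of the size register (X2/W2, the third integer argument) is an unshifted MOVZ, so only literal sizes in the 0-65535 immediate range are ever recovered. A toy model of that scan, with the hypothetical `Inst` and `findSize` standing in for BOLT's MCInst machinery:

```cpp
#include <cstddef>
#include <cstdint>
#include <optional>
#include <vector>

// Toy model (not BOLT code) of the scan in findMemcpySizeInBytes: walk the
// block up to the call; at the first instruction that clobbers the size
// register, try to read it as an unshifted MOVZ immediate, else give up.
struct Inst {
  bool WritesSizeReg;              // clobbers X2/W2 or an alias?
  std::optional<uint64_t> MovzImm; // set iff it is MOVZ{X,W}i with shift 0
};

std::optional<uint64_t> findSize(const std::vector<Inst> &Block,
                                 std::size_t CallIdx) {
  for (std::size_t I = 0; I < CallIdx; ++I)
    if (Block[I].WritesSizeReg)
      return Block[I].MovzImm; // non-MOVZ writer => std::nullopt
  return std::nullopt;         // size never set in this block
}

int main() {
  // mov w2, #32; bl memcpy  -> size 32 is recoverable.
  std::vector<Inst> B1 = {{true, 32}, {false, {}}};
  // mov w2, w8;  bl memcpy  -> writer is not a MOVZ imm, no inlining.
  std::vector<Inst> B2 = {{true, {}}, {false, {}}};
  return (findSize(B1, 1) == std::optional<uint64_t>(32) && !findSize(B2, 1))
             ? 0
             : 1;
}
```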
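For sizes without a dedicated case, the default branch peels off the largest power-of-two chunk that still fits. One subtlety worth calling out: AArch64's unsigned-offset loads and stores (LDR*ui/STR*ui) scale their immediate by the access size, hence the `Offset / OpSize` in the emitted pair and the immediate of 1 for the second 16-byte copy in the 32-byte case. The division stays exact because chunk sizes descend by powers of two, so `Offset` is always a multiple of the current `OpSize`. A standalone re-run of the same loop (a sketch, not BOLT code) that prints the decomposition for a sample 13-byte copy:

```cpp
#include <array>
#include <cstdint>
#include <cstdio>
#include <utility>

// Replays the default-case decomposition of generateSizeSpecificMemcpy for
// one size and prints each chunk with its scaled immediate (Offset / OpSize).
int main() {
  const std::array<std::pair<uint64_t, const char *>, 5> Ops = {
      {{16, "q"}, {8, "x"}, {4, "w"}, {2, "h"}, {1, "b"}}};

  uint64_t Size = 13; // example: 13 bytes -> 8 + 4 + 1
  uint64_t Remaining = Size, Offset = 0;
  for (const auto &[OpSize, Kind] : Ops)
    while (Remaining >= OpSize) {
      std::printf("%s-copy of %llu bytes at byte offset %llu (imm %llu)\n",
                  Kind, (unsigned long long)OpSize, (unsigned long long)Offset,
                  (unsigned long long)(Offset / OpSize));
      Remaining -= OpSize;
      Offset += OpSize;
    }
  return 0;
}
```

For 13 bytes this prints an 8-byte copy at offset 0 (imm 0), a 4-byte copy at offset 8 (imm 2), and a single-byte copy at offset 12 (imm 12).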