@@ -2620,6 +2620,122 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
26202620 getInstructionSize (const MCInst &Inst) const override {
26212621 return 4 ;
26222622 }
2623+
2624+ std::optional<uint64_t >
2625+ extractMoveImmediate (const MCInst &Inst, MCPhysReg TargetReg) const override {
2626+ // Match MOVZ instructions (both X and W register variants) with no shift.
2627+ if ((Inst.getOpcode () == AArch64::MOVZXi ||
2628+ Inst.getOpcode () == AArch64::MOVZWi) &&
2629+ Inst.getOperand (2 ).getImm () == 0 &&
2630+ getAliases (TargetReg)[Inst.getOperand (0 ).getReg ()])
2631+ return Inst.getOperand (1 ).getImm ();
2632+ return std::nullopt ;
2633+ }
2634+
2635+ std::optional<uint64_t >
2636+ findMemcpySizeInBytes (const BinaryBasicBlock &BB,
2637+ BinaryBasicBlock::iterator CallInst) const override {
2638+ MCPhysReg SizeReg = getIntArgRegister (2 );
2639+ if (SizeReg == getNoRegister ())
2640+ return std::nullopt ;
2641+
2642+ BitVector WrittenRegs (RegInfo->getNumRegs ());
2643+ const BitVector &SizeRegAliases = getAliases (SizeReg);
2644+
2645+ for (auto InstIt = BB.begin (); InstIt != CallInst; ++InstIt) {
2646+ const MCInst &Inst = *InstIt;
2647+ WrittenRegs.reset ();
2648+ getWrittenRegs (Inst, WrittenRegs);
2649+
2650+ if (WrittenRegs.anyCommon (SizeRegAliases))
2651+ return extractMoveImmediate (Inst, SizeReg);
2652+ }
2653+ return std::nullopt ;
2654+ }
2655+
2656+ InstructionListType
2657+ createInlineMemcpy (bool ReturnEnd,
2658+ std::optional<uint64_t > KnownSize) const override {
2659+ assert (KnownSize.has_value () &&
2660+ " AArch64 memcpy inlining requires known size" );
2661+ InstructionListType Code;
2662+ uint64_t Size = *KnownSize;
2663+
2664+ generateSizeSpecificMemcpy (Code, Size);
2665+
2666+ // If _memcpy8, adjust X0 to return dest+size instead of dest.
2667+ if (ReturnEnd)
2668+ Code.emplace_back (MCInstBuilder (AArch64::ADDXri)
2669+ .addReg (AArch64::X0)
2670+ .addReg (AArch64::X0)
2671+ .addImm (Size)
2672+ .addImm (0 ));
2673+ return Code;
2674+ }
2675+
2676+ InstructionListType generateSizeSpecificMemcpy (InstructionListType &Code,
2677+ uint64_t Size) const {
2678+ auto AddLoadStorePair = [&](unsigned LoadOpc, unsigned StoreOpc,
2679+ unsigned Reg, unsigned Offset = 0 ) {
2680+ Code.emplace_back (MCInstBuilder (LoadOpc)
2681+ .addReg (Reg)
2682+ .addReg (AArch64::X1)
2683+ .addImm (Offset));
2684+ Code.emplace_back (MCInstBuilder (StoreOpc)
2685+ .addReg (Reg)
2686+ .addReg (AArch64::X0)
2687+ .addImm (Offset));
2688+ };
2689+
2690+ // Generate optimal instruction sequences based on exact size.
2691+ switch (Size) {
2692+ case 1 :
2693+ AddLoadStorePair (AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9);
2694+ break ;
2695+ case 2 :
2696+ AddLoadStorePair (AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9);
2697+ break ;
2698+ case 4 :
2699+ AddLoadStorePair (AArch64::LDRWui, AArch64::STRWui, AArch64::W9);
2700+ break ;
2701+ case 8 :
2702+ AddLoadStorePair (AArch64::LDRXui, AArch64::STRXui, AArch64::X9);
2703+ break ;
2704+ case 16 :
2705+ AddLoadStorePair (AArch64::LDRQui, AArch64::STRQui, AArch64::Q16);
2706+ break ;
2707+ case 32 :
2708+ AddLoadStorePair (AArch64::LDRQui, AArch64::STRQui, AArch64::Q16, 0 );
2709+ AddLoadStorePair (AArch64::LDRQui, AArch64::STRQui, AArch64::Q17, 1 );
2710+ break ;
2711+
2712+ default :
2713+ // For sizes up to 64 bytes, greedily use the largest possible loads.
2714+ // Caller should have already filtered out sizes > 64 bytes.
2715+ assert (Size <= 64 &&
2716+ " Size should be <= 64 bytes for AArch64 memcpy inlining" );
2717+
2718+ uint64_t Remaining = Size;
2719+ uint64_t Offset = 0 ;
2720+
2721+ const std::array<std::tuple<uint64_t , unsigned , unsigned , unsigned >, 5 >
2722+ LoadStoreOps = {
2723+ {{16 , AArch64::LDRQui, AArch64::STRQui, AArch64::Q16},
2724+ {8 , AArch64::LDRXui, AArch64::STRXui, AArch64::X9},
2725+ {4 , AArch64::LDRWui, AArch64::STRWui, AArch64::W9},
2726+ {2 , AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9},
2727+ {1 , AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9}}};
2728+
2729+ for (const auto &[OpSize, LoadOp, StoreOp, TempReg] : LoadStoreOps)
2730+ while (Remaining >= OpSize) {
2731+ AddLoadStorePair (LoadOp, StoreOp, TempReg, Offset / OpSize);
2732+ Remaining -= OpSize;
2733+ Offset += OpSize;
2734+ }
2735+ break ;
2736+ }
2737+ return Code;
2738+ }
26232739};
26242740
26252741} // end anonymous namespace
0 commit comments