diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md
index f3881c9a640a9..3fc0594514f6e 100644
--- a/bolt/docs/CommandLineArgumentReference.md
+++ b/bolt/docs/CommandLineArgumentReference.md
@@ -631,7 +631,7 @@
 - `--inline-memcpy`
 
-  Inline memcpy using 'rep movsb' instruction (X86-only)
+  Inline memcpy using optimized instruction sequences (X86: 'rep movsb', AArch64: width-optimized register operations)
 
 - `--inline-small-functions`
 
diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index e773250ce8734..3192472f5fbe0 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -14,6 +14,7 @@
 #ifndef BOLT_CORE_MCPLUSBUILDER_H
 #define BOLT_CORE_MCPLUSBUILDER_H
 
+#include "bolt/Core/BinaryBasicBlock.h"
 #include "bolt/Core/MCPlus.h"
 #include "bolt/Core/Relocation.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -1888,6 +1889,15 @@ class MCPlusBuilder {
     return {};
   }
 
+  /// Find the memcpy size in bytes by examining preceding instructions.
+  /// Returns std::nullopt if the size cannot be determined (no-op for most
+  /// targets).
+  virtual std::optional<uint64_t>
+  findMemcpySizeInBytes(const BinaryBasicBlock &BB,
+                        BinaryBasicBlock::iterator CallInst) const {
+    return std::nullopt;
+  }
+
   /// Creates inline memcpy instruction. If \p ReturnEnd is true, then return
   /// (dest + n) instead of dest.
   virtual InstructionListType createInlineMemcpy(bool ReturnEnd) const {
@@ -1895,6 +1905,22 @@
     return {};
   }
 
+  /// Creates a size-aware inline memcpy sequence. If \p KnownSize is provided,
+  /// generates code optimized for that specific size. Falls back to the plain
+  /// createInlineMemcpy if the size is unknown or not needed (e.g., on X86).
+  virtual InstructionListType
+  createInlineMemcpy(bool ReturnEnd, std::optional<uint64_t> KnownSize) const {
+    return createInlineMemcpy(ReturnEnd);
+  }
+
+  /// Extract the immediate value from a move instruction that sets the given
+  /// register. Returns the immediate value if the instruction is a
+  /// move-immediate to \p TargetReg.
+  virtual std::optional<uint64_t>
+  extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const {
+    return std::nullopt;
+  }
+
   /// Create a target-specific relocation out of the \p Fixup.
   /// Note that not every fixup could be converted into a relocation.
   virtual std::optional<Relocation>
diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp
index d7f02b9470030..2f1bb21bc1fd8 100644
--- a/bolt/lib/Passes/BinaryPasses.cpp
+++ b/bolt/lib/Passes/BinaryPasses.cpp
@@ -1843,7 +1843,7 @@ Error StripRepRet::runOnFunctions(BinaryContext &BC) {
 }
 
 Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
-  if (!BC.isX86())
+  if (!BC.isX86() && !BC.isAArch64())
     return Error::success();
 
   uint64_t NumInlined = 0;
@@ -1866,8 +1866,16 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
       const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
       const bool IsTailCall = BC.MIB->isTailCall(Inst);
 
+        // Extract the copy size from preceding instructions (AArch64 only).
+        // Pattern: MOV X2, #size; BL memcpy(dest, src, size).
+        std::optional<uint64_t> KnownSize =
+            BC.MIB->findMemcpySizeInBytes(BB, II);
+
+        if (BC.isAArch64() && (!KnownSize.has_value() || *KnownSize > 64))
+          continue;
+
         const InstructionListType NewCode =
-            BC.MIB->createInlineMemcpy(IsMemcpy8);
+            BC.MIB->createInlineMemcpy(IsMemcpy8, KnownSize);
         II = BB.replaceInstruction(II, NewCode);
         std::advance(II, NewCode.size() - 1);
         if (IsTailCall) {
diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp
index 996d2e972599d..6b554598cf1bc 100644
--- a/bolt/lib/Rewrite/BinaryPassManager.cpp
+++ b/bolt/lib/Rewrite/BinaryPassManager.cpp
@@ -247,7 +247,9 @@ static cl::opt<bool> Stoke("stoke", cl::desc("turn on the stoke analysis"),
                            cl::cat(BoltOptCategory));
 
 static cl::opt<bool> StringOps(
     "inline-memcpy",
-    cl::desc("inline memcpy using 'rep movsb' instruction (X86-only)"),
+    cl::desc(
+        "inline memcpy using size-specific optimized instructions "
+        "(X86: 'rep movsb', AArch64: width-optimized register operations)"),
     cl::cat(BoltOptCategory));
 
 static cl::opt<bool> StripRepRet(
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 973261765f951..eb402a5681c53 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2597,6 +2597,122 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
   getInstructionSize(const MCInst &Inst) const override {
     return 4;
   }
+
+  std::optional<uint64_t>
+  extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const override {
+    // Match MOVZ instructions (both X and W register variants) with no shift.
+    if ((Inst.getOpcode() == AArch64::MOVZXi ||
+         Inst.getOpcode() == AArch64::MOVZWi) &&
+        Inst.getOperand(2).getImm() == 0 &&
+        getAliases(TargetReg)[Inst.getOperand(0).getReg()])
+      return Inst.getOperand(1).getImm();
+    return std::nullopt;
+  }
+
+  std::optional<uint64_t>
+  findMemcpySizeInBytes(const BinaryBasicBlock &BB,
+                        BinaryBasicBlock::iterator CallInst) const override {
+    MCPhysReg SizeReg = getIntArgRegister(2);
+    if (SizeReg == getNoRegister())
+      return std::nullopt;
+
+    BitVector WrittenRegs(RegInfo->getNumRegs());
+    const BitVector &SizeRegAliases = getAliases(SizeReg);
+
+    for (auto InstIt = BB.begin(); InstIt != CallInst; ++InstIt) {
+      const MCInst &Inst = *InstIt;
+      WrittenRegs.reset();
+      getWrittenRegs(Inst, WrittenRegs);
+
+      if (WrittenRegs.anyCommon(SizeRegAliases))
+        return extractMoveImmediate(Inst, SizeReg);
+    }
+    return std::nullopt;
+  }
+
+  InstructionListType
+  createInlineMemcpy(bool ReturnEnd,
+                     std::optional<uint64_t> KnownSize) const override {
+    assert(KnownSize.has_value() &&
+           "AArch64 memcpy inlining requires known size");
+    InstructionListType Code;
+    uint64_t Size = *KnownSize;
+
+    generateSizeSpecificMemcpy(Code, Size);
+
+    // For _memcpy8, adjust X0 to return dest+size instead of dest.
+    if (ReturnEnd)
+      Code.emplace_back(MCInstBuilder(AArch64::ADDXri)
+                            .addReg(AArch64::X0)
+                            .addReg(AArch64::X0)
+                            .addImm(Size)
+                            .addImm(0));
+    return Code;
+  }
+
+  InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code,
+                                                 uint64_t Size) const {
+    auto AddLoadStorePair = [&](unsigned LoadOpc, unsigned StoreOpc,
+                                unsigned Reg, unsigned Offset = 0) {
+      Code.emplace_back(MCInstBuilder(LoadOpc)
+                            .addReg(Reg)
+                            .addReg(AArch64::X1)
+                            .addImm(Offset));
+      Code.emplace_back(MCInstBuilder(StoreOpc)
+                            .addReg(Reg)
+                            .addReg(AArch64::X0)
+                            .addImm(Offset));
+    };
+
+    // Generate the optimal instruction sequence based on the exact size.
+    switch (Size) {
+    case 1:
+      AddLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9);
+      break;
+    case 2:
+      AddLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9);
+      break;
+    case 4:
+      AddLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W9);
+      break;
+    case 8:
+      AddLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X9);
+      break;
+    case 16:
+      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q16);
+      break;
+    case 32:
+      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q16, 0);
+      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q17, 1);
+      break;
+
+    default:
+      // For sizes up to 64 bytes, greedily use the largest possible loads.
+      // The caller should have already filtered out sizes > 64 bytes.
+      assert(Size <= 64 &&
+             "Size should be <= 64 bytes for AArch64 memcpy inlining");
+
+      uint64_t Remaining = Size;
+      uint64_t Offset = 0;
+
+      const std::array<std::tuple<uint64_t, unsigned, unsigned, unsigned>, 5>
+          LoadStoreOps = {
+              {{16, AArch64::LDRQui, AArch64::STRQui, AArch64::Q16},
+               {8, AArch64::LDRXui, AArch64::STRXui, AArch64::X9},
+               {4, AArch64::LDRWui, AArch64::STRWui, AArch64::W9},
+               {2, AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9},
+               {1, AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9}}};
+
+      for (const auto &[OpSize, LoadOp, StoreOp, TempReg] : LoadStoreOps)
+        while (Remaining >= OpSize) {
+          AddLoadStorePair(LoadOp, StoreOp, TempReg, Offset / OpSize);
+          Remaining -= OpSize;
+          Offset += OpSize;
+        }
+      break;
+    }
+    return Code;
+  }
 };
 
 } // end anonymous namespace
diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s
new file mode 100644
index 0000000000000..dc59a08b889a7
--- /dev/null
+++ b/bolt/test/runtime/AArch64/inline-memcpy.s
@@ -0,0 +1,384 @@
+## This test checks that BOLT correctly inlines memcpy calls on AArch64.
+ +# REQUIRES: system-linux, aarch64-registered-target + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o +# RUN: %clang %cflags -no-pie %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE +# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM + +# Verify BOLT reports that it inlined memcpy calls (11 successful inlines out of 16 total calls) +# CHECK-INLINE: BOLT-INFO: inlined 11 memcpy() calls + +# Each function should use optimal size-specific instructions and NO memcpy calls + +# 1-byte copy should use single byte load/store (ldrb/strb) +# CHECK-ASM-LABEL: : +# CHECK-ASM: ldrb{{.*}}w9, [x1] +# CHECK-ASM-NEXT: strb{{.*}}w9, [x0] +# CHECK-ASM-NOT: bl{{.*}}: +# CHECK-ASM: ldrh{{.*}}w9, [x1] +# CHECK-ASM-NEXT: strh{{.*}}w9, [x0] +# CHECK-ASM-NOT: bl{{.*}}: +# CHECK-ASM: ldr{{.*}}w9, [x1] +# CHECK-ASM-NEXT: str{{.*}}w9, [x0] +# CHECK-ASM-NOT: bl{{.*}}: +# CHECK-ASM: ldr{{.*}}x9, [x1] +# CHECK-ASM-NEXT: str{{.*}}x9, [x0] +# CHECK-ASM-NOT: bl{{.*}}: +# CHECK-ASM: ldr{{.*}}q16, [x1] +# CHECK-ASM-NEXT: str{{.*}}q16, [x0] +# CHECK-ASM-NOT: bl{{.*}}: +# CHECK-ASM: ldr{{.*}}q16, [x1] +# CHECK-ASM-NEXT: str{{.*}}q16, [x0] +# CHECK-ASM-NEXT: ldr{{.*}}q17, [x1, #0x10] +# CHECK-ASM-NEXT: str{{.*}}q17, [x0, #0x10] +# CHECK-ASM-NOT: bl{{.*}}: +# CHECK-ASM: ldr{{.*}}q16, [x1] +# CHECK-ASM-NEXT: str{{.*}}q16, [x0] +# CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x10] +# CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x10] +# CHECK-ASM-NEXT: ldr{{.*}}w9, [x1, #0x20] +# CHECK-ASM-NEXT: str{{.*}}w9, [x0, #0x20] +# CHECK-ASM-NEXT: ldrb{{.*}}w9, [x1, #0x24] +# CHECK-ASM-NEXT: strb{{.*}}w9, [x0, #0x24] +# CHECK-ASM-NOT: bl{{.*}}: +# CHECK-ASM-NOT: ldr +# CHECK-ASM-NOT: str +# CHECK-ASM-NOT: bl{{.*}}: +# CHECK-ASM: bl{{.*}}: +# CHECK-ASM: bl{{.*}}: +# CHECK-ASM: bl{{.*}}: +# CHECK-ASM: bl{{.*}}: +# CHECK-ASM: bl{{.*}}: +# CHECK-ASM: ldr{{.*}}w9, [x1] +# CHECK-ASM-NEXT: str{{.*}}w9, [x0] +# CHECK-ASM-NEXT: add{{.*}}x0, x0, #0x4 +# CHECK-ASM-NOT: bl{{.*}}<_memcpy8 + +# Complex function with caller-saved X9 should inline 8-byte memcpy using X9 as temp register +# CHECK-ASM-LABEL: : +# CHECK-ASM: ldr{{.*}}x9, [x1] +# CHECK-ASM-NEXT: str{{.*}}x9, [x0] +# CHECK-ASM-NOT: bl{{.*}}: +# CHECK-ASM: ldr{{.*}}q16, [x1] +# CHECK-ASM-NEXT: str{{.*}}q16, [x0] +# CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x10] +# CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x10] +# CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x20] +# CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x20] +# CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x30] +# CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x30] +# CHECK-ASM-NOT: bl{{.*}}