Merged
Commits (26)
ce56f84 - pre-commit test (yafet-a, Aug 21, 2025)
1c27d89 - [BOLT] documentation (yafet-a, Aug 21, 2025)
db353b7 - [BOLT][AArch64] Implement safe size-aware memcpy inlining (yafet-a, Aug 21, 2025)
2e5b22b - test target fix for CI cross-compilation issue (yafet-a, Aug 22, 2025)
385fa23 - moved inline-memcpy to avoid CI cross-compilation PIE conflicts (yafet-a, Aug 22, 2025)
4f9ef67 - removed old test (yafet-a, Aug 22, 2025)
e83126e - response to review (yafet-a, Aug 22, 2025)
cf8279a - Update conditional formatting and move check for size into binaryPasses (yafet-a, Aug 27, 2025)
c317eb0 - Negative Tests (live-in, register move, non-mov instruction) (yafet-a, Aug 27, 2025)
df97d61 - memcpy8 redundant handling removed (yafet-a, Aug 27, 2025)
25cfb58 - nit: comment clean up (yafet-a, Aug 27, 2025)
e308855 - minor refactor (yafet-a, Aug 28, 2025)
365a0bf - NFC: Post-review refactor (yafet-a, Aug 28, 2025)
84c904a - NFC: Test for corner case with size 0 (yafet-a, Aug 28, 2025)
0561bcc - Use temp instead of argument registers (yafet-a, Aug 28, 2025)
cc49db7 - Update early return (yafet-a, Aug 28, 2025)
115606b - Update tests to be more specific about registers + negative test on e… (yafet-a, Aug 28, 2025)
1986bfa - Complex test + register aliasing (yafet-a, Aug 29, 2025)
bd990ea - NFC use if initializer (yafet-a, Sep 1, 2025)
ee5f859 - [style] trailing whitespaces removed (yafet-a, Sep 4, 2025)
ad503a7 - [test] CHECK-NEXT used (yafet-a, Sep 4, 2025)
267432a - [test] updated negative test to check for negative size (yafet-a, Sep 4, 2025)
198744d - [nfc] minor refactor (yafet-a, Sep 4, 2025)
62b871e - [bug] memcpy call removed for sizes>64 (yafet-a, Sep 4, 2025)
dcab6ac - [nfc][test] reordered test (yafet-a, Sep 5, 2025)
875156e - [nfc] added assert for default case (future-proofing for changes to B… (yafet-a, Sep 5, 2025)
2 changes: 1 addition & 1 deletion bolt/docs/CommandLineArgumentReference.md
@@ -631,7 +631,7 @@

- `--inline-memcpy`

-  Inline memcpy using 'rep movsb' instruction (X86-only)
+  Inline memcpy using optimized instruction sequences (X86: 'rep movsb', AArch64: width-optimized register operations)

- `--inline-small-functions`

26 changes: 26 additions & 0 deletions bolt/include/bolt/Core/MCPlusBuilder.h
@@ -14,6 +14,7 @@
#ifndef BOLT_CORE_MCPLUSBUILDER_H
#define BOLT_CORE_MCPLUSBUILDER_H

#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/MCPlus.h"
#include "bolt/Core/Relocation.h"
#include "llvm/ADT/ArrayRef.h"
@@ -1888,13 +1889,38 @@ class MCPlusBuilder {
    return {};
  }

  /// Find memcpy size in bytes by using preceding instructions.
  /// Returns std::nullopt if the size cannot be determined (no-op for most
  /// targets).
  virtual std::optional<uint64_t>
  findMemcpySizeInBytes(const BinaryBasicBlock &BB,
                        BinaryBasicBlock::iterator CallInst) const {
    return std::nullopt;
  }

  /// Creates inline memcpy instruction. If \p ReturnEnd is true, then return
  /// (dest + n) instead of dest.
  virtual InstructionListType createInlineMemcpy(bool ReturnEnd) const {
    llvm_unreachable("not implemented");
    return {};
  }

  /// Creates size-aware inline memcpy instructions. If \p KnownSize is
  /// provided, generates optimized code for that specific size. Falls back
  /// to the regular createInlineMemcpy if the size is unknown or not needed
  /// (e.g. on X86).
  virtual InstructionListType
  createInlineMemcpy(bool ReturnEnd, std::optional<uint64_t> KnownSize) const {
    return createInlineMemcpy(ReturnEnd);
  }

  /// Extract the immediate value from a move instruction that sets the given
  /// register. Returns the immediate value if the instruction is a
  /// move-immediate to \p TargetReg.
  virtual std::optional<uint64_t>
  extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const {
    return std::nullopt;
  }

  /// Create a target-specific relocation out of the \p Fixup.
  /// Note that not every fixup could be converted into a relocation.
  virtual std::optional<Relocation>
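Taken together, the new hooks separate size discovery (findMemcpySizeInBytes, extractMoveImmediate) from code generation (the two-argument createInlineMemcpy), and the defaults keep size-unaware targets on their old path. A minimal self-contained model of that overload layering, with hypothetical names; this is an illustration of the dispatch pattern only, not BOLT code:

#include <cstdint>
#include <iostream>
#include <optional>

// Toy stand-ins for MCPlusBuilder's hook layering: the size-aware overload
// defaults to delegating, so targets that never override it (e.g. X86)
// keep their existing size-unaware expansion.
struct Builder {
  virtual ~Builder() = default;
  virtual void inlineMemcpy() const { std::cout << "generic expansion\n"; }
  virtual void inlineMemcpy(std::optional<uint64_t> /*Size*/) const {
    inlineMemcpy(); // fall back to the size-unaware expansion
  }
};

struct AArch64Builder : Builder {
  using Builder::inlineMemcpy; // keep the zero-argument overload visible
  void inlineMemcpy(std::optional<uint64_t> Size) const override {
    std::cout << "size-specific expansion, " << Size.value_or(0) << " bytes\n";
  }
};

int main() {
  const AArch64Builder B;
  B.inlineMemcpy(std::optional<uint64_t>{16}); // size-aware path
  B.inlineMemcpy();                            // generic fallback
}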
12 changes: 10 additions & 2 deletions bolt/lib/Passes/BinaryPasses.cpp
@@ -1843,7 +1843,7 @@ Error StripRepRet::runOnFunctions(BinaryContext &BC) {
}

Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
-  if (!BC.isX86())
+  if (!BC.isX86() && !BC.isAArch64())
    return Error::success();

  uint64_t NumInlined = 0;
@@ -1866,8 +1866,16 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
      const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
      const bool IsTailCall = BC.MIB->isTailCall(Inst);

+      // Extract the size from preceding instructions (AArch64 only).
+      // Pattern: MOV X2, #<size>; BL memcpy (X0 = dest, X1 = src, X2 = size).
+      std::optional<uint64_t> KnownSize =
+          BC.MIB->findMemcpySizeInBytes(BB, II);
+
+      if (BC.isAArch64() && (!KnownSize.has_value() || *KnownSize > 64))
+        continue;
+
      const InstructionListType NewCode =
-          BC.MIB->createInlineMemcpy(IsMemcpy8);
+          BC.MIB->createInlineMemcpy(IsMemcpy8, KnownSize);
      II = BB.replaceInstruction(II, NewCode);
      std::advance(II, NewCode.size() - 1);
      if (IsTailCall) {
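For context on the pattern the pass now matches: a fixed-size memcpy call that survives compilation is what leaves the MOV-immediate-plus-BL shape in the binary. A small illustrative source function follows; the assembly shown is a typical lowering and an assumption, not output quoted from this PR's tests:

#include <cstring>

// On AArch64, when the compiler keeps this as a real call, it typically
// lowers to:
//   mov x2, #32
//   bl  memcpy
// i.e. a move-immediate into the size-argument register X2 followed by the
// call, which is exactly the shape findMemcpySizeInBytes scans for ahead of
// the BL.
void copy32(void *Dst, const void *Src) { std::memcpy(Dst, Src, 32); }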
4 changes: 3 additions & 1 deletion bolt/lib/Rewrite/BinaryPassManager.cpp
@@ -247,7 +247,9 @@ static cl::opt<bool> Stoke("stoke", cl::desc("turn on the stoke analysis"),

static cl::opt<bool> StringOps(
    "inline-memcpy",
-    cl::desc("inline memcpy using 'rep movsb' instruction (X86-only)"),
+    cl::desc(
+        "inline memcpy using size-specific optimized instructions "
+        "(X86: 'rep movsb', AArch64: width-optimized register operations)"),
    cl::cat(BoltOptCategory));

static cl::opt<bool> StripRepRet(
116 changes: 116 additions & 0 deletions bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -2597,6 +2597,122 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
  getInstructionSize(const MCInst &Inst) const override {
    return 4;
  }

  std::optional<uint64_t>
  extractMoveImmediate(const MCInst &Inst,
                       MCPhysReg TargetReg) const override {
    // Match MOVZ instructions (both X and W register variants) with no shift.
    if ((Inst.getOpcode() == AArch64::MOVZXi ||
         Inst.getOpcode() == AArch64::MOVZWi) &&
        Inst.getOperand(2).getImm() == 0 &&
        getAliases(TargetReg)[Inst.getOperand(0).getReg()])
      return Inst.getOperand(1).getImm();
    return std::nullopt;
  }

  std::optional<uint64_t>
  findMemcpySizeInBytes(const BinaryBasicBlock &BB,
                        BinaryBasicBlock::iterator CallInst) const override {
    MCPhysReg SizeReg = getIntArgRegister(2);
    if (SizeReg == getNoRegister())
      return std::nullopt;

    BitVector WrittenRegs(RegInfo->getNumRegs());
    const BitVector &SizeRegAliases = getAliases(SizeReg);

    for (auto InstIt = BB.begin(); InstIt != CallInst; ++InstIt) {
      const MCInst &Inst = *InstIt;
      WrittenRegs.reset();
      getWrittenRegs(Inst, WrittenRegs);

      if (WrittenRegs.anyCommon(SizeRegAliases))
        return extractMoveImmediate(Inst, SizeReg);
    }
    return std::nullopt;
  }

  InstructionListType
  createInlineMemcpy(bool ReturnEnd,
                     std::optional<uint64_t> KnownSize) const override {
    assert(KnownSize.has_value() &&
           "AArch64 memcpy inlining requires known size");
    InstructionListType Code;
    uint64_t Size = *KnownSize;

    generateSizeSpecificMemcpy(Code, Size);

    // If _memcpy8, adjust X0 to return dest+size instead of dest.
    if (ReturnEnd)
      Code.emplace_back(MCInstBuilder(AArch64::ADDXri)
                            .addReg(AArch64::X0)
                            .addReg(AArch64::X0)
                            .addImm(Size)
                            .addImm(0));
    return Code;
  }

  InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code,
                                                 uint64_t Size) const {
    auto AddLoadStorePair = [&](unsigned LoadOpc, unsigned StoreOpc,
                                unsigned Reg, unsigned Offset = 0) {
      Code.emplace_back(MCInstBuilder(LoadOpc)
                            .addReg(Reg)
                            .addReg(AArch64::X1)
                            .addImm(Offset));
      Code.emplace_back(MCInstBuilder(StoreOpc)
                            .addReg(Reg)
                            .addReg(AArch64::X0)
                            .addImm(Offset));
    };

    // Generate optimal instruction sequences based on exact size.
    switch (Size) {
    case 1:
      AddLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9);
      break;
    case 2:
      AddLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9);
      break;
    case 4:
      AddLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W9);
      break;
    case 8:
      AddLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X9);
      break;
    case 16:
      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q16);
      break;
    case 32:
      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q16, 0);
      AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q17, 1);
      break;

    default:
      // For sizes up to 64 bytes, greedily use the largest possible loads.
      // Caller should have already filtered out sizes > 64 bytes.
      assert(Size <= 64 &&
             "Size should be <= 64 bytes for AArch64 memcpy inlining");

      uint64_t Remaining = Size;
      uint64_t Offset = 0;

      const std::array<std::tuple<uint64_t, unsigned, unsigned, unsigned>, 5>
          LoadStoreOps = {
              {{16, AArch64::LDRQui, AArch64::STRQui, AArch64::Q16},
               {8, AArch64::LDRXui, AArch64::STRXui, AArch64::X9},
               {4, AArch64::LDRWui, AArch64::STRWui, AArch64::W9},
               {2, AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9},
               {1, AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9}}};

      for (const auto &[OpSize, LoadOp, StoreOp, TempReg] : LoadStoreOps)
        while (Remaining >= OpSize) {
          AddLoadStorePair(LoadOp, StoreOp, TempReg, Offset / OpSize);
          Remaining -= OpSize;
          Offset += OpSize;
        }
      break;
    }
    return Code;
  }
};

} // end anonymous namespace
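The default case's greedy decomposition is easiest to see on a concrete size. Below is a standalone sketch in plain C++ that mirrors the width table above and prints the pairs chosen for a 13-byte copy; the scaled immediate is why the builder passes Offset / OpSize to the load/store pair:

#include <array>
#include <cstdint>
#include <iostream>

int main() {
  // Same preference order as LoadStoreOps above: widest access first.
  const std::array<uint64_t, 5> Widths = {16, 8, 4, 2, 1};
  uint64_t Remaining = 13, Offset = 0;
  for (uint64_t W : Widths)
    while (Remaining >= W) {
      // Unsigned-offset loads/stores (e.g. LDRXui) encode the byte offset
      // scaled by the access width, hence Offset / W.
      std::cout << W << "-byte copy at offset " << Offset << " (scaled imm "
                << Offset / W << ")\n";
      Remaining -= W;
      Offset += W;
    }
  // Prints an 8-byte, a 4-byte, and a 1-byte pair: 13 = 8 + 4 + 1, matching
  // the ldr/str sequence the builder emits for Size = 13.
}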