Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
ce56f84
pre-commit test
yafet-a Aug 21, 2025
1c27d89
[BOLT] documentation
yafet-a Aug 21, 2025
db353b7
[BOLT][AArch64] Implement safe size-aware memcpy inlining
yafet-a Aug 21, 2025
2e5b22b
test target fix for CI cross-compilation issue
yafet-a Aug 22, 2025
385fa23
moved inline-memcpy to avoid CI cross-compilation PIE conflicts
yafet-a Aug 22, 2025
4f9ef67
removed old test
yafet-a Aug 22, 2025
e83126e
response to review
yafet-a Aug 22, 2025
cf8279a
Update conditional formatting and move check for size into binaryPasses
yafet-a Aug 27, 2025
c317eb0
Negative Tests (live-in, register move, non-mov instruction)
yafet-a Aug 27, 2025
df97d61
memcpy8 redundant handling removed
yafet-a Aug 27, 2025
25cfb58
nit: comment clean up
yafet-a Aug 27, 2025
e308855
minor refactor
yafet-a Aug 28, 2025
365a0bf
NFC: Post-review refactor
yafet-a Aug 28, 2025
84c904a
NFC: Test for corner case with size 0
yafet-a Aug 28, 2025
0561bcc
Use temp instead of argument registers
yafet-a Aug 28, 2025
cc49db7
Update early return
yafet-a Aug 28, 2025
115606b
Update tests to be more specific about registers + negative test on e…
yafet-a Aug 28, 2025
1986bfa
Complex test + register aliasing
yafet-a Aug 29, 2025
bd990ea
NFC use if initializer
yafet-a Sep 1, 2025
ee5f859
[style] trailing whitespaces removed
yafet-a Sep 4, 2025
ad503a7
[test] CHECK-NEXT used
yafet-a Sep 4, 2025
267432a
[test] updated negative test to check for negative size
yafet-a Sep 4, 2025
198744d
[nfc] minor refactor
yafet-a Sep 4, 2025
62b871e
[bug] memcpy call removed for sizes>64
yafet-a Sep 4, 2025
dcab6ac
[nfc][test] reordered test
yafet-a Sep 5, 2025
875156e
[nfc] added assert for default case (future-proofing for changes to B…
yafet-a Sep 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion bolt/docs/CommandLineArgumentReference.md
Original file line number Diff line number Diff line change
Expand Up @@ -631,7 +631,7 @@

- `--inline-memcpy`

Inline memcpy using 'rep movsb' instruction (X86-only)
Inline memcpy using optimized instruction sequences (X86: 'rep movsb', AArch64: width-optimized register operations)

- `--inline-small-functions`

Expand Down
16 changes: 16 additions & 0 deletions bolt/include/bolt/Core/MCPlusBuilder.h
Original file line number Diff line number Diff line change
Expand Up @@ -1895,6 +1895,22 @@ class MCPlusBuilder {
return {};
}

/// Creates a size-aware inline memcpy instruction sequence. If \p KnownSize
/// is provided, targets may generate straight-line code optimized for that
/// specific copy size. The default implementation ignores \p KnownSize and
/// delegates to the plain createInlineMemcpy(bool) overload, which is
/// sufficient e.g. on X86 where 'rep movsb' already handles any size held
/// in the count register.
///
/// \p ReturnEnd requests the _memcpy8-style expansion whose result is
/// dest + size rather than dest.
virtual InstructionListType
createInlineMemcpy(bool ReturnEnd, std::optional<uint64_t> KnownSize) const {
  return createInlineMemcpy(ReturnEnd);
}

/// Extract the immediate operand from an instruction that materializes a
/// constant into \p TargetReg (a move-immediate to exactly that register).
/// Returns std::nullopt when \p Inst is not such a move. The default
/// implementation recognizes nothing and always returns std::nullopt;
/// targets override this to enable size detection for memcpy inlining.
virtual std::optional<uint64_t>
extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const {
  return std::nullopt;
}

/// Create a target-specific relocation out of the \p Fixup.
/// Note that not every fixup could be converted into a relocation.
virtual std::optional<Relocation>
Expand Down
28 changes: 26 additions & 2 deletions bolt/lib/Passes/BinaryPasses.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1843,7 +1843,7 @@ Error StripRepRet::runOnFunctions(BinaryContext &BC) {
}

Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
if (!BC.isX86())
if (!BC.isX86() && !BC.isAArch64())
return Error::success();

uint64_t NumInlined = 0;
Expand All @@ -1866,8 +1866,32 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
const bool IsTailCall = BC.MIB->isTailCall(Inst);

// Extract the size of the copy from preceding instructions by looking
// for writes to the size register.
std::optional<uint64_t> KnownSize = std::nullopt;
BitVector WrittenRegs(BC.MRI->getNumRegs());

// Get the size register (3rd arg register, index 2 for AArch64)
MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2);

// Scan the instructions preceding the call for one that sets the size register
for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) {
MCInst &Inst = *InstIt;
WrittenRegs.reset(); // Clear and check what the instruction writes to
BC.MIB->getWrittenRegs(Inst, WrittenRegs);

// Check for writes to the size register
if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg]) {
if (std::optional<uint64_t> ExtractedSize =
BC.MIB->extractMoveImmediate(Inst, SizeReg)) {
KnownSize = *ExtractedSize;
break;
}
}
}

const InstructionListType NewCode =
BC.MIB->createInlineMemcpy(IsMemcpy8);
BC.MIB->createInlineMemcpy(IsMemcpy8, KnownSize);
II = BB.replaceInstruction(II, NewCode);
std::advance(II, NewCode.size() - 1);
if (IsTailCall) {
Expand Down
4 changes: 3 additions & 1 deletion bolt/lib/Rewrite/BinaryPassManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -247,7 +247,9 @@ static cl::opt<bool> Stoke("stoke", cl::desc("turn on the stoke analysis"),

static cl::opt<bool> StringOps(
"inline-memcpy",
cl::desc("inline memcpy using 'rep movsb' instruction (X86-only)"),
cl::desc(
"inline memcpy using size-specific optimized instructions "
"(X86: 'rep movsb', AArch64: width-optimized register operations)"),
cl::cat(BoltOptCategory));

static cl::opt<bool> StripRepRet(
Expand Down
204 changes: 204 additions & 0 deletions bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2597,6 +2597,210 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
getInstructionSize(const MCInst &Inst) const override {
return 4;
}

InstructionListType createInlineMemcpy(bool ReturnEnd) const override {
  // No size information available at this call site: delegate to the
  // size-aware overload with an unknown size.
  return createInlineMemcpy(ReturnEnd, std::nullopt);
}

std::optional<uint64_t>
extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const override {
if (Inst.getOpcode() == AArch64::MOVZXi && Inst.getNumOperands() >= 3) {
if (Inst.getOperand(0).isReg() &&
Inst.getOperand(0).getReg() == TargetReg &&
Inst.getOperand(1).isImm() && Inst.getOperand(2).isImm() &&
Inst.getOperand(2).getImm() == 0) {
return Inst.getOperand(1).getImm();
}
}
return std::nullopt;
}

InstructionListType
createInlineMemcpy(bool ReturnEnd,
                   std::optional<uint64_t> KnownSize) const override {
  // Expand an inlined memcpy(x0 = dest, x1 = src, x2 = size) call site.
  // When the copy size was recovered from a preceding move-immediate,
  // emit a straight-line load/store sequence tailored to it; with an
  // unknown size no copy code is emitted (the caller decides whether
  // inlining is applicable in that case).
  InstructionListType Code;

  // Emit the copy itself first: the generated stores address the
  // destination through X0, so X0 must still hold the original dest
  // pointer while they execute.
  if (KnownSize.has_value())
    generateSizeSpecificMemcpy(Code, *KnownSize);

  // For the _memcpy8 variant the result is dest + size. The pointer
  // adjustment must come AFTER the copy; advancing X0 before the stores
  // (as the previous ordering did) made every store write past the
  // intended destination.
  if (ReturnEnd) {
    if (KnownSize.has_value() && (*KnownSize >> 12) == 0) {
      // Size is known and fits ADD's 12-bit unsigned immediate (0-4095).
      Code.emplace_back(MCInstBuilder(AArch64::ADDXri)
                            .addReg(AArch64::X0)
                            .addReg(AArch64::X0)
                            .addImm(*KnownSize)
                            .addImm(0));
    } else {
      // Unknown or large size: X2 still holds the byte count.
      Code.emplace_back(MCInstBuilder(AArch64::ADDXrr)
                            .addReg(AArch64::X0)
                            .addReg(AArch64::X0)
                            .addReg(AArch64::X2));
    }
  }
  return Code;
}

// Append to \p Code an unrolled sequence copying exactly \p Size bytes from
// [X1] to [X0] and return the accumulated list. W3/X3 are used as the scalar
// scratch register and Q0/Q1 as SIMD scratch; at a memcpy call site these
// are call-clobbered registers anyway (the replaced call would have
// clobbered them), so overwriting them is safe. NOTE(review): X1/X2 are left
// untouched; only X0 is consumed as the store base.
InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code,
                                               uint64_t Size) const {
  // Generate optimal instruction sequences based on the exact size. The
  // LDR/STR "unsigned offset" forms scale their immediate by the access
  // width, hence the Offset / <width> operands below.
  switch (Size) {
  case 1:
    // Single-byte copy.
    Code.emplace_back(MCInstBuilder(AArch64::LDRBBui)
                          .addReg(AArch64::W3)
                          .addReg(AArch64::X1)
                          .addImm(0));
    Code.emplace_back(MCInstBuilder(AArch64::STRBBui)
                          .addReg(AArch64::W3)
                          .addReg(AArch64::X0)
                          .addImm(0));
    break;

  case 2:
    // 2-byte copy using a 16-bit load/store pair.
    Code.emplace_back(MCInstBuilder(AArch64::LDRHHui)
                          .addReg(AArch64::W3)
                          .addReg(AArch64::X1)
                          .addImm(0));
    Code.emplace_back(MCInstBuilder(AArch64::STRHHui)
                          .addReg(AArch64::W3)
                          .addReg(AArch64::X0)
                          .addImm(0));
    break;

  case 4:
    // 4-byte copy using a 32-bit load/store pair.
    Code.emplace_back(MCInstBuilder(AArch64::LDRWui)
                          .addReg(AArch64::W3)
                          .addReg(AArch64::X1)
                          .addImm(0));
    Code.emplace_back(MCInstBuilder(AArch64::STRWui)
                          .addReg(AArch64::W3)
                          .addReg(AArch64::X0)
                          .addImm(0));
    break;

  case 8:
    // 8-byte copy using a 64-bit load/store pair.
    Code.emplace_back(MCInstBuilder(AArch64::LDRXui)
                          .addReg(AArch64::X3)
                          .addReg(AArch64::X1)
                          .addImm(0));
    Code.emplace_back(MCInstBuilder(AArch64::STRXui)
                          .addReg(AArch64::X3)
                          .addReg(AArch64::X0)
                          .addImm(0));
    break;

  case 16:
    // 16-byte copy using one 128-bit SIMD transfer.
    Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
                          .addReg(AArch64::Q0)
                          .addReg(AArch64::X1)
                          .addImm(0));
    Code.emplace_back(MCInstBuilder(AArch64::STRQui)
                          .addReg(AArch64::Q0)
                          .addReg(AArch64::X0)
                          .addImm(0));
    break;

  case 32:
    // 32-byte copy using two independent 128-bit SIMD transfers (Q0/Q1
    // avoid a false dependence between the two chunks).
    Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
                          .addReg(AArch64::Q0)
                          .addReg(AArch64::X1)
                          .addImm(0));
    Code.emplace_back(MCInstBuilder(AArch64::STRQui)
                          .addReg(AArch64::Q0)
                          .addReg(AArch64::X0)
                          .addImm(0));
    Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
                          .addReg(AArch64::Q1)
                          .addReg(AArch64::X1)
                          .addImm(1));
    Code.emplace_back(MCInstBuilder(AArch64::STRQui)
                          .addReg(AArch64::Q1)
                          .addReg(AArch64::X0)
                          .addImm(1));
    break;

  default: {
    // General case: greedily copy with the widest transfer that still
    // fits, in descending order (16-, 8-, 4-, 2-, then 1-byte chunks).
    //
    // This now covers ALL sizes. Previously sizes above 64 cleared Code
    // and returned an empty sequence, which made the caller replace the
    // memcpy call with nothing - silently dropping the copy. Emitting a
    // correct (if longer) sequence is always safe; detectable sizes are
    // bounded by MOVZ's unshifted immediate (<= 65535), so every scaled
    // offset below stays within the 12-bit unsigned-offset encoding.
    //
    // Size == 0 deliberately emits nothing: deleting the call is the
    // correct expansion of a zero-byte memcpy.
    uint64_t Remaining = Size;
    uint64_t Offset = 0;

    while (Remaining >= 16) {
      Code.emplace_back(MCInstBuilder(AArch64::LDRQui)
                            .addReg(AArch64::Q0)
                            .addReg(AArch64::X1)
                            .addImm(Offset / 16));
      Code.emplace_back(MCInstBuilder(AArch64::STRQui)
                            .addReg(AArch64::Q0)
                            .addReg(AArch64::X0)
                            .addImm(Offset / 16));
      Remaining -= 16;
      Offset += 16;
    }
    if (Remaining >= 8) {
      Code.emplace_back(MCInstBuilder(AArch64::LDRXui)
                            .addReg(AArch64::X3)
                            .addReg(AArch64::X1)
                            .addImm(Offset / 8));
      Code.emplace_back(MCInstBuilder(AArch64::STRXui)
                            .addReg(AArch64::X3)
                            .addReg(AArch64::X0)
                            .addImm(Offset / 8));
      Remaining -= 8;
      Offset += 8;
    }
    if (Remaining >= 4) {
      Code.emplace_back(MCInstBuilder(AArch64::LDRWui)
                            .addReg(AArch64::W3)
                            .addReg(AArch64::X1)
                            .addImm(Offset / 4));
      Code.emplace_back(MCInstBuilder(AArch64::STRWui)
                            .addReg(AArch64::W3)
                            .addReg(AArch64::X0)
                            .addImm(Offset / 4));
      Remaining -= 4;
      Offset += 4;
    }
    if (Remaining >= 2) {
      Code.emplace_back(MCInstBuilder(AArch64::LDRHHui)
                            .addReg(AArch64::W3)
                            .addReg(AArch64::X1)
                            .addImm(Offset / 2));
      Code.emplace_back(MCInstBuilder(AArch64::STRHHui)
                            .addReg(AArch64::W3)
                            .addReg(AArch64::X0)
                            .addImm(Offset / 2));
      Remaining -= 2;
      Offset += 2;
    }
    if (Remaining == 1) {
      Code.emplace_back(MCInstBuilder(AArch64::LDRBBui)
                            .addReg(AArch64::W3)
                            .addReg(AArch64::X1)
                            .addImm(Offset));
      Code.emplace_back(MCInstBuilder(AArch64::STRBBui)
                            .addReg(AArch64::W3)
                            .addReg(AArch64::X0)
                            .addImm(Offset));
    }
    break;
  }
  }
  return Code;
}
};

} // end anonymous namespace
Expand Down
Loading
Loading