From ce56f84aa7c86e1b35cf0ca4218a1f23702a206e Mon Sep 17 00:00:00 2001 From: Yafet Beyene Date: Thu, 21 Aug 2025 10:12:03 -0700 Subject: [PATCH 01/26] pre-commit test --- bolt/test/AArch64/inline-memcpy.s | 193 ++++++++++++++++++++++++++++++ 1 file changed, 193 insertions(+) create mode 100644 bolt/test/AArch64/inline-memcpy.s diff --git a/bolt/test/AArch64/inline-memcpy.s b/bolt/test/AArch64/inline-memcpy.s new file mode 100644 index 0000000000000..3bb498e600fb6 --- /dev/null +++ b/bolt/test/AArch64/inline-memcpy.s @@ -0,0 +1,193 @@ +## This test checks that BOLT correctly inlines memcpy calls on AArch64. + +# REQUIRES: system-linux + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o +# RUN: %clang --target=aarch64-unknown-linux-gnu %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE +# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM + +# Verify BOLT reports that it inlined memcpy calls (all 8 calls processed) +# CHECK-INLINE: BOLT-INFO: inlined 8 memcpy() calls + +# Each function should use optimal size-specific instructions and NO memcpy calls + +# 1-byte copy should use single byte load/store (ldrb/strb) +# CHECK-ASM-LABEL: : +# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1] +# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0] +# CHECK-ASM-NOT: bl{{.*}}: +# CHECK-ASM: ldrh{{.*}}w{{[0-9]+}}, [x1] +# CHECK-ASM: strh{{.*}}w{{[0-9]+}}, [x0] +# CHECK-ASM-NOT: bl{{.*}}: +# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1] +# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0] +# CHECK-ASM-NOT: bl{{.*}}: +# CHECK-ASM: ldr{{.*}}x{{[0-9]+}}, [x1] +# CHECK-ASM: str{{.*}}x{{[0-9]+}}, [x0] +# CHECK-ASM-NOT: bl{{.*}}: +# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1] +# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0] +# CHECK-ASM-NOT: bl{{.*}}: +# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1] +# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0] +# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10] +# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10] +# 
CHECK-ASM-NOT: bl{{.*}}: +# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1] +# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0] +# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10] +# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10] +# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1, #0x20] +# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0, #0x20] +# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1, #0x24] +# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0, #0x24] +# CHECK-ASM-NOT: bl{{.*}}: +# CHECK-ASM-NOT: bl{{.*}} Date: Thu, 21 Aug 2025 10:17:40 -0700 Subject: [PATCH 02/26] [BOLT] documentation --- bolt/docs/CommandLineArgumentReference.md | 2 +- bolt/lib/Rewrite/BinaryPassManager.cpp | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md index f3881c9a640a9..3fc0594514f6e 100644 --- a/bolt/docs/CommandLineArgumentReference.md +++ b/bolt/docs/CommandLineArgumentReference.md @@ -631,7 +631,7 @@ - `--inline-memcpy` - Inline memcpy using 'rep movsb' instruction (X86-only) + Inline memcpy using optimized instruction sequences (X86: 'rep movsb', AArch64: width-optimized register operations) - `--inline-small-functions` diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp index 996d2e972599d..6b554598cf1bc 100644 --- a/bolt/lib/Rewrite/BinaryPassManager.cpp +++ b/bolt/lib/Rewrite/BinaryPassManager.cpp @@ -247,7 +247,9 @@ static cl::opt Stoke("stoke", cl::desc("turn on the stoke analysis"), static cl::opt StringOps( "inline-memcpy", - cl::desc("inline memcpy using 'rep movsb' instruction (X86-only)"), + cl::desc( + "inline memcpy using size-specific optimized instructions " + "(X86: 'rep movsb', AArch64: width-optimized register operations)"), cl::cat(BoltOptCategory)); static cl::opt StripRepRet( From db353b759b298aed2e0ebf86f99d6049a5a62e12 Mon Sep 17 00:00:00 2001 From: Yafet Beyene Date: Thu, 21 Aug 2025 11:25:05 -0700 Subject: [PATCH 03/26] [BOLT][AArch64] Implement safe size-aware memcpy 
inlining --- bolt/include/bolt/Core/MCPlusBuilder.h | 16 ++ bolt/lib/Passes/BinaryPasses.cpp | 28 ++- .../Target/AArch64/AArch64MCPlusBuilder.cpp | 204 ++++++++++++++++++ 3 files changed, 246 insertions(+), 2 deletions(-) diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index e773250ce8734..6cbf288f3b8f4 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -1895,6 +1895,22 @@ class MCPlusBuilder { return {}; } + /// Creates size-aware inline memcpy instruction. If \p KnownSize is provided, + /// generates optimized code for that specific size. Falls back to regular + /// createInlineMemcpy if size is unknown or not needed (e.g. with X86). + virtual InstructionListType + createInlineMemcpy(bool ReturnEnd, std::optional<uint64_t> KnownSize) const { + return createInlineMemcpy(ReturnEnd); + } + + /// Extract immediate value from move instruction that sets the given + /// register. Returns the immediate value if the instruction is a + /// move-immediate to TargetReg. + virtual std::optional<uint64_t> + extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const { + return std::nullopt; + } + /// Create a target-specific relocation out of the \p Fixup. /// Note that not every fixup could be converted into a relocation. 
virtual std::optional<Relocation> diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index d7f02b9470030..0068c1ad0bf1c 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -1843,7 +1843,7 @@ Error StripRepRet::runOnFunctions(BinaryContext &BC) { } Error InlineMemcpy::runOnFunctions(BinaryContext &BC) { - if (!BC.isX86()) + if (!BC.isX86() && !BC.isAArch64()) return Error::success(); uint64_t NumInlined = 0; @@ -1866,8 +1866,32 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) { const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8"); const bool IsTailCall = BC.MIB->isTailCall(Inst); + // Extract the size of thecopy from preceding instructions by looking + // for writes to the size register + std::optional<uint64_t> KnownSize = std::nullopt; + BitVector WrittenRegs(BC.MRI->getNumRegs()); + + // Get the size register (3rd arg register, index 2 for AArch64) + MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2); + + // Look backwards through the basic block for size-setting instr + for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) { + MCInst &Inst = *InstIt; + WrittenRegs.reset(); // Clear and check what the instruction writes to + BC.MIB->getWrittenRegs(Inst, WrittenRegs); + + // Check for writes to the size register + if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg]) { + if (std::optional<uint64_t> ExtractedSize = + BC.MIB->extractMoveImmediate(Inst, SizeReg)) { + KnownSize = *ExtractedSize; + break; + } + } + } + const InstructionListType NewCode = - BC.MIB->createInlineMemcpy(IsMemcpy8); + BC.MIB->createInlineMemcpy(IsMemcpy8, KnownSize); II = BB.replaceInstruction(II, NewCode); std::advance(II, NewCode.size() - 1); if (IsTailCall) { diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 973261765f951..03f62117ea096 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ 
-2597,6 +2597,210 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { getInstructionSize(const MCInst &Inst) const override { return 4; } + + InstructionListType createInlineMemcpy(bool ReturnEnd) const override { + // Fallback + return createInlineMemcpy(ReturnEnd, std::nullopt); + } + + std::optional<uint64_t> + extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const override { + if (Inst.getOpcode() == AArch64::MOVZXi && Inst.getNumOperands() >= 3) { + if (Inst.getOperand(0).isReg() && + Inst.getOperand(0).getReg() == TargetReg && + Inst.getOperand(1).isImm() && Inst.getOperand(2).isImm() && + Inst.getOperand(2).getImm() == 0) { + return Inst.getOperand(1).getImm(); + } + } + return std::nullopt; + } + + InstructionListType + createInlineMemcpy(bool ReturnEnd, + std::optional<uint64_t> KnownSize) const override { + InstructionListType Code; + if (ReturnEnd) { + if (KnownSize.has_value() && (*KnownSize >> 12) == 0) { + // Use immediate if size is known and fits in 12-bit immediate (0-4095) + Code.emplace_back(MCInstBuilder(AArch64::ADDXri) + .addReg(AArch64::X0) + .addReg(AArch64::X0) + .addImm(*KnownSize) + .addImm(0)); + } else { + // Fall back to register add for unknown or large sizes + Code.emplace_back(MCInstBuilder(AArch64::ADDXrr) + .addReg(AArch64::X0) + .addReg(AArch64::X0) + .addReg(AArch64::X2)); + } + } + + if (!KnownSize.has_value()) { + return Code; + } + + uint64_t Size = *KnownSize; + return generateSizeSpecificMemcpy(Code, Size); + } + + InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code, + uint64_t Size) const { + // Generate optimal instruction sequences based on exact size + switch (Size) { + case 1: + // Single byte copy + Code.emplace_back(MCInstBuilder(AArch64::LDRBBui) + .addReg(AArch64::W3) + .addReg(AArch64::X1) + .addImm(0)); + Code.emplace_back(MCInstBuilder(AArch64::STRBBui) + .addReg(AArch64::W3) + .addReg(AArch64::X0) + .addImm(0)); + break; + + case 2: + // 2-byte copy using 16-bit load/store + 
Code.emplace_back(MCInstBuilder(AArch64::LDRHHui) + .addReg(AArch64::W3) + .addReg(AArch64::X1) + .addImm(0)); + Code.emplace_back(MCInstBuilder(AArch64::STRHHui) + .addReg(AArch64::W3) + .addReg(AArch64::X0) + .addImm(0)); + break; + + case 4: + // 4-byte copy using 32-bit load/store + Code.emplace_back(MCInstBuilder(AArch64::LDRWui) + .addReg(AArch64::W3) + .addReg(AArch64::X1) + .addImm(0)); + Code.emplace_back(MCInstBuilder(AArch64::STRWui) + .addReg(AArch64::W3) + .addReg(AArch64::X0) + .addImm(0)); + break; + + case 8: + // 8-byte copy using 64-bit load/store + Code.emplace_back(MCInstBuilder(AArch64::LDRXui) + .addReg(AArch64::X3) + .addReg(AArch64::X1) + .addImm(0)); + Code.emplace_back(MCInstBuilder(AArch64::STRXui) + .addReg(AArch64::X3) + .addReg(AArch64::X0) + .addImm(0)); + break; + + case 16: + // 16-byte copy using 128-bit SIMD + Code.emplace_back(MCInstBuilder(AArch64::LDRQui) + .addReg(AArch64::Q0) + .addReg(AArch64::X1) + .addImm(0)); + Code.emplace_back(MCInstBuilder(AArch64::STRQui) + .addReg(AArch64::Q0) + .addReg(AArch64::X0) + .addImm(0)); + break; + + case 32: + // 32-byte copy using two 128-bit SIMD operations + Code.emplace_back(MCInstBuilder(AArch64::LDRQui) + .addReg(AArch64::Q0) + .addReg(AArch64::X1) + .addImm(0)); + Code.emplace_back(MCInstBuilder(AArch64::STRQui) + .addReg(AArch64::Q0) + .addReg(AArch64::X0) + .addImm(0)); + Code.emplace_back(MCInstBuilder(AArch64::LDRQui) + .addReg(AArch64::Q1) + .addReg(AArch64::X1) + .addImm(1)); + Code.emplace_back(MCInstBuilder(AArch64::STRQui) + .addReg(AArch64::Q1) + .addReg(AArch64::X0) + .addImm(1)); + break; + + default: + if (Size <= 64) { + // For sizes up to 64 bytes, greedily use the largest possible loads in + // descending order + uint64_t Remaining = Size; + uint64_t Offset = 0; + + while (Remaining >= 16) { + Code.emplace_back(MCInstBuilder(AArch64::LDRQui) + .addReg(AArch64::Q0) + .addReg(AArch64::X1) + .addImm(Offset / 16)); + Code.emplace_back(MCInstBuilder(AArch64::STRQui) + 
.addReg(AArch64::Q0) + .addReg(AArch64::X0) + .addImm(Offset / 16)); + Remaining -= 16; + Offset += 16; + } + if (Remaining >= 8) { + Code.emplace_back(MCInstBuilder(AArch64::LDRXui) + .addReg(AArch64::X3) + .addReg(AArch64::X1) + .addImm(Offset / 8)); + Code.emplace_back(MCInstBuilder(AArch64::STRXui) + .addReg(AArch64::X3) + .addReg(AArch64::X0) + .addImm(Offset / 8)); + Remaining -= 8; + Offset += 8; + } + if (Remaining >= 4) { + Code.emplace_back(MCInstBuilder(AArch64::LDRWui) + .addReg(AArch64::W3) + .addReg(AArch64::X1) + .addImm(Offset / 4)); + Code.emplace_back(MCInstBuilder(AArch64::STRWui) + .addReg(AArch64::W3) + .addReg(AArch64::X0) + .addImm(Offset / 4)); + Remaining -= 4; + Offset += 4; + } + if (Remaining >= 2) { + Code.emplace_back(MCInstBuilder(AArch64::LDRHHui) + .addReg(AArch64::W3) + .addReg(AArch64::X1) + .addImm(Offset / 2)); + Code.emplace_back(MCInstBuilder(AArch64::STRHHui) + .addReg(AArch64::W3) + .addReg(AArch64::X0) + .addImm(Offset / 2)); + Remaining -= 2; + Offset += 2; + } + if (Remaining == 1) { + Code.emplace_back(MCInstBuilder(AArch64::LDRBBui) + .addReg(AArch64::W3) + .addReg(AArch64::X1) + .addImm(Offset)); + Code.emplace_back(MCInstBuilder(AArch64::STRBBui) + .addReg(AArch64::W3) + .addReg(AArch64::X0) + .addImm(Offset)); + } + } else { + Code.clear(); + } + break; + } + return Code; + } }; } // end anonymous namespace From 2e5b22b501a83796ff10ae30520e07cb44b21332 Mon Sep 17 00:00:00 2001 From: Yafet Beyene Date: Fri, 22 Aug 2025 05:14:11 -0700 Subject: [PATCH 04/26] test target fix for CI cross-compilation issue --- bolt/test/AArch64/inline-memcpy.s | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bolt/test/AArch64/inline-memcpy.s b/bolt/test/AArch64/inline-memcpy.s index 3bb498e600fb6..e46308286e07b 100644 --- a/bolt/test/AArch64/inline-memcpy.s +++ b/bolt/test/AArch64/inline-memcpy.s @@ -1,6 +1,6 @@ ## This test checks that BOLT correctly inlines memcpy calls on AArch64. 
-# REQUIRES: system-linux +# REQUIRES: system-linux, aarch64-registered-target # RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o # RUN: %clang --target=aarch64-unknown-linux-gnu %t.o -o %t.exe -Wl,-q From 385fa23691e05fbdb6ffb24cc6a9526ff8d08020 Mon Sep 17 00:00:00 2001 From: Yafet Beyene Date: Fri, 22 Aug 2025 05:49:37 -0700 Subject: [PATCH 05/26] moved inline-memcpy to avoid CI cross-compilation PIE conflicts --- bolt/test/runtime/AArch64/inline-memcpy.s | 193 ++++++++++++++++++++++ 1 file changed, 193 insertions(+) create mode 100644 bolt/test/runtime/AArch64/inline-memcpy.s diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s new file mode 100644 index 0000000000000..0e16b6a7e963f --- /dev/null +++ b/bolt/test/runtime/AArch64/inline-memcpy.s @@ -0,0 +1,193 @@ +## This test checks that BOLT correctly inlines memcpy calls on AArch64. + +# REQUIRES: system-linux, aarch64-registered-target + +# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o +# RUN: %clang %cflags -no-pie %t.o -o %t.exe -Wl,-q +# RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE +# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM + +# Verify BOLT reports that it inlined memcpy calls (all 8 calls processed) +# CHECK-INLINE: BOLT-INFO: inlined 8 memcpy() calls + +# Each function should use optimal size-specific instructions and NO memcpy calls + +# 1-byte copy should use single byte load/store (ldrb/strb) +# CHECK-ASM-LABEL: : +# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1] +# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0] +# CHECK-ASM-NOT: bl{{.*}}: +# CHECK-ASM: ldrh{{.*}}w{{[0-9]+}}, [x1] +# CHECK-ASM: strh{{.*}}w{{[0-9]+}}, [x0] +# CHECK-ASM-NOT: bl{{.*}}: +# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1] +# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0] +# CHECK-ASM-NOT: bl{{.*}}: +# CHECK-ASM: ldr{{.*}}x{{[0-9]+}}, [x1] +# CHECK-ASM: str{{.*}}x{{[0-9]+}}, [x0] +# 
CHECK-ASM-NOT: bl{{.*}}: +# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1] +# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0] +# CHECK-ASM-NOT: bl{{.*}}: +# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1] +# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0] +# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10] +# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10] +# CHECK-ASM-NOT: bl{{.*}}: +# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1] +# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0] +# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10] +# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10] +# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1, #0x20] +# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0, #0x20] +# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1, #0x24] +# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0, #0x24] +# CHECK-ASM-NOT: bl{{.*}}: +# CHECK-ASM-NOT: bl{{.*}} Date: Fri, 22 Aug 2025 05:56:47 -0700 Subject: [PATCH 06/26] removed old test --- bolt/test/AArch64/inline-memcpy.s | 193 ------------------------------ 1 file changed, 193 deletions(-) delete mode 100644 bolt/test/AArch64/inline-memcpy.s diff --git a/bolt/test/AArch64/inline-memcpy.s b/bolt/test/AArch64/inline-memcpy.s deleted file mode 100644 index e46308286e07b..0000000000000 --- a/bolt/test/AArch64/inline-memcpy.s +++ /dev/null @@ -1,193 +0,0 @@ -## This test checks that BOLT correctly inlines memcpy calls on AArch64. 
- -# REQUIRES: system-linux, aarch64-registered-target - -# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o -# RUN: %clang --target=aarch64-unknown-linux-gnu %t.o -o %t.exe -Wl,-q -# RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE -# RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM - -# Verify BOLT reports that it inlined memcpy calls (all 8 calls processed) -# CHECK-INLINE: BOLT-INFO: inlined 8 memcpy() calls - -# Each function should use optimal size-specific instructions and NO memcpy calls - -# 1-byte copy should use single byte load/store (ldrb/strb) -# CHECK-ASM-LABEL: : -# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1] -# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0] -# CHECK-ASM-NOT: bl{{.*}}: -# CHECK-ASM: ldrh{{.*}}w{{[0-9]+}}, [x1] -# CHECK-ASM: strh{{.*}}w{{[0-9]+}}, [x0] -# CHECK-ASM-NOT: bl{{.*}}: -# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1] -# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0] -# CHECK-ASM-NOT: bl{{.*}}: -# CHECK-ASM: ldr{{.*}}x{{[0-9]+}}, [x1] -# CHECK-ASM: str{{.*}}x{{[0-9]+}}, [x0] -# CHECK-ASM-NOT: bl{{.*}}: -# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1] -# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0] -# CHECK-ASM-NOT: bl{{.*}}: -# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1] -# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0] -# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10] -# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10] -# CHECK-ASM-NOT: bl{{.*}}: -# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1] -# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0] -# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10] -# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10] -# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1, #0x20] -# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0, #0x20] -# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1, #0x24] -# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0, #0x24] -# CHECK-ASM-NOT: bl{{.*}}: -# CHECK-ASM-NOT: bl{{.*}} Date: Fri, 22 Aug 2025 08:51:18 -0700 Subject: [PATCH 07/26] response to review --- bolt/lib/Passes/BinaryPasses.cpp | 37 +++-- 
.../Target/AArch64/AArch64MCPlusBuilder.cpp | 140 ++++-------------- 2 files changed, 49 insertions(+), 128 deletions(-) diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index 0068c1ad0bf1c..e532c2aa0422d 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -1866,26 +1866,25 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) { const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8"); const bool IsTailCall = BC.MIB->isTailCall(Inst); - // Extract the size of thecopy from preceding instructions by looking - // for writes to the size register + // Extract size from preceding instructions (AArch64 only) + // Pattern: MOV X2, #nb-bytes; BL memcpy src, dest, X2 std::optional<uint64_t> KnownSize = std::nullopt; - BitVector WrittenRegs(BC.MRI->getNumRegs()); - - // Get the size register (3rd arg register, index 2 for AArch64) - MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2); - - // Look backwards through the basic block for size-setting instr - for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) { - MCInst &Inst = *InstIt; - WrittenRegs.reset(); // Clear and check what the instruction writes to - BC.MIB->getWrittenRegs(Inst, WrittenRegs); - - // Check for writes to the size register - if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg]) { - if (std::optional<uint64_t> ExtractedSize = - BC.MIB->extractMoveImmediate(Inst, SizeReg)) { - KnownSize = *ExtractedSize; - break; + if (BC.isAArch64()) { + BitVector WrittenRegs(BC.MRI->getNumRegs()); + MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2); + + // Look backwards for size-setting instruction + for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) { + MCInst &Inst = *InstIt; + WrittenRegs.reset(); + BC.MIB->getWrittenRegs(Inst, WrittenRegs); + + if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg]) { + if (std::optional<uint64_t> ExtractedSize = + BC.MIB->extractMoveImmediate(Inst, SizeReg)) { + KnownSize = *ExtractedSize; + break; + } } } } diff 
--git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 03f62117ea096..e640044ec762d 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -2647,152 +2647,74 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code, uint64_t Size) const { + // Helper to add load/store pair + auto addLoadStorePair = [&](unsigned LoadOpc, unsigned StoreOpc, + unsigned Reg, unsigned Offset = 0) { + Code.emplace_back(MCInstBuilder(LoadOpc) + .addReg(Reg) + .addReg(AArch64::X1) + .addImm(Offset)); + Code.emplace_back(MCInstBuilder(StoreOpc) + .addReg(Reg) + .addReg(AArch64::X0) + .addImm(Offset)); + }; + // Generate optimal instruction sequences based on exact size switch (Size) { case 1: - // Single byte copy - Code.emplace_back(MCInstBuilder(AArch64::LDRBBui) - .addReg(AArch64::W3) - .addReg(AArch64::X1) - .addImm(0)); - Code.emplace_back(MCInstBuilder(AArch64::STRBBui) - .addReg(AArch64::W3) - .addReg(AArch64::X0) - .addImm(0)); + addLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3); break; - case 2: - // 2-byte copy using 16-bit load/store - Code.emplace_back(MCInstBuilder(AArch64::LDRHHui) - .addReg(AArch64::W3) - .addReg(AArch64::X1) - .addImm(0)); - Code.emplace_back(MCInstBuilder(AArch64::STRHHui) - .addReg(AArch64::W3) - .addReg(AArch64::X0) - .addImm(0)); + addLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3); break; - case 4: - // 4-byte copy using 32-bit load/store - Code.emplace_back(MCInstBuilder(AArch64::LDRWui) - .addReg(AArch64::W3) - .addReg(AArch64::X1) - .addImm(0)); - Code.emplace_back(MCInstBuilder(AArch64::STRWui) - .addReg(AArch64::W3) - .addReg(AArch64::X0) - .addImm(0)); + addLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W3); break; - case 8: - // 8-byte copy using 64-bit load/store - Code.emplace_back(MCInstBuilder(AArch64::LDRXui) 
- .addReg(AArch64::X3) - .addReg(AArch64::X1) - .addImm(0)); - Code.emplace_back(MCInstBuilder(AArch64::STRXui) - .addReg(AArch64::X3) - .addReg(AArch64::X0) - .addImm(0)); + addLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X3); break; - case 16: - // 16-byte copy using 128-bit SIMD - Code.emplace_back(MCInstBuilder(AArch64::LDRQui) - .addReg(AArch64::Q0) - .addReg(AArch64::X1) - .addImm(0)); - Code.emplace_back(MCInstBuilder(AArch64::STRQui) - .addReg(AArch64::Q0) - .addReg(AArch64::X0) - .addImm(0)); + addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0); break; - case 32: - // 32-byte copy using two 128-bit SIMD operations - Code.emplace_back(MCInstBuilder(AArch64::LDRQui) - .addReg(AArch64::Q0) - .addReg(AArch64::X1) - .addImm(0)); - Code.emplace_back(MCInstBuilder(AArch64::STRQui) - .addReg(AArch64::Q0) - .addReg(AArch64::X0) - .addImm(0)); - Code.emplace_back(MCInstBuilder(AArch64::LDRQui) - .addReg(AArch64::Q1) - .addReg(AArch64::X1) - .addImm(1)); - Code.emplace_back(MCInstBuilder(AArch64::STRQui) - .addReg(AArch64::Q1) - .addReg(AArch64::X0) - .addImm(1)); + addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0, 0); + addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q1, 1); break; default: if (Size <= 64) { - // For sizes up to 64 bytes, greedily use the largest possible loads in - // descending order + // For sizes up to 64 bytes, greedily use the largest possible loads uint64_t Remaining = Size; uint64_t Offset = 0; while (Remaining >= 16) { - Code.emplace_back(MCInstBuilder(AArch64::LDRQui) - .addReg(AArch64::Q0) - .addReg(AArch64::X1) - .addImm(Offset / 16)); - Code.emplace_back(MCInstBuilder(AArch64::STRQui) - .addReg(AArch64::Q0) - .addReg(AArch64::X0) - .addImm(Offset / 16)); + addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0, + Offset / 16); Remaining -= 16; Offset += 16; } if (Remaining >= 8) { - Code.emplace_back(MCInstBuilder(AArch64::LDRXui) - .addReg(AArch64::X3) - .addReg(AArch64::X1) - 
.addImm(Offset / 8)); - Code.emplace_back(MCInstBuilder(AArch64::STRXui) - .addReg(AArch64::X3) - .addReg(AArch64::X0) - .addImm(Offset / 8)); + addLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X3, + Offset / 8); Remaining -= 8; Offset += 8; } if (Remaining >= 4) { - Code.emplace_back(MCInstBuilder(AArch64::LDRWui) - .addReg(AArch64::W3) - .addReg(AArch64::X1) - .addImm(Offset / 4)); - Code.emplace_back(MCInstBuilder(AArch64::STRWui) - .addReg(AArch64::W3) - .addReg(AArch64::X0) - .addImm(Offset / 4)); + addLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W3, + Offset / 4); Remaining -= 4; Offset += 4; } if (Remaining >= 2) { - Code.emplace_back(MCInstBuilder(AArch64::LDRHHui) - .addReg(AArch64::W3) - .addReg(AArch64::X1) - .addImm(Offset / 2)); - Code.emplace_back(MCInstBuilder(AArch64::STRHHui) - .addReg(AArch64::W3) - .addReg(AArch64::X0) - .addImm(Offset / 2)); + addLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3, + Offset / 2); Remaining -= 2; Offset += 2; } if (Remaining == 1) { - Code.emplace_back(MCInstBuilder(AArch64::LDRBBui) - .addReg(AArch64::W3) - .addReg(AArch64::X1) - .addImm(Offset)); - Code.emplace_back(MCInstBuilder(AArch64::STRBBui) - .addReg(AArch64::W3) - .addReg(AArch64::X0) - .addImm(Offset)); + addLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3, + Offset); } } else { Code.clear(); From cf8279a8b5081eec657a1f835c54470653186787 Mon Sep 17 00:00:00 2001 From: Yafet Beyene Date: Wed, 27 Aug 2025 03:57:43 -0700 Subject: [PATCH 08/26] Update conditional formatting and move check for size into binaryPasses --- bolt/lib/Passes/BinaryPasses.cpp | 5 +++++ bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 13 ++++--------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index e532c2aa0422d..1aade44286052 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -1889,6 +1889,11 @@ Error 
InlineMemcpy::runOnFunctions(BinaryContext &BC) { } } + if (BC.isAArch64() && !KnownSize.has_value()) { + ++II; + continue; + } + const InstructionListType NewCode = BC.MIB->createInlineMemcpy(IsMemcpy8, KnownSize); II = BB.replaceInstruction(II, NewCode); diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index e640044ec762d..9d30fdface0c5 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -2621,24 +2621,19 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { std::optional KnownSize) const override { InstructionListType Code; if (ReturnEnd) { - if (KnownSize.has_value() && (*KnownSize >> 12) == 0) { - // Use immediate if size is known and fits in 12-bit immediate (0-4095) + // Use immediate if size fits in 12-bit immediate (0-4095) + // Otherwise, fall back to register add for large sizes + if ((*KnownSize >> 12) == 0) Code.emplace_back(MCInstBuilder(AArch64::ADDXri) .addReg(AArch64::X0) .addReg(AArch64::X0) .addImm(*KnownSize) .addImm(0)); - } else { - // Fall back to register add for unknown or large sizes + else Code.emplace_back(MCInstBuilder(AArch64::ADDXrr) .addReg(AArch64::X0) .addReg(AArch64::X0) .addReg(AArch64::X2)); - } - } - - if (!KnownSize.has_value()) { - return Code; } uint64_t Size = *KnownSize; From c317eb0cbd62ac6f164cf44b75d40e082167ce3d Mon Sep 17 00:00:00 2001 From: Yafet Beyene Date: Wed, 27 Aug 2025 04:55:17 -0700 Subject: [PATCH 09/26] Negative Tests (live-in, register move, non-mov instruction) --- bolt/test/runtime/AArch64/inline-memcpy.s | 61 ++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s index 0e16b6a7e963f..417b444f6a4bb 100644 --- a/bolt/test/runtime/AArch64/inline-memcpy.s +++ b/bolt/test/runtime/AArch64/inline-memcpy.s @@ -7,7 +7,7 @@ # RUN: llvm-bolt %t.exe --inline-memcpy -o 
%t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE # RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM -# Verify BOLT reports that it inlined memcpy calls (all 8 calls processed) +# Verify BOLT reports that it inlined memcpy calls (8 successful inlines out of 11 total calls) # CHECK-INLINE: BOLT-INFO: inlined 8 memcpy() calls # Each function should use optimal size-specific instructions and NO memcpy calls @@ -67,6 +67,18 @@ # CHECK-ASM-NOT: bl{{.*}}: +# CHECK-ASM: bl{{.*}}: +# CHECK-ASM: bl{{.*}}: +# CHECK-ASM: bl{{.*}} Date: Wed, 27 Aug 2025 06:51:08 -0700 Subject: [PATCH 10/26] memcpy8 redundant handling removed --- .../Target/AArch64/AArch64MCPlusBuilder.cpp | 29 ++++++--------- bolt/test/runtime/AArch64/inline-memcpy.s | 37 ++++++++++++++++++- 2 files changed, 47 insertions(+), 19 deletions(-) diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 9d30fdface0c5..366d4183bca51 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -2620,24 +2620,19 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { createInlineMemcpy(bool ReturnEnd, std::optional KnownSize) const override { InstructionListType Code; - if (ReturnEnd) { - // Use immediate if size fits in 12-bit immediate (0-4095) - // Otherwise, fall back to register add for large sizes - if ((*KnownSize >> 12) == 0) - Code.emplace_back(MCInstBuilder(AArch64::ADDXri) - .addReg(AArch64::X0) - .addReg(AArch64::X0) - .addImm(*KnownSize) - .addImm(0)); - else - Code.emplace_back(MCInstBuilder(AArch64::ADDXrr) - .addReg(AArch64::X0) - .addReg(AArch64::X0) - .addReg(AArch64::X2)); - } - uint64_t Size = *KnownSize; - return generateSizeSpecificMemcpy(Code, Size); + + // Generate the optimized memcpy sequence + generateSizeSpecificMemcpy(Code, Size); + + // If _memcpy8, adjust X0 to return dest+size instead of dest + if (ReturnEnd) + 
Code.emplace_back(MCInstBuilder(AArch64::ADDXri) + .addReg(AArch64::X0) + .addReg(AArch64::X0) + .addImm(Size) + .addImm(0)); + return Code; } InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code, diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s index 417b444f6a4bb..961e21f82851d 100644 --- a/bolt/test/runtime/AArch64/inline-memcpy.s +++ b/bolt/test/runtime/AArch64/inline-memcpy.s @@ -7,8 +7,8 @@ # RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE # RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM -# Verify BOLT reports that it inlined memcpy calls (8 successful inlines out of 11 total calls) -# CHECK-INLINE: BOLT-INFO: inlined 8 memcpy() calls +# Verify BOLT reports that it inlined memcpy calls (9 successful inlines out of 12 total calls) +# CHECK-INLINE: BOLT-INFO: inlined 9 memcpy() calls # Each function should use optimal size-specific instructions and NO memcpy calls @@ -79,6 +79,13 @@ # CHECK-ASM-LABEL: : # CHECK-ASM: bl{{.*}}: +# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1] +# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0] +# CHECK-ASM: add{{.*}}x0, x0, #0x4 +# CHECK-ASM-NOT: bl{{.*}}<_memcpy8 + .text .globl test_1_byte_direct .type test_1_byte_direct,@function @@ -226,7 +233,31 @@ test_live_in_negative: ret .size test_live_in_negative, .-test_live_in_negative + .globl test_memcpy8_4_byte + .type test_memcpy8_4_byte,@function +test_memcpy8_4_byte: + stp x29, x30, [sp, #-32]! + mov x29, sp + add x1, sp, #16 + add x0, sp, #8 + mov x2, #4 + bl _memcpy8 + ldp x29, x30, [sp], #32 + ret + .size test_memcpy8_4_byte, .-test_memcpy8_4_byte + # Simple _memcpy8 implementation that calls memcpy and returns dest+size + .globl _memcpy8 + .type _memcpy8,@function +_memcpy8: + stp x29, x30, [sp, #-16]! 
+ mov x29, sp + mov x3, x0 + bl memcpy + add x0, x3, x2 + ldp x29, x30, [sp], #16 + ret + .size _memcpy8, .-_memcpy8 .globl main .type main,@function @@ -245,6 +276,8 @@ main: bl test_4_byte_add_immediate bl test_register_move_negative bl test_live_in_negative + bl test_memcpy8_4_byte + bl test_memcpy8_large_size mov w0, #0 ldp x29, x30, [sp], #16 From 25cfb58b165fd1190f9b1b52cce1423d2db5d3c1 Mon Sep 17 00:00:00 2001 From: Yafet Beyene Date: Wed, 27 Aug 2025 06:54:14 -0700 Subject: [PATCH 11/26] nit: comment clean up --- bolt/lib/Passes/BinaryPasses.cpp | 6 +++--- bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 9 ++++----- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index 1aade44286052..e8124dd3cb4f4 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -1866,14 +1866,14 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) { const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8"); const bool IsTailCall = BC.MIB->isTailCall(Inst); - // Extract size from preceding instructions (AArch64 only) - // Pattern: MOV X2, #nb-bytes; BL memcpy src, dest, X2 + // Extract size from preceding instructions (AArch64 only). + // Pattern: MOV X2, #nb-bytes; BL memcpy src, dest, X2. std::optional KnownSize = std::nullopt; if (BC.isAArch64()) { BitVector WrittenRegs(BC.MRI->getNumRegs()); MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2); - // Look backwards for size-setting instruction + // Look backwards for size-setting instruction. 
for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) { MCInst &Inst = *InstIt; WrittenRegs.reset(); diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 366d4183bca51..67febc2324e14 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -2622,10 +2622,10 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { InstructionListType Code; uint64_t Size = *KnownSize; - // Generate the optimized memcpy sequence + // Generate the optimized memcpy sequence. generateSizeSpecificMemcpy(Code, Size); - // If _memcpy8, adjust X0 to return dest+size instead of dest + // If _memcpy8, adjust X0 to return dest+size instead of dest. if (ReturnEnd) Code.emplace_back(MCInstBuilder(AArch64::ADDXri) .addReg(AArch64::X0) @@ -2637,7 +2637,6 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code, uint64_t Size) const { - // Helper to add load/store pair auto addLoadStorePair = [&](unsigned LoadOpc, unsigned StoreOpc, unsigned Reg, unsigned Offset = 0) { Code.emplace_back(MCInstBuilder(LoadOpc) @@ -2650,7 +2649,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { .addImm(Offset)); }; - // Generate optimal instruction sequences based on exact size + // Generate optimal instruction sequences based on exact size. switch (Size) { case 1: addLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3); @@ -2674,7 +2673,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { default: if (Size <= 64) { - // For sizes up to 64 bytes, greedily use the largest possible loads + // For sizes up to 64 bytes, greedily use the largest possible loads. 
uint64_t Remaining = Size; uint64_t Offset = 0; From e308855758965504cca82484f66065d186c64093 Mon Sep 17 00:00:00 2001 From: Yafet Beyene Date: Thu, 28 Aug 2025 02:12:27 -0700 Subject: [PATCH 12/26] minor refactor --- bolt/lib/Passes/BinaryPasses.cpp | 11 +++++----- .../Target/AArch64/AArch64MCPlusBuilder.cpp | 22 +++++++------------ 2 files changed, 13 insertions(+), 20 deletions(-) diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index e8124dd3cb4f4..022d06ae80e7b 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -1872,6 +1872,7 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) { if (BC.isAArch64()) { BitVector WrittenRegs(BC.MRI->getNumRegs()); MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2); + std::optional ExtractedSize; // Look backwards for size-setting instruction. for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) { @@ -1879,12 +1880,10 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) { WrittenRegs.reset(); BC.MIB->getWrittenRegs(Inst, WrittenRegs); - if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg]) { - if (std::optional ExtractedSize = - BC.MIB->extractMoveImmediate(Inst, SizeReg)) { - KnownSize = *ExtractedSize; - break; - } + if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg] && + (ExtractedSize = BC.MIB->extractMoveImmediate(Inst, SizeReg))) { + KnownSize = *ExtractedSize; + break; } } } diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 67febc2324e14..dfb5fe3cfe30d 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -2599,20 +2599,17 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { } InstructionListType createInlineMemcpy(bool ReturnEnd) const override { - // Fallback return createInlineMemcpy(ReturnEnd, std::nullopt); } std::optional extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) 
const override { - if (Inst.getOpcode() == AArch64::MOVZXi && Inst.getNumOperands() >= 3) { - if (Inst.getOperand(0).isReg() && - Inst.getOperand(0).getReg() == TargetReg && - Inst.getOperand(1).isImm() && Inst.getOperand(2).isImm() && - Inst.getOperand(2).getImm() == 0) { - return Inst.getOperand(1).getImm(); - } - } + if (Inst.getOpcode() == AArch64::MOVZXi && Inst.getNumOperands() >= 3 && + Inst.getOperand(0).isReg() && + Inst.getOperand(0).getReg() == TargetReg && + Inst.getOperand(1).isImm() && Inst.getOperand(2).isImm() && + Inst.getOperand(2).getImm() == 0) + return Inst.getOperand(1).getImm(); return std::nullopt; } @@ -2622,7 +2619,6 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { InstructionListType Code; uint64_t Size = *KnownSize; - // Generate the optimized memcpy sequence. generateSizeSpecificMemcpy(Code, Size); // If _memcpy8, adjust X0 to return dest+size instead of dest. @@ -2701,13 +2697,11 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { Remaining -= 2; Offset += 2; } - if (Remaining == 1) { + if (Remaining == 1) addLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3, Offset); - } - } else { + } else Code.clear(); - } break; } return Code; From 365a0bfaa0d68e9a5c45f9b5163af49ca6d5c1b8 Mon Sep 17 00:00:00 2001 From: Yafet Beyene Date: Thu, 28 Aug 2025 06:33:39 -0700 Subject: [PATCH 13/26] NFC: Post-review refactor --- bolt/include/bolt/Core/MCPlusBuilder.h | 10 +++ bolt/lib/Passes/BinaryPasses.cpp | 21 +---- .../Target/AArch64/AArch64MCPlusBuilder.cpp | 81 ++++++++++--------- 3 files changed, 55 insertions(+), 57 deletions(-) diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index 6cbf288f3b8f4..3192472f5fbe0 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -14,6 +14,7 @@ #ifndef BOLT_CORE_MCPLUSBUILDER_H #define BOLT_CORE_MCPLUSBUILDER_H +#include "bolt/Core/BinaryBasicBlock.h" #include "bolt/Core/MCPlus.h" #include 
"bolt/Core/Relocation.h" #include "llvm/ADT/ArrayRef.h" @@ -1888,6 +1889,15 @@ class MCPlusBuilder { return {}; } + /// Find memcpy size in bytes by using preceding instructions. + /// Returns std::nullopt if size cannot be determined (no-op for most + /// targets). + virtual std::optional + findMemcpySizeInBytes(const BinaryBasicBlock &BB, + BinaryBasicBlock::iterator CallInst) const { + return std::nullopt; + } + /// Creates inline memcpy instruction. If \p ReturnEnd is true, then return /// (dest + n) instead of dest. virtual InstructionListType createInlineMemcpy(bool ReturnEnd) const { diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index 022d06ae80e7b..f1807f6eb997e 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -1868,25 +1868,8 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) { // Extract size from preceding instructions (AArch64 only). // Pattern: MOV X2, #nb-bytes; BL memcpy src, dest, X2. - std::optional KnownSize = std::nullopt; - if (BC.isAArch64()) { - BitVector WrittenRegs(BC.MRI->getNumRegs()); - MCPhysReg SizeReg = BC.MIB->getIntArgRegister(2); - std::optional ExtractedSize; - - // Look backwards for size-setting instruction. 
- for (auto InstIt = BB.begin(); InstIt != II; ++InstIt) { - MCInst &Inst = *InstIt; - WrittenRegs.reset(); - BC.MIB->getWrittenRegs(Inst, WrittenRegs); - - if (SizeReg != BC.MIB->getNoRegister() && WrittenRegs[SizeReg] && - (ExtractedSize = BC.MIB->extractMoveImmediate(Inst, SizeReg))) { - KnownSize = *ExtractedSize; - break; - } - } - } + std::optional KnownSize = + BC.MIB->findMemcpySizeInBytes(BB, II); if (BC.isAArch64() && !KnownSize.has_value()) { ++II; diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index dfb5fe3cfe30d..6f539b8588f2e 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -2604,15 +2604,33 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { std::optional extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const override { - if (Inst.getOpcode() == AArch64::MOVZXi && Inst.getNumOperands() >= 3 && - Inst.getOperand(0).isReg() && + // Match MOVZXi with the target register and no shift. 
+ if (Inst.getOpcode() == AArch64::MOVZXi && Inst.getOperand(0).getReg() == TargetReg && - Inst.getOperand(1).isImm() && Inst.getOperand(2).isImm() && Inst.getOperand(2).getImm() == 0) return Inst.getOperand(1).getImm(); return std::nullopt; } + std::optional + findMemcpySizeInBytes(const BinaryBasicBlock &BB, + BinaryBasicBlock::iterator CallInst) const override { + BitVector WrittenRegs(RegInfo->getNumRegs()); + MCPhysReg SizeReg = getIntArgRegister(2); + std::optional ExtractedSize; + + for (auto InstIt = BB.begin(); InstIt != CallInst; ++InstIt) { + const MCInst &Inst = *InstIt; + WrittenRegs.reset(); + getWrittenRegs(Inst, WrittenRegs); + + if (SizeReg != getNoRegister() && WrittenRegs[SizeReg] && + (ExtractedSize = extractMoveImmediate(Inst, SizeReg))) + return *ExtractedSize; + } + return std::nullopt; + } + InstructionListType createInlineMemcpy(bool ReturnEnd, std::optional KnownSize) const override { @@ -2633,7 +2651,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code, uint64_t Size) const { - auto addLoadStorePair = [&](unsigned LoadOpc, unsigned StoreOpc, + auto AddLoadStorePair = [&](unsigned LoadOpc, unsigned StoreOpc, unsigned Reg, unsigned Offset = 0) { Code.emplace_back(MCInstBuilder(LoadOpc) .addReg(Reg) @@ -2648,23 +2666,23 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { // Generate optimal instruction sequences based on exact size. 
switch (Size) { case 1: - addLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3); + AddLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3); break; case 2: - addLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3); + AddLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3); break; case 4: - addLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W3); + AddLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W3); break; case 8: - addLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X3); + AddLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X3); break; case 16: - addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0); + AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0); break; case 32: - addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0, 0); - addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q1, 1); + AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0, 0); + AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q1, 1); break; default: @@ -2673,33 +2691,20 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { uint64_t Remaining = Size; uint64_t Offset = 0; - while (Remaining >= 16) { - addLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0, - Offset / 16); - Remaining -= 16; - Offset += 16; - } - if (Remaining >= 8) { - addLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X3, - Offset / 8); - Remaining -= 8; - Offset += 8; - } - if (Remaining >= 4) { - addLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W3, - Offset / 4); - Remaining -= 4; - Offset += 4; - } - if (Remaining >= 2) { - addLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3, - Offset / 2); - Remaining -= 2; - Offset += 2; - } - if (Remaining == 1) - addLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3, - Offset); + const std::array, 5> + LoadStoreOps = { + {{16, AArch64::LDRQui, AArch64::STRQui, AArch64::Q0}, + {8, AArch64::LDRXui, 
AArch64::STRXui, AArch64::X3}, + {4, AArch64::LDRWui, AArch64::STRWui, AArch64::W3}, + {2, AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3}, + {1, AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3}}}; + + for (const auto &[OpSize, LoadOp, StoreOp, TempReg] : LoadStoreOps) + while (Remaining >= OpSize) { + AddLoadStorePair(LoadOp, StoreOp, TempReg, Offset / OpSize); + Remaining -= OpSize; + Offset += OpSize; + } } else Code.clear(); break; From 84c904ac68b263b48227b3308ad16c795382b7c3 Mon Sep 17 00:00:00 2001 From: Yafet Beyene Date: Thu, 28 Aug 2025 06:42:47 -0700 Subject: [PATCH 14/26] NFC: Test for corner case with size 0 --- bolt/test/runtime/AArch64/inline-memcpy.s | 25 ++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s index 961e21f82851d..3acb5e394d52d 100644 --- a/bolt/test/runtime/AArch64/inline-memcpy.s +++ b/bolt/test/runtime/AArch64/inline-memcpy.s @@ -7,8 +7,8 @@ # RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE # RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM -# Verify BOLT reports that it inlined memcpy calls (9 successful inlines out of 12 total calls) -# CHECK-INLINE: BOLT-INFO: inlined 9 memcpy() calls +# Verify BOLT reports that it inlined memcpy calls (10 successful inlines out of 13 total calls) +# CHECK-INLINE: BOLT-INFO: inlined 10 memcpy() calls # Each function should use optimal size-specific instructions and NO memcpy calls @@ -62,6 +62,12 @@ # CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0, #0x24] # CHECK-ASM-NOT: bl{{.*}}: +# CHECK-ASM-NOT: ldr +# CHECK-ASM-NOT: str +# CHECK-ASM-NOT: bl{{.*}}: # CHECK-ASM-NOT: bl{{.*}} Date: Thu, 28 Aug 2025 10:01:21 -0700 Subject: [PATCH 15/26] Use temp instead of argument registers --- .../Target/AArch64/AArch64MCPlusBuilder.cpp | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git 
a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 6f539b8588f2e..f17a91bc3ba76 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -2666,23 +2666,23 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { // Generate optimal instruction sequences based on exact size. switch (Size) { case 1: - AddLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3); + AddLoadStorePair(AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9); break; case 2: - AddLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3); + AddLoadStorePair(AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9); break; case 4: - AddLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W3); + AddLoadStorePair(AArch64::LDRWui, AArch64::STRWui, AArch64::W9); break; case 8: - AddLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X3); + AddLoadStorePair(AArch64::LDRXui, AArch64::STRXui, AArch64::X9); break; case 16: - AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0); + AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q16); break; case 32: - AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q0, 0); - AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q1, 1); + AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q16, 0); + AddLoadStorePair(AArch64::LDRQui, AArch64::STRQui, AArch64::Q17, 1); break; default: @@ -2693,11 +2693,11 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { const std::array, 5> LoadStoreOps = { - {{16, AArch64::LDRQui, AArch64::STRQui, AArch64::Q0}, - {8, AArch64::LDRXui, AArch64::STRXui, AArch64::X3}, - {4, AArch64::LDRWui, AArch64::STRWui, AArch64::W3}, - {2, AArch64::LDRHHui, AArch64::STRHHui, AArch64::W3}, - {1, AArch64::LDRBBui, AArch64::STRBBui, AArch64::W3}}}; + {{16, AArch64::LDRQui, AArch64::STRQui, AArch64::Q16}, + {8, AArch64::LDRXui, AArch64::STRXui, AArch64::X9}, + {4, AArch64::LDRWui, 
AArch64::STRWui, AArch64::W9}, + {2, AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9}, + {1, AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9}}}; for (const auto &[OpSize, LoadOp, StoreOp, TempReg] : LoadStoreOps) while (Remaining >= OpSize) { From cc49db79eea544305571e5e91caa3328c91cf4a7 Mon Sep 17 00:00:00 2001 From: Yafet Beyene Date: Thu, 28 Aug 2025 10:01:54 -0700 Subject: [PATCH 16/26] Update early return --- bolt/lib/Passes/BinaryPasses.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index f1807f6eb997e..d40f5fb78c7f3 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -1871,10 +1871,8 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) { std::optional KnownSize = BC.MIB->findMemcpySizeInBytes(BB, II); - if (BC.isAArch64() && !KnownSize.has_value()) { - ++II; + if (BC.isAArch64() && !KnownSize.has_value()) continue; - } const InstructionListType NewCode = BC.MIB->createInlineMemcpy(IsMemcpy8, KnownSize); From 115606be208c8b6675df59b9f231dd709ea863fd Mon Sep 17 00:00:00 2001 From: Yafet Beyene Date: Thu, 28 Aug 2025 10:02:48 -0700 Subject: [PATCH 17/26] Update tests to be more specific about registers + negative test on early return check --- bolt/test/runtime/AArch64/inline-memcpy.s | 70 +++++++++++++++-------- 1 file changed, 45 insertions(+), 25 deletions(-) diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s index 3acb5e394d52d..14a95d91dd189 100644 --- a/bolt/test/runtime/AArch64/inline-memcpy.s +++ b/bolt/test/runtime/AArch64/inline-memcpy.s @@ -7,59 +7,59 @@ # RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE # RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM -# Verify BOLT reports that it inlined memcpy calls (10 successful inlines out of 13 total calls) +# Verify BOLT reports that it inlined memcpy calls (10 
successful inlines out of 14 total calls) # CHECK-INLINE: BOLT-INFO: inlined 10 memcpy() calls # Each function should use optimal size-specific instructions and NO memcpy calls # 1-byte copy should use single byte load/store (ldrb/strb) # CHECK-ASM-LABEL: : -# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1] -# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0] +# CHECK-ASM: ldrb{{.*}}w9, [x1] +# CHECK-ASM: strb{{.*}}w9, [x0] # CHECK-ASM-NOT: bl{{.*}}: -# CHECK-ASM: ldrh{{.*}}w{{[0-9]+}}, [x1] -# CHECK-ASM: strh{{.*}}w{{[0-9]+}}, [x0] +# CHECK-ASM: ldrh{{.*}}w9, [x1] +# CHECK-ASM: strh{{.*}}w9, [x0] # CHECK-ASM-NOT: bl{{.*}}: -# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1] -# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0] +# CHECK-ASM: ldr{{.*}}w9, [x1] +# CHECK-ASM: str{{.*}}w9, [x0] # CHECK-ASM-NOT: bl{{.*}}: -# CHECK-ASM: ldr{{.*}}x{{[0-9]+}}, [x1] -# CHECK-ASM: str{{.*}}x{{[0-9]+}}, [x0] +# CHECK-ASM: ldr{{.*}}x9, [x1] +# CHECK-ASM: str{{.*}}x9, [x0] # CHECK-ASM-NOT: bl{{.*}}: -# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1] -# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0] +# CHECK-ASM: ldr{{.*}}q16, [x1] +# CHECK-ASM: str{{.*}}q16, [x0] # CHECK-ASM-NOT: bl{{.*}}: -# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1] -# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0] -# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10] -# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10] +# CHECK-ASM: ldr{{.*}}q16, [x1] +# CHECK-ASM: str{{.*}}q16, [x0] +# CHECK-ASM: ldr{{.*}}q17, [x1, #0x10] +# CHECK-ASM: str{{.*}}q17, [x0, #0x10] # CHECK-ASM-NOT: bl{{.*}}: -# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1] -# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0] -# CHECK-ASM: ldr{{.*}}q{{[0-9]+}}, [x1, #0x10] -# CHECK-ASM: str{{.*}}q{{[0-9]+}}, [x0, #0x10] -# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1, #0x20] -# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0, #0x20] -# CHECK-ASM: ldrb{{.*}}w{{[0-9]+}}, [x1, #0x24] -# CHECK-ASM: strb{{.*}}w{{[0-9]+}}, [x0, #0x24] +# CHECK-ASM: ldr{{.*}}q16, [x1] +# CHECK-ASM: str{{.*}}q16, [x0] +# CHECK-ASM: ldr{{.*}}q16, [x1, #0x10] +# CHECK-ASM: str{{.*}}q16, [x0, 
#0x10] +# CHECK-ASM: ldr{{.*}}w9, [x1, #0x20] +# CHECK-ASM: str{{.*}}w9, [x0, #0x20] +# CHECK-ASM: ldrb{{.*}}w9, [x1, #0x24] +# CHECK-ASM: strb{{.*}}w9, [x0, #0x24] # CHECK-ASM-NOT: bl{{.*}}: # CHECK-ASM: bl{{.*}}: +# CHECK-ASM: bl{{.*}}: -# CHECK-ASM: ldr{{.*}}w{{[0-9]+}}, [x1] -# CHECK-ASM: str{{.*}}w{{[0-9]+}}, [x0] +# CHECK-ASM: ldr{{.*}}w9, [x1] +# CHECK-ASM: str{{.*}}w9, [x0] # CHECK-ASM: add{{.*}}x0, x0, #0x4 # CHECK-ASM-NOT: bl{{.*}}<_memcpy8 @@ -252,6 +256,21 @@ test_live_in_negative: ret .size test_live_in_negative, .-test_live_in_negative + .globl test_register_size_negative + .type test_register_size_negative,@function +test_register_size_negative: + # This would crash without isAArch64() check: size from register parameter + stp x29, x30, [sp, #-32]! + mov x29, sp + add x1, sp, #16 + add x0, sp, #8 + mov x3, #4 + mov x2, x3 + bl memcpy + ldp x29, x30, [sp], #32 + ret + .size test_register_size_negative, .-test_register_size_negative + .globl test_memcpy8_4_byte .type test_memcpy8_4_byte,@function test_memcpy8_4_byte: @@ -296,6 +315,7 @@ main: bl test_4_byte_add_immediate bl test_register_move_negative bl test_live_in_negative + bl test_register_size_negative bl test_memcpy8_4_byte mov w0, #0 From 1986bfac3fcfdd3b8036096c72d7f1ed03fea1bc Mon Sep 17 00:00:00 2001 From: Yafet Beyene Date: Fri, 29 Aug 2025 08:03:58 -0700 Subject: [PATCH 18/26] Complex test + register aliasing --- .../Target/AArch64/AArch64MCPlusBuilder.cpp | 18 +-- bolt/test/runtime/AArch64/inline-memcpy.s | 107 +++++++++++++++++- 2 files changed, 113 insertions(+), 12 deletions(-) diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index f17a91bc3ba76..12e226a00e26d 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -2604,10 +2604,11 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { std::optional extractMoveImmediate(const MCInst &Inst, MCPhysReg 
TargetReg) const override { - // Match MOVZXi with the target register and no shift. - if (Inst.getOpcode() == AArch64::MOVZXi && - Inst.getOperand(0).getReg() == TargetReg && - Inst.getOperand(2).getImm() == 0) + // Match MOVZ instructions (both X and W register variants) with no shift. + if ((Inst.getOpcode() == AArch64::MOVZXi || + Inst.getOpcode() == AArch64::MOVZWi) && + Inst.getOperand(2).getImm() == 0 && + getAliases(TargetReg)[Inst.getOperand(0).getReg()]) return Inst.getOperand(1).getImm(); return std::nullopt; } @@ -2617,16 +2618,17 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { BinaryBasicBlock::iterator CallInst) const override { BitVector WrittenRegs(RegInfo->getNumRegs()); MCPhysReg SizeReg = getIntArgRegister(2); - std::optional ExtractedSize; + const BitVector &SizeRegAliases = getAliases(SizeReg); for (auto InstIt = BB.begin(); InstIt != CallInst; ++InstIt) { const MCInst &Inst = *InstIt; WrittenRegs.reset(); getWrittenRegs(Inst, WrittenRegs); - if (SizeReg != getNoRegister() && WrittenRegs[SizeReg] && - (ExtractedSize = extractMoveImmediate(Inst, SizeReg))) - return *ExtractedSize; + if (SizeReg != getNoRegister() && WrittenRegs.anyCommon(SizeRegAliases)) { + if (auto ExtractedSize = extractMoveImmediate(Inst, SizeReg)) + return *ExtractedSize; + } } return std::nullopt; } diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s index 14a95d91dd189..eb6851bbe7e0b 100644 --- a/bolt/test/runtime/AArch64/inline-memcpy.s +++ b/bolt/test/runtime/AArch64/inline-memcpy.s @@ -7,8 +7,8 @@ # RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE # RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM -# Verify BOLT reports that it inlined memcpy calls (10 successful inlines out of 14 total calls) -# CHECK-INLINE: BOLT-INFO: inlined 10 memcpy() calls +# Verify BOLT reports that it inlined memcpy calls (12 successful inlines out of 16 total calls) +# 
CHECK-INLINE: BOLT-INFO: inlined 12 memcpy() calls # Each function should use optimal size-specific instructions and NO memcpy calls @@ -96,6 +96,24 @@ # CHECK-ASM: add{{.*}}x0, x0, #0x4 # CHECK-ASM-NOT: bl{{.*}}<_memcpy8 +# Complex function with caller-saved X9 should inline 8-byte memcpy using X9 as temp register +# CHECK-ASM-LABEL: : +# CHECK-ASM: ldr{{.*}}x9, [x1] +# CHECK-ASM: str{{.*}}x9, [x0] +# CHECK-ASM-NOT: bl{{.*}}: +# CHECK-ASM: ldr{{.*}}q16, [x1] +# CHECK-ASM: str{{.*}}q16, [x0] +# CHECK-ASM: ldr{{.*}}q16, [x1, #0x10] +# CHECK-ASM: str{{.*}}q16, [x0, #0x10] +# CHECK-ASM: ldr{{.*}}q16, [x1, #0x20] +# CHECK-ASM: str{{.*}}q16, [x0, #0x20] +# CHECK-ASM: ldr{{.*}}q16, [x1, #0x30] +# CHECK-ASM: str{{.*}}q16, [x0, #0x30] +# CHECK-ASM-NOT: bl{{.*}} Date: Mon, 1 Sep 2025 01:40:32 -0700 Subject: [PATCH 19/26] NFC use if initializer --- bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 12e226a00e26d..707856b5874ea 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -2625,10 +2625,9 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { WrittenRegs.reset(); getWrittenRegs(Inst, WrittenRegs); - if (SizeReg != getNoRegister() && WrittenRegs.anyCommon(SizeRegAliases)) { - if (auto ExtractedSize = extractMoveImmediate(Inst, SizeReg)) - return *ExtractedSize; - } + if (SizeReg != getNoRegister() && WrittenRegs.anyCommon(SizeRegAliases); + auto ExtractedSize = extractMoveImmediate(Inst, SizeReg)) + return *ExtractedSize; } return std::nullopt; } From ee5f859f26eb3272934ff03cef8bcb52ab772e89 Mon Sep 17 00:00:00 2001 From: Yafet Beyene Date: Thu, 4 Sep 2025 09:07:33 -0700 Subject: [PATCH 20/26] [style] trailing whitespaces removed --- bolt/test/runtime/AArch64/inline-memcpy.s | 52 +++++++++++------------ 1 file changed, 26 
insertions(+), 26 deletions(-) diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s index eb6851bbe7e0b..0bcb7514afad3 100644 --- a/bolt/test/runtime/AArch64/inline-memcpy.s +++ b/bolt/test/runtime/AArch64/inline-memcpy.s @@ -3,7 +3,7 @@ # REQUIRES: system-linux, aarch64-registered-target # RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o -# RUN: %clang %cflags -no-pie %t.o -o %t.exe -Wl,-q +# RUN: %clang %cflags -no-pie %t.o -o %t.exe -Wl,-q # RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE # RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM @@ -65,7 +65,7 @@ # 0-byte copy should be inlined with no load/store instructions (nothing to copy) # CHECK-ASM-LABEL: : # CHECK-ASM-NOT: ldr -# CHECK-ASM-NOT: str +# CHECK-ASM-NOT: str # CHECK-ASM-NOT: bl{{.*}} Date: Thu, 4 Sep 2025 09:18:07 -0700 Subject: [PATCH 21/26] [test] CHECK-NEXT used --- bolt/test/runtime/AArch64/inline-memcpy.s | 50 +++++++++++------------ 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s index 0bcb7514afad3..3222935b74fef 100644 --- a/bolt/test/runtime/AArch64/inline-memcpy.s +++ b/bolt/test/runtime/AArch64/inline-memcpy.s @@ -15,51 +15,51 @@ # 1-byte copy should use single byte load/store (ldrb/strb) # CHECK-ASM-LABEL: : # CHECK-ASM: ldrb{{.*}}w9, [x1] -# CHECK-ASM: strb{{.*}}w9, [x0] +# CHECK-ASM-NEXT: strb{{.*}}w9, [x0] # CHECK-ASM-NOT: bl{{.*}}: # CHECK-ASM: ldrh{{.*}}w9, [x1] -# CHECK-ASM: strh{{.*}}w9, [x0] +# CHECK-ASM-NEXT: strh{{.*}}w9, [x0] # CHECK-ASM-NOT: bl{{.*}}: # CHECK-ASM: ldr{{.*}}w9, [x1] -# CHECK-ASM: str{{.*}}w9, [x0] +# CHECK-ASM-NEXT: str{{.*}}w9, [x0] # CHECK-ASM-NOT: bl{{.*}}: # CHECK-ASM: ldr{{.*}}x9, [x1] -# CHECK-ASM: str{{.*}}x9, [x0] +# CHECK-ASM-NEXT: str{{.*}}x9, [x0] # CHECK-ASM-NOT: bl{{.*}}: # CHECK-ASM: ldr{{.*}}q16, [x1] -# 
CHECK-ASM: str{{.*}}q16, [x0] +# CHECK-ASM-NEXT: str{{.*}}q16, [x0] # CHECK-ASM-NOT: bl{{.*}}: # CHECK-ASM: ldr{{.*}}q16, [x1] -# CHECK-ASM: str{{.*}}q16, [x0] -# CHECK-ASM: ldr{{.*}}q17, [x1, #0x10] -# CHECK-ASM: str{{.*}}q17, [x0, #0x10] +# CHECK-ASM-NEXT: str{{.*}}q16, [x0] +# CHECK-ASM-NEXT: ldr{{.*}}q17, [x1, #0x10] +# CHECK-ASM-NEXT: str{{.*}}q17, [x0, #0x10] # CHECK-ASM-NOT: bl{{.*}}: # CHECK-ASM: ldr{{.*}}q16, [x1] -# CHECK-ASM: str{{.*}}q16, [x0] -# CHECK-ASM: ldr{{.*}}q16, [x1, #0x10] -# CHECK-ASM: str{{.*}}q16, [x0, #0x10] -# CHECK-ASM: ldr{{.*}}w9, [x1, #0x20] -# CHECK-ASM: str{{.*}}w9, [x0, #0x20] -# CHECK-ASM: ldrb{{.*}}w9, [x1, #0x24] -# CHECK-ASM: strb{{.*}}w9, [x0, #0x24] +# CHECK-ASM-NEXT: str{{.*}}q16, [x0] +# CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x10] +# CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x10] +# CHECK-ASM-NEXT: ldr{{.*}}w9, [x1, #0x20] +# CHECK-ASM-NEXT: str{{.*}}w9, [x0, #0x20] +# CHECK-ASM-NEXT: ldrb{{.*}}w9, [x1, #0x24] +# CHECK-ASM-NEXT: strb{{.*}}w9, [x0, #0x24] # CHECK-ASM-NOT: bl{{.*}}: # CHECK-ASM: ldr{{.*}}w9, [x1] -# CHECK-ASM: str{{.*}}w9, [x0] -# CHECK-ASM: add{{.*}}x0, x0, #0x4 +# CHECK-ASM-NEXT: str{{.*}}w9, [x0] +# CHECK-ASM-NEXT: add{{.*}}x0, x0, #0x4 # CHECK-ASM-NOT: bl{{.*}}<_memcpy8 # Complex function with caller-saved X9 should inline 8-byte memcpy using X9 as temp register # CHECK-ASM-LABEL: : # CHECK-ASM: ldr{{.*}}x9, [x1] -# CHECK-ASM: str{{.*}}x9, [x0] +# CHECK-ASM-NEXT: str{{.*}}x9, [x0] # CHECK-ASM-NOT: bl{{.*}}: # CHECK-ASM: ldr{{.*}}q16, [x1] -# CHECK-ASM: str{{.*}}q16, [x0] -# CHECK-ASM: ldr{{.*}}q16, [x1, #0x10] -# CHECK-ASM: str{{.*}}q16, [x0, #0x10] -# CHECK-ASM: ldr{{.*}}q16, [x1, #0x20] -# CHECK-ASM: str{{.*}}q16, [x0, #0x20] -# CHECK-ASM: ldr{{.*}}q16, [x1, #0x30] -# CHECK-ASM: str{{.*}}q16, [x0, #0x30] +# CHECK-ASM-NEXT: str{{.*}}q16, [x0] +# CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x10] +# CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x10] +# CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x20] +# CHECK-ASM-NEXT: str{{.*}}q16, 
[x0, #0x20] +# CHECK-ASM-NEXT: ldr{{.*}}q16, [x1, #0x30] +# CHECK-ASM-NEXT: str{{.*}}q16, [x0, #0x30] # CHECK-ASM-NOT: bl{{.*}} Date: Thu, 4 Sep 2025 09:58:09 -0700 Subject: [PATCH 22/26] [test] updated negative test to check for negative size --- bolt/test/runtime/AArch64/inline-memcpy.s | 54 ++++------------------- 1 file changed, 8 insertions(+), 46 deletions(-) diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s index 3222935b74fef..ee934bc50dbd5 100644 --- a/bolt/test/runtime/AArch64/inline-memcpy.s +++ b/bolt/test/runtime/AArch64/inline-memcpy.s @@ -85,8 +85,8 @@ # CHECK-ASM-LABEL: : # CHECK-ASM: bl{{.*}}: +# Negative size should NOT be inlined (invalid size parameter) +# CHECK-ASM-LABEL: : # CHECK-ASM: bl{{.*}} Date: Thu, 4 Sep 2025 10:04:51 -0700 Subject: [PATCH 23/26] [nfc] minor refactor --- bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 707856b5874ea..9e1cec4c14a93 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -2598,10 +2598,6 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { return 4; } - InstructionListType createInlineMemcpy(bool ReturnEnd) const override { - return createInlineMemcpy(ReturnEnd, std::nullopt); - } - std::optional extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const override { // Match MOVZ instructions (both X and W register variants) with no shift. 
@@ -2616,8 +2612,11 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { std::optional findMemcpySizeInBytes(const BinaryBasicBlock &BB, BinaryBasicBlock::iterator CallInst) const override { - BitVector WrittenRegs(RegInfo->getNumRegs()); MCPhysReg SizeReg = getIntArgRegister(2); + if (SizeReg == getNoRegister()) + return std::nullopt; + + BitVector WrittenRegs(RegInfo->getNumRegs()); const BitVector &SizeRegAliases = getAliases(SizeReg); for (auto InstIt = BB.begin(); InstIt != CallInst; ++InstIt) { @@ -2625,9 +2624,8 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { WrittenRegs.reset(); getWrittenRegs(Inst, WrittenRegs); - if (SizeReg != getNoRegister() && WrittenRegs.anyCommon(SizeRegAliases); - auto ExtractedSize = extractMoveImmediate(Inst, SizeReg)) - return *ExtractedSize; + if (WrittenRegs.anyCommon(SizeRegAliases)) + return extractMoveImmediate(Inst, SizeReg); } return std::nullopt; } @@ -2635,6 +2633,8 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { InstructionListType createInlineMemcpy(bool ReturnEnd, std::optional KnownSize) const override { + assert(KnownSize.has_value() && + "AArch64 memcpy inlining requires known size"); InstructionListType Code; uint64_t Size = *KnownSize; From 62b871ec4204cd629e2a59e6f07f291c009c0f0a Mon Sep 17 00:00:00 2001 From: Yafet Beyene Date: Thu, 4 Sep 2025 10:16:34 -0700 Subject: [PATCH 24/26] [bug] memcpy call removed for sizes>64 --- bolt/lib/Passes/BinaryPasses.cpp | 2 +- bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp | 3 +-- bolt/test/runtime/AArch64/inline-memcpy.s | 9 ++++----- 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index d40f5fb78c7f3..2f1bb21bc1fd8 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -1871,7 +1871,7 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) { std::optional KnownSize = BC.MIB->findMemcpySizeInBytes(BB, II); - if (BC.isAArch64() && 
!KnownSize.has_value()) + if (BC.isAArch64() && (!KnownSize.has_value() || *KnownSize > 64)) continue; const InstructionListType NewCode = diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 9e1cec4c14a93..bcc9809b52fab 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -2706,8 +2706,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { Remaining -= OpSize; Offset += OpSize; } - } else - Code.clear(); + } break; } return Code; diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s index ee934bc50dbd5..e0072f38db2d2 100644 --- a/bolt/test/runtime/AArch64/inline-memcpy.s +++ b/bolt/test/runtime/AArch64/inline-memcpy.s @@ -7,8 +7,8 @@ # RUN: llvm-bolt %t.exe --inline-memcpy -o %t.bolt 2>&1 | FileCheck %s --check-prefix=CHECK-INLINE # RUN: llvm-objdump -d %t.bolt | FileCheck %s --check-prefix=CHECK-ASM -# Verify BOLT reports that it inlined memcpy calls (12 successful inlines out of 16 total calls) -# CHECK-INLINE: BOLT-INFO: inlined 12 memcpy() calls +# Verify BOLT reports that it inlined memcpy calls (11 successful inlines out of 16 total calls) +# CHECK-INLINE: BOLT-INFO: inlined 11 memcpy() calls # Each function should use optimal size-specific instructions and NO memcpy calls @@ -68,10 +68,9 @@ # CHECK-ASM-NOT: str # CHECK-ASM-NOT: bl{{.*}}: -# CHECK-ASM-NOT: bl{{.*}}: From dcab6acd61085456c885d0d8f76d99138829d25e Mon Sep 17 00:00:00 2001 From: Yafet Beyene Date: Fri, 5 Sep 2025 09:16:48 -0700 Subject: [PATCH 25/26] [nfc][test] reordered test --- bolt/test/runtime/AArch64/inline-memcpy.s | 36 +++++++++++------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/bolt/test/runtime/AArch64/inline-memcpy.s b/bolt/test/runtime/AArch64/inline-memcpy.s index e0072f38db2d2..dc59a08b889a7 100644 --- a/bolt/test/runtime/AArch64/inline-memcpy.s +++ 
b/bolt/test/runtime/AArch64/inline-memcpy.s @@ -68,6 +68,10 @@ # CHECK-ASM-NOT: str # CHECK-ASM-NOT: bl{{.*}}: +# CHECK-ASM: bl{{.*}}: # CHECK-ASM: bl{{.*}}: # CHECK-ASM: bl{{.*}}: -# CHECK-ASM: bl{{.*}}: # CHECK-ASM: ldr{{.*}}w9, [x1] @@ -218,6 +218,20 @@ test_0_byte: ret .size test_0_byte, .-test_0_byte + .globl test_negative_size + .type test_negative_size,@function +test_negative_size: + # Negative size should not be inlined + stp x29, x30, [sp, #-32]! + mov x29, sp + add x1, sp, #16 + add x0, sp, #8 + mov x2, #-1 + bl memcpy + ldp x29, x30, [sp], #32 + ret + .size test_negative_size, .-test_negative_size + .globl test_128_byte_too_large .type test_128_byte_too_large,@function test_128_byte_too_large: @@ -273,20 +287,6 @@ test_live_in_negative: ret .size test_live_in_negative, .-test_live_in_negative - .globl test_negative_size - .type test_negative_size,@function -test_negative_size: - # Negative size should not be inlined - stp x29, x30, [sp, #-32]! - mov x29, sp - add x1, sp, #16 - add x0, sp, #8 - mov x2, #-1 - bl memcpy - ldp x29, x30, [sp], #32 - ret - .size test_negative_size, .-test_negative_size - .globl test_memcpy8_4_byte .type test_memcpy8_4_byte,@function test_memcpy8_4_byte: From 875156e6bf82cb3e9ba27df0bf541374350ff69e Mon Sep 17 00:00:00 2001 From: Yafet Beyene Date: Fri, 5 Sep 2025 09:18:20 -0700 Subject: [PATCH 26/26] [nfc] added assert for default case (future-proofing for changes to BinaryPasses.cpp) --- .../Target/AArch64/AArch64MCPlusBuilder.cpp | 42 ++++++++++--------- 1 file changed, 22 insertions(+), 20 deletions(-) diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index bcc9809b52fab..eb402a5681c53 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -2687,26 +2687,28 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { break; default: - if (Size <= 64) { - // For sizes up to 64 bytes, greedily use the 
largest possible loads. - uint64_t Remaining = Size; - uint64_t Offset = 0; - - const std::array, 5> - LoadStoreOps = { - {{16, AArch64::LDRQui, AArch64::STRQui, AArch64::Q16}, - {8, AArch64::LDRXui, AArch64::STRXui, AArch64::X9}, - {4, AArch64::LDRWui, AArch64::STRWui, AArch64::W9}, - {2, AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9}, - {1, AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9}}}; - - for (const auto &[OpSize, LoadOp, StoreOp, TempReg] : LoadStoreOps) - while (Remaining >= OpSize) { - AddLoadStorePair(LoadOp, StoreOp, TempReg, Offset / OpSize); - Remaining -= OpSize; - Offset += OpSize; - } - } + // For sizes up to 64 bytes, greedily use the largest possible loads. + // Caller should have already filtered out sizes > 64 bytes. + assert(Size <= 64 && + "Size should be <= 64 bytes for AArch64 memcpy inlining"); + + uint64_t Remaining = Size; + uint64_t Offset = 0; + + const std::array, 5> + LoadStoreOps = { + {{16, AArch64::LDRQui, AArch64::STRQui, AArch64::Q16}, + {8, AArch64::LDRXui, AArch64::STRXui, AArch64::X9}, + {4, AArch64::LDRWui, AArch64::STRWui, AArch64::W9}, + {2, AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9}, + {1, AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9}}}; + + for (const auto &[OpSize, LoadOp, StoreOp, TempReg] : LoadStoreOps) + while (Remaining >= OpSize) { + AddLoadStorePair(LoadOp, StoreOp, TempReg, Offset / OpSize); + Remaining -= OpSize; + Offset += OpSize; + } break; } return Code;