Skip to content

Commit 2df621d

Browse files
committed
Merge remote-tracking branch 'origin/main' into data-layout-refactor
2 parents 25ab042 + 8da3ab1 commit 2df621d

File tree

1,981 files changed

+194136
-54391
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,981 files changed

+194136
-54391
lines changed

.ci/all_requirements.txt

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -194,9 +194,9 @@ ml-dtypes==0.5.1 ; python_version < "3.13" \
194194
--hash=sha256:d13755f8e8445b3870114e5b6240facaa7cb0c3361e54beba3e07fa912a6e12b \
195195
--hash=sha256:fd918d4e6a4e0c110e2e05be7a7814d10dc1b95872accbf6512b80a109b71ae1
196196
# via -r mlir/python/requirements.txt
197-
nanobind==2.9.2 \
198-
--hash=sha256:c37957ffd5eac7eda349cff3622ecd32e5ee1244ecc912c99b5bc8188bafd16e \
199-
--hash=sha256:e7608472de99d375759814cab3e2c94aba3f9ec80e62cfef8ced495ca5c27d6e
197+
nanobind==2.7.0 \
198+
--hash=sha256:73b12d0e751d140d6c1bf4b215e18818a8debfdb374f08dc3776ad208d808e74 \
199+
--hash=sha256:f9f1b160580c50dcf37b6495a0fd5ec61dc0d95dae5f8004f87dd9ad7eb46b34
200200
# via -r mlir/python/requirements.txt
201201
numpy==2.0.2 \
202202
--hash=sha256:0123ffdaa88fa4ab64835dcbde75dcdf89c453c922f18dced6e27c90d1d0ec5a \
@@ -383,10 +383,6 @@ swig==4.3.1 \
383383
--hash=sha256:efec16327029f682f649a26da726bb0305be8800bd0f1fa3e81bf0769cf5b476 \
384384
--hash=sha256:fc496c0d600cf1bb2d91e28d3d6eae9c4301e5ea7a0dec5a4281b5efed4245a8
385385
# via -r lldb/test/requirements.txt
386-
typing-extensions==4.15.0 \
387-
--hash=sha256:0cea48d173cc12fa28ecabc3b837ea3cf6f38c6d1136f85cbaaf598984861466 \
388-
--hash=sha256:f0fa19c6845758ab08074a0cfa8b7aecb71c999ca73d62883bc25cc018c4e548
389-
# via -r mlir/python/requirements.txt
390386
urllib3==2.5.0 \
391387
--hash=sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760 \
392388
--hash=sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc

.github/new-prs-labeler.yml

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1090,6 +1090,14 @@ clang:openmp:
10901090
- llvm/unittests/Frontend/OpenMP*
10911091
- llvm/test/Transforms/OpenMP/**
10921092

1093+
clang:temporal-safety:
1094+
- clang/include/clang/Analysis/Analyses/LifetimeSafety*
1095+
- clang/lib/Analysis/LifetimeSafety*
1096+
- clang/unittests/Analysis/LifetimeSafety*
1097+
- clang/test/Sema/*lifetime-safety*
1098+
- clang/test/Sema/*lifetime-analysis*
1099+
- clang/test/Analysis/LifetimeSafety/**
1100+
10931101
clang:as-a-library:
10941102
- clang/tools/libclang/**
10951103
- clang/bindings/**

bolt/docs/CommandLineArgumentReference.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -637,7 +637,7 @@
637637

638638
- `--inline-memcpy`
639639

640-
Inline memcpy using 'rep movsb' instruction (X86-only)
640+
Inline memcpy using optimized instruction sequences (X86: 'rep movsb', AArch64: width-optimized register operations)
641641

642642
- `--inline-small-functions`
643643

bolt/include/bolt/Core/MCPlusBuilder.h

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#ifndef BOLT_CORE_MCPLUSBUILDER_H
1515
#define BOLT_CORE_MCPLUSBUILDER_H
1616

17+
#include "bolt/Core/BinaryBasicBlock.h"
1718
#include "bolt/Core/MCPlus.h"
1819
#include "bolt/Core/Relocation.h"
1920
#include "llvm/ADT/ArrayRef.h"
@@ -1902,13 +1903,38 @@ class MCPlusBuilder {
19021903
return {};
19031904
}
19041905

1906+
/// Find memcpy size in bytes by using preceding instructions.
1907+
/// Returns std::nullopt if size cannot be determined (no-op for most
1908+
/// targets).
1909+
virtual std::optional<uint64_t>
1910+
findMemcpySizeInBytes(const BinaryBasicBlock &BB,
1911+
BinaryBasicBlock::iterator CallInst) const {
1912+
return std::nullopt;
1913+
}
1914+
19051915
/// Creates inline memcpy instruction. If \p ReturnEnd is true, then return
19061916
/// (dest + n) instead of dest.
19071917
virtual InstructionListType createInlineMemcpy(bool ReturnEnd) const {
19081918
llvm_unreachable("not implemented");
19091919
return {};
19101920
}
19111921

1922+
/// Creates size-aware inline memcpy instruction. If \p KnownSize is provided,
1923+
/// generates optimized code for that specific size. Falls back to regular
1924+
/// createInlineMemcpy if size is unknown or not needed (e.g. with X86).
1925+
virtual InstructionListType
1926+
createInlineMemcpy(bool ReturnEnd, std::optional<uint64_t> KnownSize) const {
1927+
return createInlineMemcpy(ReturnEnd);
1928+
}
1929+
1930+
/// Extract immediate value from move instruction that sets the given
1931+
/// register. Returns the immediate value if the instruction is a
1932+
/// move-immediate to TargetReg.
1933+
virtual std::optional<uint64_t>
1934+
extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const {
1935+
return std::nullopt;
1936+
}
1937+
19121938
/// Create a target-specific relocation out of the \p Fixup.
19131939
/// Note that not every fixup could be converted into a relocation.
19141940
virtual std::optional<Relocation>

bolt/lib/Passes/BinaryPasses.cpp

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,12 @@ static cl::opt<DynoStatsSortOrder> DynoStatsSortOrderOpt(
6060
"print-sorted-by-order",
6161
cl::desc("use ascending or descending order when printing functions "
6262
"ordered by dyno stats"),
63-
cl::init(DynoStatsSortOrder::Descending), cl::cat(BoltOptCategory));
63+
cl::init(DynoStatsSortOrder::Descending),
64+
cl::values(clEnumValN(DynoStatsSortOrder::Ascending, "ascending",
65+
"Ascending order"),
66+
clEnumValN(DynoStatsSortOrder::Descending, "descending",
67+
"Descending order")),
68+
cl::cat(BoltOptCategory));
6469

6570
cl::list<std::string>
6671
HotTextMoveSections("hot-text-move-sections",
@@ -1843,7 +1848,7 @@ Error StripRepRet::runOnFunctions(BinaryContext &BC) {
18431848
}
18441849

18451850
Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
1846-
if (!BC.isX86())
1851+
if (!BC.isX86() && !BC.isAArch64())
18471852
return Error::success();
18481853

18491854
uint64_t NumInlined = 0;
@@ -1866,8 +1871,16 @@ Error InlineMemcpy::runOnFunctions(BinaryContext &BC) {
18661871
const bool IsMemcpy8 = (CalleeSymbol->getName() == "_memcpy8");
18671872
const bool IsTailCall = BC.MIB->isTailCall(Inst);
18681873

1874+
// Extract size from preceding instructions (AArch64 only).
1875+
// Pattern: MOV X2, #nb-bytes; BL memcpy (X0 = dest, X1 = src, X2 = size).
1876+
std::optional<uint64_t> KnownSize =
1877+
BC.MIB->findMemcpySizeInBytes(BB, II);
1878+
1879+
if (BC.isAArch64() && (!KnownSize.has_value() || *KnownSize > 64))
1880+
continue;
1881+
18691882
const InstructionListType NewCode =
1870-
BC.MIB->createInlineMemcpy(IsMemcpy8);
1883+
BC.MIB->createInlineMemcpy(IsMemcpy8, KnownSize);
18711884
II = BB.replaceInstruction(II, NewCode);
18721885
std::advance(II, NewCode.size() - 1);
18731886
if (IsTailCall) {

bolt/lib/Rewrite/BinaryPassManager.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,9 @@ static cl::opt<bool> Stoke("stoke", cl::desc("turn on the stoke analysis"),
248248

249249
static cl::opt<bool> StringOps(
250250
"inline-memcpy",
251-
cl::desc("inline memcpy using 'rep movsb' instruction (X86-only)"),
251+
cl::desc(
252+
"inline memcpy using size-specific optimized instructions "
253+
"(X86: 'rep movsb', AArch64: width-optimized register operations)"),
252254
cl::cat(BoltOptCategory));
253255

254256
static cl::opt<bool> StripRepRet(

bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp

Lines changed: 118 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2517,21 +2517,17 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
25172517
createInstrIncMemory(const MCSymbol *Target, MCContext *Ctx, bool IsLeaf,
25182518
unsigned CodePointerSize) const override {
25192519
unsigned int I = 0;
2520-
InstructionListType Instrs(10);
2520+
InstructionListType Instrs(6);
25212521

25222522
createPushRegisters(Instrs[I++], AArch64::X0, AArch64::X1);
2523-
getSystemFlag(Instrs[I++], AArch64::X1);
25242523
InstructionListType Addr = materializeAddress(Target, Ctx, AArch64::X0);
25252524
assert(Addr.size() == 2 && "Invalid Addr size");
25262525
std::copy(Addr.begin(), Addr.end(), Instrs.begin() + I);
25272526
I += Addr.size();
2528-
storeReg(Instrs[I++], AArch64::X2, AArch64::SP);
2529-
InstructionListType Insts = createIncMemory(AArch64::X0, AArch64::X2);
2527+
InstructionListType Insts = createIncMemory(AArch64::X0, AArch64::X1);
25302528
assert(Insts.size() == 2 && "Invalid Insts size");
25312529
std::copy(Insts.begin(), Insts.end(), Instrs.begin() + I);
25322530
I += Insts.size();
2533-
loadReg(Instrs[I++], AArch64::X2, AArch64::SP);
2534-
setSystemFlag(Instrs[I++], AArch64::X1);
25352531
createPopRegisters(Instrs[I++], AArch64::X0, AArch64::X1);
25362532
return Instrs;
25372533
}
@@ -2620,6 +2616,122 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
26202616
getInstructionSize(const MCInst &Inst) const override {
26212617
return 4;
26222618
}
2619+
2620+
std::optional<uint64_t>
2621+
extractMoveImmediate(const MCInst &Inst, MCPhysReg TargetReg) const override {
2622+
// Match MOVZ instructions (both X and W register variants) with no shift.
2623+
if ((Inst.getOpcode() == AArch64::MOVZXi ||
2624+
Inst.getOpcode() == AArch64::MOVZWi) &&
2625+
Inst.getOperand(2).getImm() == 0 &&
2626+
getAliases(TargetReg)[Inst.getOperand(0).getReg()])
2627+
return Inst.getOperand(1).getImm();
2628+
return std::nullopt;
2629+
}
2630+
2631+
std::optional<uint64_t>
2632+
findMemcpySizeInBytes(const BinaryBasicBlock &BB,
2633+
BinaryBasicBlock::iterator CallInst) const override {
2634+
MCPhysReg SizeReg = getIntArgRegister(2);
2635+
if (SizeReg == getNoRegister())
2636+
return std::nullopt;
2637+
2638+
BitVector WrittenRegs(RegInfo->getNumRegs());
2639+
const BitVector &SizeRegAliases = getAliases(SizeReg);
2640+
2641+
for (auto InstIt = BB.begin(); InstIt != CallInst; ++InstIt) {
2642+
const MCInst &Inst = *InstIt;
2643+
WrittenRegs.reset();
2644+
getWrittenRegs(Inst, WrittenRegs);
2645+
2646+
if (WrittenRegs.anyCommon(SizeRegAliases))
2647+
return extractMoveImmediate(Inst, SizeReg);
2648+
}
2649+
return std::nullopt;
2650+
}
2651+
2652+
/// Build the inline replacement for a memcpy call of \p KnownSize bytes.
///
/// \p KnownSize must hold a value (checked by assertion). The copy itself
/// is produced by generateSizeSpecificMemcpy. When \p ReturnEnd is true
/// (the _memcpy8 flavour) an extra ADD advances X0 by the size so that
/// dest + size is returned instead of dest.
InstructionListType
createInlineMemcpy(bool ReturnEnd,
                   std::optional<uint64_t> KnownSize) const override {
  assert(KnownSize.has_value() &&
         "AArch64 memcpy inlining requires known size");
  const uint64_t NumBytes = *KnownSize;

  InstructionListType Seq;
  generateSizeSpecificMemcpy(Seq, NumBytes);
  if (!ReturnEnd)
    return Seq;

  // X0 = X0 + NumBytes, no shift applied to the immediate.
  Seq.emplace_back(MCInstBuilder(AArch64::ADDXri)
                       .addReg(AArch64::X0)
                       .addReg(AArch64::X0)
                       .addImm(NumBytes)
                       .addImm(0));
  return Seq;
}
2671+
2672+
/// Append to \p Code the load/store pairs that copy \p Size bytes from the
/// address in X1 to the address in X0, and return a copy of \p Code.
///
/// A 32-byte copy uses two 16-byte pairs through Q16 and Q17. Every other
/// size is decomposed greedily into 16/8/4/2/1-byte chunks, largest first,
/// going through Q16 (16 bytes), X9 (8 bytes) or W9 (4/2/1 bytes). The
/// caller is expected to have rejected sizes above 64 bytes (asserted).
InstructionListType generateSizeSpecificMemcpy(InstructionListType &Code,
                                               uint64_t Size) const {
  // Emit "load Reg from [X1 + Imm]" followed by "store Reg to [X0 + Imm]".
  // Imm is passed in units of the access width, which is why callers hand
  // in the byte offset divided by the chunk width.
  const auto EmitCopy = [&Code](unsigned LoadOpc, unsigned StoreOpc,
                                unsigned Reg, unsigned ScaledImm) {
    Code.emplace_back(MCInstBuilder(LoadOpc)
                          .addReg(Reg)
                          .addReg(AArch64::X1)
                          .addImm(ScaledImm));
    Code.emplace_back(MCInstBuilder(StoreOpc)
                          .addReg(Reg)
                          .addReg(AArch64::X0)
                          .addImm(ScaledImm));
  };

  // 32 bytes is the only size not produced by the greedy path: it uses a
  // distinct register for the second 16-byte pair.
  if (Size == 32) {
    EmitCopy(AArch64::LDRQui, AArch64::STRQui, AArch64::Q16, 0);
    EmitCopy(AArch64::LDRQui, AArch64::STRQui, AArch64::Q17, 1);
    return Code;
  }

  assert(Size <= 64 &&
         "Size should be <= 64 bytes for AArch64 memcpy inlining");

  struct CopyStep {
    uint64_t Width;
    unsigned LoadOpc;
    unsigned StoreOpc;
    unsigned Reg;
  };
  const CopyStep Steps[] = {
      {16, AArch64::LDRQui, AArch64::STRQui, AArch64::Q16},
      {8, AArch64::LDRXui, AArch64::STRXui, AArch64::X9},
      {4, AArch64::LDRWui, AArch64::STRWui, AArch64::W9},
      {2, AArch64::LDRHHui, AArch64::STRHHui, AArch64::W9},
      {1, AArch64::LDRBBui, AArch64::STRBBui, AArch64::W9}};

  // Copied counts the bytes already covered and doubles as the byte offset
  // of the next chunk.
  uint64_t Copied = 0;
  for (const CopyStep &Step : Steps)
    for (; Size - Copied >= Step.Width; Copied += Step.Width)
      EmitCopy(Step.LoadOpc, Step.StoreOpc, Step.Reg, Copied / Step.Width);

  return Code;
}
26232735
};
26242736

26252737
} // end anonymous namespace
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Check that --print-sorted-by-order=<ascending/descending> option works properly in llvm-bolt
2+
#
3+
# RUN: llvm-mc -filetype=obj -triple aarch64-unknown-unknown %s -o %t.o
4+
# RUN: %clang %cflags -fPIC -pie %t.o -o %t.exe -nostdlib -Wl,-q
5+
# RUN: link_fdata %s %t.o %t.fdata
6+
# RUN: llvm-bolt %t.exe -o %t.bolt --print-sorted-by=all --print-sorted-by-order=ascending \
7+
# RUN: --data %t.fdata | FileCheck %s -check-prefix=CHECK-ASCEND
8+
# RUN: llvm-bolt %t.exe -o %t.bolt --print-sorted-by=all --print-sorted-by-order=descending \
9+
# RUN: --data %t.fdata | FileCheck %s -check-prefix=CHECK-DESCEND
10+
11+
# CHECK-ASCEND: BOLT-INFO: top functions sorted by dyno stats are:
12+
# CHECK-ASCEND-NEXT: bar
13+
# CHECK-ASCEND-NEXT: foo
14+
# CHECK-DESCEND: BOLT-INFO: top functions sorted by dyno stats are:
15+
# CHECK-DESCEND-NEXT: foo
16+
# CHECK-DESCEND-NEXT: bar
17+
18+
.text
19+
.align 4
20+
.global bar
21+
.type bar, %function
22+
bar:
23+
mov w0, wzr
24+
ret
25+
26+
.global foo
27+
.type foo, %function
28+
foo:
29+
# FDATA: 1 foo 0 1 bar 0 0 1
30+
bl bar
31+
ret

0 commit comments

Comments
 (0)