Skip to content

Commit 0ff13d8

Browse files
committed
split got optimization and relax and fix the range of tryGotToPCRel
1 parent ae010c8 commit 0ff13d8

File tree

2 files changed

+222
-26
lines changed

2 files changed

+222
-26
lines changed

lld/ELF/Arch/LoongArch.cpp

Lines changed: 74 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1154,7 +1154,7 @@ void LoongArch::tlsdescToLe(uint8_t *loc, const Relocation &rel,
11541154
}
11551155
}
11561156

1157-
// Try GOT indirection to PC relative optimization when relaxation is enabled.
1157+
// Try GOT indirection to PC relative optimization.
11581158
// From:
11591159
// * pcalau12i $a0, %got_pc_hi20(sym_got)
11601160
// * ld.w/d $a0, $a0, %got_pc_lo12(sym_got)
@@ -1167,28 +1167,49 @@ void LoongArch::tlsdescToLe(uint8_t *loc, const Relocation &rel,
11671167
// complexity.
11681168
bool LoongArch::tryGotToPCRel(uint8_t *loc, const Relocation &rHi20,
11691169
const Relocation &rLo12, uint64_t secAddr) const {
1170-
if (!rHi20.sym->isDefined() || rHi20.sym->isPreemptible ||
1171-
rHi20.sym->isGnuIFunc() ||
1172-
(ctx.arg.isPic && !cast<Defined>(*rHi20.sym).section))
1170+
// Check if the relocations apply to consecutive instructions.
1171+
if (rHi20.offset + 4 != rLo12.offset)
11731172
return false;
11741173

1175-
Symbol &sym = *rHi20.sym;
1176-
uint64_t symLocal = sym.getVA(ctx) + rHi20.addend;
1177-
// Check if the address difference is within +/-2GB range.
1178-
// For simplicity, the range mentioned here is an approximate estimate and is
1179-
// not fully equivalent to the entire region that PC-relative addressing can
1180-
// cover.
1181-
int64_t pageOffset =
1182-
getLoongArchPage(symLocal) - getLoongArchPage(secAddr + rHi20.offset);
1183-
if (!isInt<20>(pageOffset >> 12))
1174+
// Check if the relocations reference the same symbol and skip undefined,
1175+
// preemptible and STT_GNU_IFUNC symbols.
1176+
if (!rHi20.sym || rHi20.sym != rLo12.sym || !rHi20.sym->isDefined() ||
1177+
rHi20.sym->isPreemptible || rHi20.sym->isGnuIFunc())
1178+
return false;
1179+
1180+
// GOT references to absolute symbols can't be relaxed to use PCALAU12I/ADDI
1181+
// in position-independent code because these instructions produce a relative
1182+
// address.
1183+
if ((ctx.arg.isPic && !cast<Defined>(*rHi20.sym).section))
1184+
return false;
1185+
1186+
// Check if the addends of both relocations are zero.
1187+
if (rHi20.addend != 0 || rLo12.addend != 0)
11841188
return false;
11851189

11861190
const uint32_t currInsn = read32le(loc);
11871191
const uint32_t nextInsn = read32le(loc + 4);
1192+
const uint32_t ldOpcode = ctx.arg.is64 ? LD_D : LD_W;
1193+
// Check if the first instruction is PCALAU12I and the second instruction is
1194+
// LD.
1195+
if ((currInsn & 0xfe000000) != PCALAU12I ||
1196+
(nextInsn & 0xffc00000) != ldOpcode)
1197+
return false;
1198+
11881199
// Check if both instructions use the same register.
11891200
if (getD5(currInsn) != getJ5(nextInsn) || getJ5(nextInsn) != getD5(nextInsn))
11901201
return false;
11911202

1203+
Symbol &sym = *rHi20.sym;
1204+
uint64_t symLocal = sym.getVA(ctx);
1205+
const int64_t displace = symLocal - getLoongArchPage(secAddr + rHi20.offset);
1206+
// Check if the symbol address is in
1207+
// [(PC & ~0xfff) - 2GiB - 0x800, (PC & ~0xfff) + 2GiB - 0x800).
1208+
const int64_t underflow = -0x80000000LL - 0x800;
1209+
const int64_t overflow = 0x80000000LL - 0x800;
1210+
if (!(displace >= underflow && displace < overflow))
1211+
return false;
1212+
11921213
Relocation newRHi20 = {RE_LOONGARCH_PAGE_PC, R_LARCH_PCALA_HI20, rHi20.offset,
11931214
rHi20.addend, &sym};
11941215
Relocation newRLo12 = {R_ABS, R_LARCH_PCALA_LO12, rLo12.offset, rLo12.addend,
@@ -1222,6 +1243,30 @@ RelExpr LoongArch::adjustTlsExpr(RelType type, RelExpr expr) const {
12221243
return expr;
12231244
}
12241245

1246+
static bool pairForGotRels(ArrayRef<Relocation> relocs) {
1247+
// Returns true iff every R_LARCH_GOT_PC_HI20 in `relocs` is paired with a
1248+
// following R_LARCH_GOT_PC_LO12 and no R_LARCH_GOT_PC_LO12 appears unpaired.
1249+
size_t i = 0;
1250+
const size_t size = relocs.size();
1251+
for (; i != size; ++i) {
1252+
if (relocs[i].type == R_LARCH_GOT_PC_HI20) {
1253+
if (i + 1 < size && relocs[i + 1].type == R_LARCH_GOT_PC_LO12) {
1254+
++i; // Skip the LO12 that immediately follows this HI20.
1255+
continue;
1256+
}
1257+
if (relaxable(relocs, i) && i + 2 < size &&
1258+
relocs[i + 2].type == R_LARCH_GOT_PC_LO12) {
1259+
i += 2; // Skip the entry at i+1 (presumably R_LARCH_RELAX) and the LO12.
1260+
continue;
1261+
}
1262+
break; // HI20 with no matching LO12 at i+1 or i+2.
1263+
} else if (relocs[i].type == R_LARCH_GOT_PC_LO12) {
1264+
break; // LO12 that was not consumed as part of a preceding HI20 pair.
1265+
}
1266+
}
1267+
return i == size; // True only when the scan finished without hitting a break.
1268+
}
1269+
12251270
void LoongArch::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
12261271
const unsigned bits = ctx.arg.is64 ? 64 : 32;
12271272
uint64_t secAddr = sec.getOutputSection()->addr;
@@ -1231,6 +1276,7 @@ void LoongArch::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
12311276
secAddr += ehIn->getParent()->outSecOff;
12321277
bool isExtreme = false, isRelax = false;
12331278
const MutableArrayRef<Relocation> relocs = sec.relocs();
1279+
const bool isPairForGotRels = pairForGotRels(relocs);
12341280
for (size_t i = 0, size = relocs.size(); i != size; ++i) {
12351281
Relocation &rel = relocs[i];
12361282
uint8_t *loc = buf + rel.offset;
@@ -1315,19 +1361,21 @@ void LoongArch::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
13151361
}
13161362
continue;
13171363
case RE_LOONGARCH_GOT_PAGE_PC:
1318-
// In LoongArch, we try GOT indirection to PC relative optimization only
1319-
// when relaxation is enabled. This approach avoids determining whether
1320-
// relocation types are paired and whether the destination register of
1321-
// pcalau12i is only used by the immediately following instruction.
1322-
// Moreover, if the original code sequence can be relaxed to a single
1323-
// instruction `pcaddi`, the first instruction will be removed and it will
1324-
// not reach here.
1325-
if (isPairRelaxable(relocs, i) && rel.type == R_LARCH_GOT_PC_HI20 &&
1326-
relocs[i + 2].type == R_LARCH_GOT_PC_LO12 &&
1327-
tryGotToPCRel(loc, rel, relocs[i + 2], secAddr)) {
1328-
i = i + 3; // skip relocations R_LARCH_RELAX, R_LARCH_GOT_PC_LO12,
1329-
// R_LARCH_RELAX
1330-
continue;
1364+
// In LoongArch, we try GOT indirection to PC relative optimization in
1365+
// normal or medium code model, with or without an accompanying R_LARCH_RELAX
1366+
// relocation. Moreover, if the original code sequence can be relaxed to a
1367+
// single instruction `pcaddi`, the first instruction will be removed and
1368+
// it will not reach here.
1369+
if (isPairForGotRels && rel.type == R_LARCH_GOT_PC_HI20) {
1370+
bool isRelax = relaxable(relocs, i);
1371+
const Relocation lo12Rel = isRelax ? relocs[i + 2] : relocs[i + 1];
1372+
if (lo12Rel.type == R_LARCH_GOT_PC_LO12 &&
1373+
tryGotToPCRel(loc, rel, lo12Rel, secAddr)) {
1374+
// isRelax: skip relocations R_LARCH_RELAX, R_LARCH_GOT_PC_LO12
1375+
// !isRelax: skip relocation R_LARCH_GOT_PC_LO12
1376+
i += isRelax ? 2 : 1;
1377+
continue;
1378+
}
13311379
}
13321380
break;
13331381
default:
Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
# REQUIRES: loongarch
2+
# RUN: rm -rf %t && split-file %s %t && cd %t
3+
4+
# RUN: llvm-mc --filetype=obj --triple=loongarch64 a.s -o a.o
5+
# RUN: llvm-mc --filetype=obj --triple=loongarch64 unpaired.s -o unpaired.o
6+
# RUN: llvm-mc --filetype=obj --triple=loongarch64 lone-ldr.s -o lone-ldr.o
7+
8+
# RUN: ld.lld a.o -T within-range.t -o a
9+
# RUN: llvm-objdump -d --no-show-raw-insn a | FileCheck %s
10+
11+
## This test verifies the encoding when the register $a0 is used.
12+
# CHECK: pcalau12i $a0, 0
13+
# CHECK-NEXT: addi.d $a0, $a0, -2048
14+
15+
## PCALAU12I contains a nonzero addend, no relaxations should be applied.
16+
# CHECK-NEXT: pcalau12i $a1, 2
17+
# CHECK-NEXT: ld.d $a1, $a1, -2048
18+
19+
## LD contains a nonzero addend, no relaxations should be applied.
20+
# CHECK-NEXT: pcalau12i $a2, 2
21+
# CHECK-NEXT: ld.d $a2, $a2, -2040
22+
23+
## PCALAU12I and LD use different registers, no relaxations should be applied.
24+
# CHECK-NEXT: pcalau12i $a3, 2
25+
# CHECK-NEXT: ld.d $a4, $a3, -2048
26+
27+
## PCALAU12I and LD use different registers, no relaxations should be applied.
28+
# CHECK-NEXT: pcalau12i $a5, 2
29+
# CHECK-NEXT: ld.d $a5, $a6, -2048
30+
31+
# RUN: ld.lld a.o -T underflow-range.t -o a
32+
# RUN: llvm-objdump -d --no-show-raw-insn a | FileCheck --check-prefix=OUTRANGE %s
33+
34+
# RUN: ld.lld a.o -T overflow-range.t -o a
35+
# RUN: llvm-objdump -d --no-show-raw-insn a | FileCheck --check-prefix=OUTRANGE %s
36+
37+
# OUTRANGE: pcalau12i $a0, 1
38+
# OUTRANGE-NEXT: ld.d $a0, $a0, 0
39+
40+
## Relocations do not appear in pairs, no relaxations should be applied.
41+
# RUN: ld.lld unpaired.o -T within-range.t -o unpaired
42+
# RUN: llvm-objdump --no-show-raw-insn -d unpaired | FileCheck --check-prefix=UNPAIRED %s
43+
44+
# UNPAIRED: pcalau12i $a0, 2
45+
# UNPAIRED-NEXT: b 8
46+
# UNPAIRED-NEXT: pcalau12i $a0, 2
47+
# UNPAIRED: ld.d $a0, $a0, -2048
48+
49+
## Relocations do not appear in pairs, no relaxations should be applied.
50+
# RUN: ld.lld lone-ldr.o -T within-range.t -o lone-ldr
51+
# RUN: llvm-objdump --no-show-raw-insn -d lone-ldr | FileCheck --check-prefix=LONE-LDR %s
52+
53+
# LONE-LDR: ld.d $a0, $a0, -2048
54+
55+
## 32-bit code is mostly the same. We only test a few variants.
56+
# RUN: llvm-mc --filetype=obj --triple=loongarch32 a.32.s -o a.32.o
57+
# RUN: ld.lld a.32.o -T within-range.t -o a32
58+
# RUN: llvm-objdump -d --no-show-raw-insn a32 | FileCheck --check-prefix=CHECK32 %s
59+
60+
## This test verifies the encoding when the register $a0 is used.
61+
# CHECK32: pcalau12i $a0, 0
62+
# CHECK32-NEXT: addi.w $a0, $a0, -2048
63+
64+
65+
## This linker script ensures that .rodata and .text are sufficiently close to
66+
## each other so that the pcalau12i + ld pair can be relaxed to pcalau12i + addi.
67+
#--- within-range.t
68+
SECTIONS {
69+
.rodata 0x1800: { *(.rodata) }
70+
.text 0x2800: { *(.text) }
71+
.got 0x3800: { *(.got) }
72+
}
73+
74+
## This linker script ensures that .rodata and .text are sufficiently far apart
75+
## so that the pcalau12i + ld pair cannot be relaxed to pcalau12i + addi.
76+
#--- underflow-range.t
77+
SECTIONS {
78+
.rodata 0x800-4: { *(.rodata) }
79+
.got 0x80002000: { *(.got) }
80+
.text 0x80001000: { *(.text) } /* (0x800-4)+2GB+0x800+4 */
81+
}
82+
83+
#--- overflow-range.t
84+
SECTIONS {
85+
.text 0x1000: { *(.text) }
86+
.got 0x2000: { *(.got) }
87+
.rodate 0x80000800 : { *(.rodata) } /* 0x1000+2GB-0x800 */
88+
}
89+
90+
## This linker script ensures that .rodata and .text are sufficiently (>4GB)
91+
## far apart so that the pcalau12i + ld pair cannot be relaxed to pcalau12i + addi.
92+
93+
#--- a.s
94+
## Symbol 'x' is nonpreemptible, the optimization should be applied.
95+
.rodata
96+
.hidden x
97+
x:
98+
.word 10
99+
100+
.text
101+
.global _start
102+
_start:
103+
pcalau12i $a0, %got_pc_hi20(x)
104+
ld.d $a0, $a0, %got_pc_lo12(x)
105+
pcalau12i $a1, %got_pc_hi20(x+1)
106+
ld.d $a1, $a1, %got_pc_lo12(x)
107+
pcalau12i $a2, %got_pc_hi20(x)
108+
ld.d $a2, $a2, %got_pc_lo12(x+8)
109+
pcalau12i $a3, %got_pc_hi20(x)
110+
ld.d $a4, $a3, %got_pc_lo12(x)
111+
pcalau12i $a5, %got_pc_hi20(x)
112+
ld.d $a5, $a6, %got_pc_lo12(x)
113+
114+
#--- unpaired.s
115+
.text
116+
.hidden x
117+
x:
118+
nop
119+
.global _start
120+
_start:
121+
pcalau12i $a0, %got_pc_hi20(x)
122+
b L
123+
pcalau12i $a0, %got_pc_hi20(x)
124+
L:
125+
ld.d $a0, $a0, %got_pc_lo12(x)
126+
127+
#--- lone-ldr.s
128+
.text
129+
.hidden x
130+
x:
131+
nop
132+
.global _start
133+
_start:
134+
ld.d $a0, $a0, %got_pc_lo12(x)
135+
136+
137+
#--- a.32.s
138+
## Symbol 'x' is nonpreemptible, the optimization should be applied.
139+
.rodata
140+
.hidden x
141+
x:
142+
.word 10
143+
144+
.text
145+
.global _start
146+
_start:
147+
pcalau12i $a0, %got_pc_hi20(x)
148+
ld.w $a0, $a0, %got_pc_lo12(x)

0 commit comments

Comments
 (0)