Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 74 additions & 26 deletions lld/ELF/Arch/LoongArch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1154,7 +1154,7 @@ void LoongArch::tlsdescToLe(uint8_t *loc, const Relocation &rel,
}
}

// Try GOT indirection to PC relative optimization when relaxation is enabled.
// Try GOT indirection to PC relative optimization.
// From:
// * pcalau12i $a0, %got_pc_hi20(sym_got)
// * ld.w/d $a0, $a0, %got_pc_lo12(sym_got)
Expand All @@ -1167,28 +1167,49 @@ void LoongArch::tlsdescToLe(uint8_t *loc, const Relocation &rel,
// complexity.
bool LoongArch::tryGotToPCRel(uint8_t *loc, const Relocation &rHi20,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Only apply this relax when --relax is enabled for lld.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Clarification: After careful consideration, I think we do not need to check the --relax option, because linker relaxation is about reducing the number of instructions, while this PR does not reduce it.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems that the term "linker optimization" is often used when the number of bytes does not change while "linker relaxation" is used when the number of bytes decreases.

While x86-64 and s390x don't have linker relaxation, they do support --no-relax. --no-relax is useful to disable this optimization.

const Relocation &rLo12, uint64_t secAddr) const {
if (!rHi20.sym->isDefined() || rHi20.sym->isPreemptible ||
rHi20.sym->isGnuIFunc() ||
(ctx.arg.isPic && !cast<Defined>(*rHi20.sym).section))
// Check if the relocations apply to consecutive instructions.
if (rHi20.offset + 4 != rLo12.offset)
return false;

Symbol &sym = *rHi20.sym;
uint64_t symLocal = sym.getVA(ctx) + rHi20.addend;
// Check if the address difference is within +/-2GB range.
// For simplicity, the range mentioned here is an approximate estimate and is
// not fully equivalent to the entire region that PC-relative addressing can
// cover.
int64_t pageOffset =
getLoongArchPage(symLocal) - getLoongArchPage(secAddr + rHi20.offset);
if (!isInt<20>(pageOffset >> 12))
// Check if the relocations reference the same symbol and skip undefined,
// preemptible and STT_GNU_IFUNC symbols.
if (!rHi20.sym || rHi20.sym != rLo12.sym || !rHi20.sym->isDefined() ||
rHi20.sym->isPreemptible || rHi20.sym->isGnuIFunc())
return false;

// GOT references to absolute symbols can't be relaxed to use PCALAU12I/ADDI
// in position-independent code because these instructions produce a relative
// address.
if ((ctx.arg.isPic && !cast<Defined>(*rHi20.sym).section))
return false;

// Check if the addends of both relocations are zero.
if (rHi20.addend != 0 || rLo12.addend != 0)
return false;

const uint32_t currInsn = read32le(loc);
const uint32_t nextInsn = read32le(loc + 4);
const uint32_t ldOpcode = ctx.arg.is64 ? LD_D : LD_W;
// Check if the first instruction is PCALAU12I and the second instruction is
// LD.
if ((currInsn & 0xfe000000) != PCALAU12I ||
(nextInsn & 0xffc00000) != ldOpcode)
return false;

// Check if PCALAU12I's destination register is also LD's base and destination register.
if (getD5(currInsn) != getJ5(nextInsn) || getJ5(nextInsn) != getD5(nextInsn))
return false;

Symbol &sym = *rHi20.sym;
uint64_t symLocal = sym.getVA(ctx);
const int64_t displace = symLocal - getLoongArchPage(secAddr + rHi20.offset);
// Check if the symbol address is in
// [(PC & ~0xfff) - 2GiB - 0x800, (PC & ~0xfff) + 2GiB - 0x800).
const int64_t underflow = -0x80000000LL - 0x800;
const int64_t overflow = 0x80000000LL - 0x800;
if (!(displace >= underflow && displace < overflow))
return false;

Relocation newRHi20 = {RE_LOONGARCH_PAGE_PC, R_LARCH_PCALA_HI20, rHi20.offset,
rHi20.addend, &sym};
Relocation newRLo12 = {R_ABS, R_LARCH_PCALA_LO12, rLo12.offset, rLo12.addend,
Expand Down Expand Up @@ -1222,6 +1243,30 @@ RelExpr LoongArch::adjustTlsExpr(RelType type, RelExpr expr) const {
return expr;
}

// Returns true iff every R_LARCH_GOT_PC_HI20 in `relocs` is followed by an
// R_LARCH_GOT_PC_LO12 partner and no R_LARCH_GOT_PC_LO12 shows up on its own.
// Two layouts are accepted for a pair:
//   HI20, LO12          - LO12 is the very next relocation.
//   HI20, <one>, LO12   - only when relaxable(relocs, i) holds for the HI20
//                         (presumably <one> is its R_LARCH_RELAX marker).
// Relocations of any other type are ignored. An empty list yields true.
static bool pairForGotRels(ArrayRef<Relocation> relocs) {
  const size_t count = relocs.size();
  // True when index `k` is in bounds and holds an R_LARCH_GOT_PC_LO12.
  auto isLo12At = [&](size_t k) {
    return k < count && relocs[k].type == R_LARCH_GOT_PC_LO12;
  };

  size_t idx = 0;
  while (idx != count) {
    // A LO12 reached here was not consumed as part of a pair.
    if (relocs[idx].type == R_LARCH_GOT_PC_LO12)
      return false;
    // Unrelated relocation: nothing to verify.
    if (relocs[idx].type != R_LARCH_GOT_PC_HI20) {
      ++idx;
      continue;
    }
    if (isLo12At(idx + 1))
      idx += 2; // Step past HI20 and LO12.
    else if (relaxable(relocs, idx) && isLo12At(idx + 2))
      idx += 3; // Step past HI20, the relocation in between, and LO12.
    else
      return false; // HI20 without a matching LO12.
  }
  return true;
}

void LoongArch::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
const unsigned bits = ctx.arg.is64 ? 64 : 32;
uint64_t secAddr = sec.getOutputSection()->addr;
Expand All @@ -1231,6 +1276,7 @@ void LoongArch::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
secAddr += ehIn->getParent()->outSecOff;
bool isExtreme = false, isRelax = false;
const MutableArrayRef<Relocation> relocs = sec.relocs();
const bool isPairForGotRels = pairForGotRels(relocs);
for (size_t i = 0, size = relocs.size(); i != size; ++i) {
Relocation &rel = relocs[i];
uint8_t *loc = buf + rel.offset;
Expand Down Expand Up @@ -1315,19 +1361,21 @@ void LoongArch::relocateAlloc(InputSectionBase &sec, uint8_t *buf) const {
}
continue;
case RE_LOONGARCH_GOT_PAGE_PC:
// In LoongArch, we try GOT indirection to PC relative optimization only
// when relaxation is enabled. This approach avoids determining whether
// relocation types are paired and whether the destination register of
// pcalau12i is only used by the immediately following instruction.
// Moreover, if the original code sequence can be relaxed to a single
// instruction `pcaddi`, the first instruction will be removed and it will
// not reach here.
if (isPairRelaxable(relocs, i) && rel.type == R_LARCH_GOT_PC_HI20 &&
relocs[i + 2].type == R_LARCH_GOT_PC_LO12 &&
tryGotToPCRel(loc, rel, relocs[i + 2], secAddr)) {
i = i + 3; // skip relocations R_LARCH_RELAX, R_LARCH_GOT_PC_LO12,
// R_LARCH_RELAX
continue;
// In LoongArch, we try GOT indirection to PC relative optimization in
// normal or medium code model, whether or not with R_LARCH_RELAX
// relocation. Moreover, if the original code sequence can be relaxed to a
// single instruction `pcaddi`, the first instruction will be removed and
// it will not reach here.
if (isPairForGotRels && rel.type == R_LARCH_GOT_PC_HI20) {
bool isRelax = relaxable(relocs, i);
const Relocation lo12Rel = isRelax ? relocs[i + 2] : relocs[i + 1];
if (lo12Rel.type == R_LARCH_GOT_PC_LO12 &&
tryGotToPCRel(loc, rel, lo12Rel, secAddr)) {
// isRelax: skip relocations R_LARCH_RELAX, R_LARCH_GOT_PC_LO12
// !isRelax: skip relocation R_LARCH_GOT_PC_LO12
i += isRelax ? 2 : 1;
continue;
}
}
break;
default:
Expand Down
148 changes: 148 additions & 0 deletions lld/test/ELF/loongarch-pc-hi20-lo12-got.s
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
# REQUIRES: loongarch
# RUN: rm -rf %t && split-file %s %t && cd %t

# RUN: llvm-mc --filetype=obj --triple=loongarch64 a.s -o a.o
# RUN: llvm-mc --filetype=obj --triple=loongarch64 unpaired.s -o unpaired.o
# RUN: llvm-mc --filetype=obj --triple=loongarch64 lone-ldr.s -o lone-ldr.o

# RUN: ld.lld a.o -T within-range.t -o a
# RUN: llvm-objdump -d --no-show-raw-insn a | FileCheck %s

## This test verifies the encoding when the register $a0 is used.
# CHECK: pcalau12i $a0, 0
# CHECK-NEXT: addi.d $a0, $a0, -2048

## PCALAU12I contains a nonzero addend, no relaxations should be applied.
# CHECK-NEXT: pcalau12i $a1, 2
# CHECK-NEXT: ld.d $a1, $a1, -2048

## LD contains a nonzero addend, no relaxations should be applied.
# CHECK-NEXT: pcalau12i $a2, 2
# CHECK-NEXT: ld.d $a2, $a2, -2040

## LD writes to a different register than the one PCALAU12I sets, no relaxations should be applied.
# CHECK-NEXT: pcalau12i $a3, 2
# CHECK-NEXT: ld.d $a4, $a3, -2048

## LD's base register differs from the one PCALAU12I sets, no relaxations should be applied.
# CHECK-NEXT: pcalau12i $a5, 2
# CHECK-NEXT: ld.d $a5, $a6, -2048

# RUN: ld.lld a.o -T underflow-range.t -o a
# RUN: llvm-objdump -d --no-show-raw-insn a | FileCheck --check-prefix=OUTRANGE %s

# RUN: ld.lld a.o -T overflow-range.t -o a
# RUN: llvm-objdump -d --no-show-raw-insn a | FileCheck --check-prefix=OUTRANGE %s

# OUTRANGE: pcalau12i $a0, 1
# OUTRANGE-NEXT: ld.d $a0, $a0, 0

## Relocations do not appear in pairs, no relaxations should be applied.
# RUN: ld.lld unpaired.o -T within-range.t -o unpaired
# RUN: llvm-objdump --no-show-raw-insn -d unpaired | FileCheck --check-prefix=UNPAIRED %s

# UNPAIRED: pcalau12i $a0, 2
# UNPAIRED-NEXT: b 8
# UNPAIRED-NEXT: pcalau12i $a0, 2
# UNPAIRED: ld.d $a0, $a0, -2048

## Relocations do not appear in pairs, no relaxations should be applied.
# RUN: ld.lld lone-ldr.o -T within-range.t -o lone-ldr
# RUN: llvm-objdump --no-show-raw-insn -d lone-ldr | FileCheck --check-prefix=LONE-LDR %s

# LONE-LDR: ld.d $a0, $a0, -2048

## 32-bit code is mostly the same. We only test a few variants.
# RUN: llvm-mc --filetype=obj --triple=loongarch32 a.32.s -o a.32.o
# RUN: ld.lld a.32.o -T within-range.t -o a32
# RUN: llvm-objdump -d --no-show-raw-insn a32 | FileCheck --check-prefix=CHECK32 %s

## This test verifies the encoding when the register $a0 is used.
# CHECK32: pcalau12i $a0, 0
# CHECK32-NEXT: addi.w $a0, $a0, -2048


## This linker script ensures that .rodata and .text are sufficiently close to
## each other so that the pcalau12i + ld pair can be relaxed to pcalau12i + addi.
#--- within-range.t
SECTIONS {
.rodata 0x1800: { *(.rodata) }
.text 0x2800: { *(.text) }
.got 0x3800: { *(.got) }
}

## This linker script ensures that .rodata and .text are sufficiently far apart
## so that the pcalau12i + ld pair cannot be relaxed to pcalau12i + addi.
#--- underflow-range.t
SECTIONS {
.rodata 0x800-4: { *(.rodata) }
.got 0x80002000: { *(.got) }
.text 0x80001000: { *(.text) } /* (0x800-4)+2GB+0x800+4 */
}

#--- overflow-range.t
SECTIONS {
.text 0x1000: { *(.text) }
.got 0x2000: { *(.got) }
.rodate 0x80000800 : { *(.rodata) } /* 0x1000+2GB-0x800 */
}

## This linker script ensures that .rodata and .text are sufficiently (>4GB)
## far apart so that the pcalau12i + ld pair cannot be relaxed to pcalau12i + add.

#--- a.s
## Symbol 'x' is nonpreemptible, the optimization should be applied.
.rodata
.hidden x
x:
.word 10

.text
.global _start
_start:
pcalau12i $a0, %got_pc_hi20(x)
ld.d $a0, $a0, %got_pc_lo12(x)
pcalau12i $a1, %got_pc_hi20(x+1)
ld.d $a1, $a1, %got_pc_lo12(x)
pcalau12i $a2, %got_pc_hi20(x)
ld.d $a2, $a2, %got_pc_lo12(x+8)
pcalau12i $a3, %got_pc_hi20(x)
ld.d $a4, $a3, %got_pc_lo12(x)
pcalau12i $a5, %got_pc_hi20(x)
ld.d $a5, $a6, %got_pc_lo12(x)

#--- unpaired.s
.text
.hidden x
x:
nop
.global _start
_start:
pcalau12i $a0, %got_pc_hi20(x)
b L
pcalau12i $a0, %got_pc_hi20(x)
L:
ld.d $a0, $a0, %got_pc_lo12(x)

#--- lone-ldr.s
.text
.hidden x
x:
nop
.global _start
_start:
ld.d $a0, $a0, %got_pc_lo12(x)


#--- a.32.s
## Symbol 'x' is nonpreemptible, the optimization should be applied.
.rodata
.hidden x
x:
.word 10

.text
.global _start
_start:
pcalau12i $a0, %got_pc_hi20(x)
ld.w $a0, $a0, %got_pc_lo12(x)