
Commit be17c0d

SiFiveHolland authored and palmer-dabbelt committed
riscv: module: Optimize PLT/GOT entry counting

perf reports that 99.63% of the cycles from `modprobe amdgpu` are spent
inside module_frob_arch_sections(). This is because amdgpu.ko contains
about 300000 relocations in its .rela.text section, and the algorithm in
count_max_entries() takes quadratic time.

Apply two optimizations from the arm64 code, which together reduce the
total execution time by 99.58%. First, sort the relocations so duplicate
entries are adjacent. Second, reduce the number of relocations that must
be sorted by filtering to only relocations that need PLT/GOT entries, as
done in commit d4e0340 ("arm64/module: Optimize module load time by
optimizing PLT counting").

Unlike the arm64 code, here the filtering and sorting is done in a
scratch buffer, because the HI20 relocation search optimization in
apply_relocate_add() depends on the original order of the relocations.
This allows accumulating PLT/GOT relocations across sections, so sorting
and counting is only done once per module.

Signed-off-by: Samuel Holland <[email protected]>
Reviewed-by: Andrew Jones <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Alexandre Ghiti <[email protected]>
Signed-off-by: Palmer Dabbelt <[email protected]>
1 parent 881dadf commit be17c0d
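
To make the speedup concrete: with n ≈ 300000 relocations, the old pairwise duplicate scan performs on the order of n²/2 ≈ 4.5 × 10^10 comparisons, while sorting first costs roughly n·log2(n) ≈ 5.5 × 10^6 comparisons plus a single linear pass. The standalone userspace sketch below illustrates the sort-then-scan counting idea from this commit; the struct rela, its values, and the use of libc qsort() are illustrative stand-ins, not the kernel code itself.

/*
 * Userspace sketch of sort-then-scan duplicate counting.
 * struct rela and the sample values are illustrative stand-ins.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

struct rela {
        uint64_t r_info;
        int64_t r_addend;
};

#define cmp_3way(a, b) ((a) < (b) ? -1 : (a) > (b))

static int cmp_rela(const void *a, const void *b)
{
        const struct rela *x = a, *y = b;
        int i = cmp_3way(x->r_info, y->r_info);

        if (i == 0)
                i = cmp_3way(x->r_addend, y->r_addend);
        return i;
}

int main(void)
{
        struct rela relas[] = {
                { 19, 0 }, { 20, 4 }, { 19, 0 }, { 20, 4 }, { 21, 8 },
        };
        size_t n = sizeof(relas) / sizeof(relas[0]);
        size_t unique = 0;

        /* The O(n log n) sort makes every duplicate adjacent to its twin... */
        qsort(relas, n, sizeof(relas[0]), cmp_rela);

        /* ...so a single O(n) pass counts the unique entries. */
        for (size_t i = 0; i < n; i++)
                if (i == 0 || cmp_rela(&relas[i], &relas[i - 1]) != 0)
                        unique++;

        printf("%zu unique of %zu total\n", unique, n); /* prints: 3 unique of 5 total */
        return 0;
}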

File tree

1 file changed: +65 -16 lines

arch/riscv/kernel/module-sections.c

Lines changed: 65 additions & 16 deletions
@@ -9,6 +9,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/moduleloader.h>
+#include <linux/sort.h>
 
 unsigned long module_emit_got_entry(struct module *mod, unsigned long val)
 {
@@ -55,44 +56,70 @@ unsigned long module_emit_plt_entry(struct module *mod, unsigned long val)
 	return (unsigned long)&plt[i];
 }
 
-static int is_rela_equal(const Elf_Rela *x, const Elf_Rela *y)
+#define cmp_3way(a, b)	((a) < (b) ? -1 : (a) > (b))
+
+static int cmp_rela(const void *a, const void *b)
 {
-	return x->r_info == y->r_info && x->r_addend == y->r_addend;
+	const Elf_Rela *x = a, *y = b;
+	int i;
+
+	/* sort by type, symbol index and addend */
+	i = cmp_3way(x->r_info, y->r_info);
+	if (i == 0)
+		i = cmp_3way(x->r_addend, y->r_addend);
+	return i;
 }
 
 static bool duplicate_rela(const Elf_Rela *rela, int idx)
 {
-	int i;
-	for (i = 0; i < idx; i++) {
-		if (is_rela_equal(&rela[i], &rela[idx]))
-			return true;
-	}
-	return false;
+	/*
+	 * Entries are sorted by type, symbol index and addend. That means
+	 * that, if a duplicate entry exists, it must be in the preceding slot.
+	 */
+	return idx > 0 && cmp_rela(rela + idx, rela + idx - 1) == 0;
}
 
-static void count_max_entries(Elf_Rela *relas, int num,
+static void count_max_entries(const Elf_Rela *relas, size_t num,
 			      unsigned int *plts, unsigned int *gots)
 {
-	for (int i = 0; i < num; i++) {
+	for (size_t i = 0; i < num; i++) {
+		if (duplicate_rela(relas, i))
+			continue;
+
 		switch (ELF_R_TYPE(relas[i].r_info)) {
 		case R_RISCV_CALL_PLT:
 		case R_RISCV_PLT32:
-			if (!duplicate_rela(relas, i))
-				(*plts)++;
+			(*plts)++;
 			break;
 		case R_RISCV_GOT_HI20:
-			if (!duplicate_rela(relas, i))
-				(*gots)++;
+			(*gots)++;
 			break;
+		default:
+			unreachable();
 		}
 	}
 }
 
+static bool rela_needs_plt_got_entry(const Elf_Rela *rela)
+{
+	switch (ELF_R_TYPE(rela->r_info)) {
+	case R_RISCV_CALL_PLT:
+	case R_RISCV_GOT_HI20:
+	case R_RISCV_PLT32:
+		return true;
+	default:
+		return false;
+	}
+}
+
 int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 			      char *secstrings, struct module *mod)
 {
+	size_t num_scratch_relas = 0;
 	unsigned int num_plts = 0;
 	unsigned int num_gots = 0;
+	Elf_Rela *scratch = NULL;
+	size_t scratch_size = 0;
 	int i;
 
 	/*
@@ -122,9 +149,10 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 
 	/* Calculate the maxinum number of entries */
 	for (i = 0; i < ehdr->e_shnum; i++) {
+		size_t num_relas = sechdrs[i].sh_size / sizeof(Elf_Rela);
 		Elf_Rela *relas = (void *)ehdr + sechdrs[i].sh_offset;
-		int num_rela = sechdrs[i].sh_size / sizeof(Elf_Rela);
 		Elf_Shdr *dst_sec = sechdrs + sechdrs[i].sh_info;
+		size_t scratch_size_needed;
 
 		if (sechdrs[i].sh_type != SHT_RELA)
 			continue;
@@ -133,7 +161,28 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
 		if (!(dst_sec->sh_flags & SHF_EXECINSTR))
 			continue;
 
-		count_max_entries(relas, num_rela, &num_plts, &num_gots);
+		/*
+		 * apply_relocate_add() relies on HI20 and LO12 relocation pairs being
+		 * close together, so sort a copy of the section to avoid interfering.
+		 */
+		scratch_size_needed = (num_scratch_relas + num_relas) * sizeof(*scratch);
+		if (scratch_size_needed > scratch_size) {
+			scratch_size = scratch_size_needed;
+			scratch = kvrealloc(scratch, scratch_size, GFP_KERNEL);
+			if (!scratch)
+				return -ENOMEM;
+		}
+
+		for (size_t j = 0; j < num_relas; j++)
+			if (rela_needs_plt_got_entry(&relas[j]))
+				scratch[num_scratch_relas++] = relas[j];
+	}
+
+	if (scratch) {
+		/* sort the accumulated PLT/GOT relocations so duplicates are adjacent */
+		sort(scratch, num_scratch_relas, sizeof(*scratch), cmp_rela, NULL);
+		count_max_entries(scratch, num_scratch_relas, &num_plts, &num_gots);
+		kvfree(scratch);
 	}
 
 	mod->arch.plt.shdr->sh_type = SHT_NOBITS;
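
The loop above filters each section's relocations into one shared scratch buffer, so the sort and the count run once per module rather than once per section. Below is a minimal userspace sketch of that accumulate-then-sort pattern under stated assumptions: realloc() stands in for kvrealloc(), and needs_plt_got() is a dummy predicate, not the kernel's rela_needs_plt_got_entry() logic.

/*
 * Userspace sketch of the cross-section accumulate-then-sort pattern.
 * Types, names, and values are illustrative stand-ins.
 */
#include <stdio.h>
#include <stdlib.h>

struct rela { unsigned long r_info; long r_addend; };

/* Dummy filter: keep only "interesting" entries (nonzero type here). */
static int needs_plt_got(const struct rela *r)
{
        return r->r_info != 0;
}

/* Append one section's interesting relocations to the shared scratch buffer. */
static int accumulate(struct rela **scratch, size_t *cap, size_t *n,
                      const struct rela *relas, size_t num_relas)
{
        size_t needed = (*n + num_relas) * sizeof(**scratch);

        if (needed > *cap) {
                /* grow to the worst case for this section up front */
                struct rela *tmp = realloc(*scratch, needed);

                if (!tmp)
                        return -1; /* old buffer still valid; caller frees it */
                *scratch = tmp;
                *cap = needed;
        }
        for (size_t j = 0; j < num_relas; j++)
                if (needs_plt_got(&relas[j]))
                        (*scratch)[(*n)++] = relas[j];
        return 0;
}

int main(void)
{
        const struct rela sec1[] = { { 19, 0 }, { 0, 0 }, { 20, 4 } };
        const struct rela sec2[] = { { 19, 0 }, { 0, 8 } };
        struct rela *scratch = NULL;
        size_t cap = 0, n = 0;

        if (accumulate(&scratch, &cap, &n, sec1, 3) ||
            accumulate(&scratch, &cap, &n, sec2, 2)) {
                free(scratch);
                return 1;
        }
        /* here the kernel would sort(scratch, ...) and count once per module */
        printf("%zu candidate relocations accumulated\n", n); /* prints: 3 */
        free(scratch);
        return 0;
}

Unlike the kernel snippet, the sketch keeps the realloc() result in a temporary so the previous buffer can still be freed if allocation fails; that is a property of the sketch, not a claim about the kernel path.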
