Commit 03ec56d

visitorckw authored and akpm00 committed
lib min_heap: avoid indirect function call by providing default swap
The non-inline min heap API can result in an indirect function call to the
custom swap function. This becomes particularly costly when
CONFIG_MITIGATION_RETPOLINE is enabled, as indirect function calls are
expensive in this case.

To address this, copy the code from lib/sort.c and provide a default
builtin swap implementation that performs element swaps based on the
element size. This change allows most users to avoid the overhead of
indirect function calls, improving efficiency.

Link: https://lkml.kernel.org/r/[email protected]
Signed-off-by: Kuan-Wei Chiu <[email protected]>
Cc: Adrian Hunter <[email protected]>
Cc: Arnaldo Carvalho de Melo <[email protected]>
Cc: Ching-Chun (Jim) Huang <[email protected]>
Cc: Coly Li <[email protected]>
Cc: Ian Rogers <[email protected]>
Cc: Ingo Molnar <[email protected]>
Cc: Jiri Olsa <[email protected]>
Cc: Jonathan Corbet <[email protected]>
Cc: Kent Overstreet <[email protected]>
Cc: "Liang, Kan" <[email protected]>
Cc: Mark Rutland <[email protected]>
Cc: Matthew Sakai <[email protected]>
Cc: Matthew Wilcox (Oracle) <[email protected]>
Cc: Namhyung Kim <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Signed-off-by: Andrew Morton <[email protected]>
1 parent aa5888a commit 03ec56d
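For context, a minimal caller sketch (not part of this commit) of what the change enables: leaving the ->swp callback NULL makes the sift operations fall back to the built-in size-based swap, so no indirect call is made. It assumes the existing DEFINE_MIN_HEAP() and min_heap_push() helpers from include/linux/min_heap.h; the int_less/int_cbs names are illustrative.

/* Hypothetical caller; type and callback names are illustrative. */
DEFINE_MIN_HEAP(int, min_heap_int);

static bool int_less(const void *lhs, const void *rhs, void *args)
{
	return *(const int *)lhs < *(const int *)rhs;
}

static const struct min_heap_callbacks int_cbs = {
	.less = int_less,
	.swp  = NULL,	/* NULL selects the default built-in swap */
};

static void example_push(struct min_heap_int *heap, int val)
{
	min_heap_push(heap, &val, &int_cbs, NULL);
}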


include/linux/min_heap.h

Lines changed: 156 additions & 3 deletions
@@ -38,6 +38,147 @@ struct min_heap_callbacks {
 	void (*swp)(void *lhs, void *rhs, void *args);
 };
 
+/**
+ * is_aligned - is this pointer & size okay for word-wide copying?
+ * @base: pointer to data
+ * @size: size of each element
+ * @align: required alignment (typically 4 or 8)
+ *
+ * Returns true if elements can be copied using word loads and stores.
+ * The size must be a multiple of the alignment, and the base address must
+ * be if we do not have CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS.
+ *
+ * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
+ * to "if ((a | b) & mask)", so we do that by hand.
+ */
+__attribute_const__ __always_inline
+static bool is_aligned(const void *base, size_t size, unsigned char align)
+{
+	unsigned char lsbits = (unsigned char)size;
+
+	(void)base;
+#ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	lsbits |= (unsigned char)(uintptr_t)base;
+#endif
+	return (lsbits & (align - 1)) == 0;
+}
+
+/**
+ * swap_words_32 - swap two elements in 32-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 4)
+ *
+ * Exchange the two objects in memory. This exploits base+index addressing,
+ * which basically all CPUs have, to minimize loop overhead computations.
+ *
+ * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
+ * bottom of the loop, even though the zero flag is still valid from the
+ * subtract (since the intervening mov instructions don't alter the flags).
+ * Gcc 8.1.0 doesn't have that problem.
+ */
+static __always_inline
+void swap_words_32(void *a, void *b, size_t n)
+{
+	do {
+		u32 t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+	} while (n);
+}
+
+/**
+ * swap_words_64 - swap two elements in 64-bit chunks
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size (must be a multiple of 8)
+ *
+ * Exchange the two objects in memory. This exploits base+index
+ * addressing, which basically all CPUs have, to minimize loop overhead
+ * computations.
+ *
+ * We'd like to use 64-bit loads if possible. If they're not, emulating
+ * one requires base+index+4 addressing which x86 has but most other
+ * processors do not. If CONFIG_64BIT, we definitely have 64-bit loads,
+ * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
+ * x32 ABI). Are there any cases the kernel needs to worry about?
+ */
+static __always_inline
+void swap_words_64(void *a, void *b, size_t n)
+{
+	do {
+#ifdef CONFIG_64BIT
+		u64 t = *(u64 *)(a + (n -= 8));
+		*(u64 *)(a + n) = *(u64 *)(b + n);
+		*(u64 *)(b + n) = t;
+#else
+		/* Use two 32-bit transfers to avoid base+index+4 addressing */
+		u32 t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+
+		t = *(u32 *)(a + (n -= 4));
+		*(u32 *)(a + n) = *(u32 *)(b + n);
+		*(u32 *)(b + n) = t;
+#endif
+	} while (n);
+}
+
+/**
+ * swap_bytes - swap two elements a byte at a time
+ * @a: pointer to the first element to swap
+ * @b: pointer to the second element to swap
+ * @n: element size
+ *
+ * This is the fallback if alignment doesn't allow using larger chunks.
+ */
+static __always_inline
+void swap_bytes(void *a, void *b, size_t n)
+{
+	do {
+		char t = ((char *)a)[--n];
+		((char *)a)[n] = ((char *)b)[n];
+		((char *)b)[n] = t;
+	} while (n);
+}
+
+/*
+ * The values are arbitrary as long as they can't be confused with
+ * a pointer, but small integers make for the smallest compare
+ * instructions.
+ */
+#define SWAP_WORDS_64 ((void (*)(void *, void *, void *))0)
+#define SWAP_WORDS_32 ((void (*)(void *, void *, void *))1)
+#define SWAP_BYTES ((void (*)(void *, void *, void *))2)
+
+/*
+ * Selects the appropriate swap function based on the element size.
+ */
+static __always_inline
+void *select_swap_func(const void *base, size_t size)
+{
+	if (is_aligned(base, size, 8))
+		return SWAP_WORDS_64;
+	else if (is_aligned(base, size, 4))
+		return SWAP_WORDS_32;
+	else
+		return SWAP_BYTES;
+}
+
+static __always_inline
+void do_swap(void *a, void *b, size_t size, void (*swap_func)(void *lhs, void *rhs, void *args),
+	     void *priv)
+{
+	if (swap_func == SWAP_WORDS_64)
+		swap_words_64(a, b, size);
+	else if (swap_func == SWAP_WORDS_32)
+		swap_words_32(a, b, size);
+	else if (swap_func == SWAP_BYTES)
+		swap_bytes(a, b, size);
+	else
+		swap_func(a, b, priv);
+}
+
 /**
  * parent - given the offset of the child, find the offset of the parent.
  * @i: the offset of the heap element whose parent is sought. Non-zero.
@@ -106,11 +247,15 @@ void __min_heap_sift_down_inline(min_heap_char *heap, int pos, size_t elem_size,
 {
 	const unsigned long lsbit = elem_size & -elem_size;
 	void *data = heap->data;
+	void (*swp)(void *lhs, void *rhs, void *args) = func->swp;
 	/* pre-scale counters for performance */
 	size_t a = pos * elem_size;
 	size_t b, c, d;
 	size_t n = heap->nr * elem_size;
 
+	if (!swp)
+		swp = select_swap_func(data, elem_size);
+
 	/* Find the sift-down path all the way to the leaves. */
 	for (b = a; c = 2 * b + elem_size, (d = c + elem_size) < n;)
 		b = func->less(data + c, data + d, args) ? c : d;
@@ -127,7 +272,7 @@ void __min_heap_sift_down_inline(min_heap_char *heap, int pos, size_t elem_size,
 	c = b;
 	while (b != a) {
 		b = parent(b, lsbit, elem_size);
-		func->swp(data + b, data + c, args);
+		do_swap(data + b, data + c, elem_size, swp, args);
 	}
 }
 
@@ -142,14 +287,18 @@ void __min_heap_sift_up_inline(min_heap_char *heap, size_t elem_size, size_t idx
 {
 	const unsigned long lsbit = elem_size & -elem_size;
 	void *data = heap->data;
+	void (*swp)(void *lhs, void *rhs, void *args) = func->swp;
 	/* pre-scale counters for performance */
 	size_t a = idx * elem_size, b;
 
+	if (!swp)
+		swp = select_swap_func(data, elem_size);
+
 	while (a) {
 		b = parent(a, lsbit, elem_size);
 		if (func->less(data + b, data + a, args))
 			break;
-		func->swp(data + a, data + b, args);
+		do_swap(data + a, data + b, elem_size, swp, args);
 		a = b;
 	}
 }
@@ -242,15 +391,19 @@ bool __min_heap_del_inline(min_heap_char *heap, size_t elem_size, size_t idx,
 		const struct min_heap_callbacks *func, void *args)
 {
 	void *data = heap->data;
+	void (*swp)(void *lhs, void *rhs, void *args) = func->swp;
 
 	if (WARN_ONCE(heap->nr <= 0, "Popping an empty heap"))
 		return false;
 
+	if (!swp)
+		swp = select_swap_func(data, elem_size);
+
 	/* Place last element at the root (position 0) and then sift down. */
 	heap->nr--;
 	if (idx == heap->nr)
 		return true;
-	func->swp(data + (idx * elem_size), data + (heap->nr * elem_size), args);
+	do_swap(data + (idx * elem_size), data + (heap->nr * elem_size), elem_size, swp, args);
 	__min_heap_sift_up_inline(heap, elem_size, idx, func, args);
 	__min_heap_sift_down_inline(heap, idx, elem_size, func, args);
 
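To see why the sentinel values work, the sketch below re-implements the selection logic in plain userspace C. It mirrors the kernel's is_aligned()/select_swap_func() but is only an illustration, not kernel code: the element size, and on strict-alignment targets the base address, are folded into one mask test, and the result picks 64-bit, 32-bit, or byte-wise swapping.

#include <stdint.h>
#include <stdio.h>

/* Userspace mirror of is_aligned(): the strict variant that also checks
 * the base address, as when CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is
 * not set. */
static int is_aligned(const void *base, size_t size, unsigned char align)
{
	unsigned char lsbits = (unsigned char)size;

	lsbits |= (unsigned char)(uintptr_t)base;
	return (lsbits & (align - 1)) == 0;
}

/* Mirrors select_swap_func(), returning names instead of sentinels. */
static const char *select_swap_name(const void *base, size_t size)
{
	if (is_aligned(base, size, 8))
		return "swap_words_64";
	else if (is_aligned(base, size, 4))
		return "swap_words_32";
	return "swap_bytes";
}

int main(void)
{
	uint64_t u64s[4];	/* 8-byte elements, 8-byte aligned on typical 64-bit ABIs */
	uint32_t u32s[4];	/* 4-byte elements: size alone rules out 64-bit swaps */
	char bytes[6];

	printf("%s\n", select_swap_name(u64s, sizeof(*u64s)));	/* swap_words_64 */
	printf("%s\n", select_swap_name(u32s, sizeof(*u32s)));	/* swap_words_32 */
	printf("%s\n", select_swap_name(bytes, 3));		/* swap_bytes */
	return 0;
}

In the real code the sentinels 0/1/2 make do_swap()'s comparisons compile to immediate compares and direct branches, so only a genuinely custom swap callback ever pays the retpoline-era cost of an indirect call.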