Skip to content

Commit 5780e39

Browse files
committed
Merge tag 'x86-asm-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 assembly code updates from Ingo Molnar: - Micro-optimize the x86 bitops code - Define target-specific {raw,this}_cpu_try_cmpxchg{64,128}() to improve code generation - Define and use raw_cpu_try_cmpxchg() in preempt_count_set() - Do not clobber %rsi in percpu_{try_,}cmpxchg{64,128}_op - Remove the unused __sw_hweight64() implementation on x86-32 - Misc fixes and cleanups * tag 'x86-asm-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: x86/lib: Address kernel-doc warnings x86/entry: Fix typos in comments x86/entry: Remove unused argument %rsi passed to exc_nmi() x86/bitops: Remove unused __sw_hweight64() assembly implementation on x86-32 x86/percpu: Do not clobber %rsi in percpu_{try_,}cmpxchg{64,128}_op x86/percpu: Use raw_cpu_try_cmpxchg() in preempt_count_set() x86/percpu: Define raw_cpu_try_cmpxchg and this_cpu_try_cmpxchg() x86/percpu: Define {raw,this}_cpu_try_cmpxchg{64,128} x86/asm/bitops: Use __builtin_clz{l|ll} to evaluate constant expressions
2 parents 2b95bb0 + 8ae292c commit 5780e39

File tree

6 files changed

+125
-33
lines changed

6 files changed

+125
-33
lines changed

arch/x86/entry/entry_64.S

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1163,8 +1163,8 @@ SYM_CODE_START(asm_exc_nmi)
11631163
* anyway.
11641164
*
11651165
* To handle this case we do the following:
1166-
* Check the a special location on the stack that contains
1167-
* a variable that is set when NMIs are executing.
1166+
* Check a special location on the stack that contains a
1167+
* variable that is set when NMIs are executing.
11681168
* The interrupted task's stack is also checked to see if it
11691169
* is an NMI stack.
11701170
* If the variable is not set and the stack is not the NMI
@@ -1237,7 +1237,6 @@ SYM_CODE_START(asm_exc_nmi)
12371237
*/
12381238

12391239
movq %rsp, %rdi
1240-
movq $-1, %rsi
12411240
call exc_nmi
12421241

12431242
/*
@@ -1295,8 +1294,8 @@ SYM_CODE_START(asm_exc_nmi)
12951294
* end_repeat_nmi, then we are a nested NMI. We must not
12961295
* modify the "iret" frame because it's being written by
12971296
* the outer NMI. That's okay; the outer NMI handler is
1298-
* about to about to call exc_nmi() anyway, so we can just
1299-
* resume the outer NMI.
1297+
* about to call exc_nmi() anyway, so we can just resume
1298+
* the outer NMI.
13001299
*/
13011300

13021301
movq $repeat_nmi, %rdx
@@ -1451,7 +1450,6 @@ end_repeat_nmi:
14511450
UNWIND_HINT_REGS
14521451

14531452
movq %rsp, %rdi
1454-
movq $-1, %rsi
14551453
call exc_nmi
14561454

14571455
/* Always restore stashed SPEC_CTRL value (see paranoid_entry) */

arch/x86/include/asm/bitops.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,9 @@ static __always_inline unsigned long variable_ffz(unsigned long word)
293293
*/
294294
static __always_inline unsigned long __fls(unsigned long word)
295295
{
296+
if (__builtin_constant_p(word))
297+
return BITS_PER_LONG - 1 - __builtin_clzl(word);
298+
296299
asm("bsr %1,%0"
297300
: "=r" (word)
298301
: "rm" (word));
@@ -360,6 +363,9 @@ static __always_inline int fls(unsigned int x)
360363
{
361364
int r;
362365

366+
if (__builtin_constant_p(x))
367+
return x ? 32 - __builtin_clz(x) : 0;
368+
363369
#ifdef CONFIG_X86_64
364370
/*
365371
* AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the
@@ -401,6 +407,9 @@ static __always_inline int fls(unsigned int x)
401407
static __always_inline int fls64(__u64 x)
402408
{
403409
int bitpos = -1;
410+
411+
if (__builtin_constant_p(x))
412+
return x ? 64 - __builtin_clzll(x) : 0;
404413
/*
405414
* AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the
406415
* dest reg is undefined if x==0, but their CPU architect says its

arch/x86/include/asm/percpu.h

Lines changed: 104 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -210,6 +210,25 @@ do { \
210210
(typeof(_var))(unsigned long) pco_old__; \
211211
})
212212

213+
#define percpu_try_cmpxchg_op(size, qual, _var, _ovalp, _nval) \
214+
({ \
215+
bool success; \
216+
__pcpu_type_##size *pco_oval__ = (__pcpu_type_##size *)(_ovalp); \
217+
__pcpu_type_##size pco_old__ = *pco_oval__; \
218+
__pcpu_type_##size pco_new__ = __pcpu_cast_##size(_nval); \
219+
asm qual (__pcpu_op2_##size("cmpxchg", "%[nval]", \
220+
__percpu_arg([var])) \
221+
CC_SET(z) \
222+
: CC_OUT(z) (success), \
223+
[oval] "+a" (pco_old__), \
224+
[var] "+m" (_var) \
225+
: [nval] __pcpu_reg_##size(, pco_new__) \
226+
: "memory"); \
227+
if (unlikely(!success)) \
228+
*pco_oval__ = pco_old__; \
229+
likely(success); \
230+
})
231+
213232
#if defined(CONFIG_X86_32) && !defined(CONFIG_UML)
214233
#define percpu_cmpxchg64_op(size, qual, _var, _oval, _nval) \
215234
({ \
@@ -223,26 +242,63 @@ do { \
223242
old__.var = _oval; \
224243
new__.var = _nval; \
225244
\
226-
asm qual (ALTERNATIVE("leal %P[var], %%esi; call this_cpu_cmpxchg8b_emu", \
245+
asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \
227246
"cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \
228247
: [var] "+m" (_var), \
229248
"+a" (old__.low), \
230249
"+d" (old__.high) \
231250
: "b" (new__.low), \
232-
"c" (new__.high) \
233-
: "memory", "esi"); \
251+
"c" (new__.high), \
252+
"S" (&(_var)) \
253+
: "memory"); \
234254
\
235255
old__.var; \
236256
})
237257

238258
#define raw_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg64_op(8, , pcp, oval, nval)
239259
#define this_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg64_op(8, volatile, pcp, oval, nval)
260+
261+
#define percpu_try_cmpxchg64_op(size, qual, _var, _ovalp, _nval) \
262+
({ \
263+
bool success; \
264+
u64 *_oval = (u64 *)(_ovalp); \
265+
union { \
266+
u64 var; \
267+
struct { \
268+
u32 low, high; \
269+
}; \
270+
} old__, new__; \
271+
\
272+
old__.var = *_oval; \
273+
new__.var = _nval; \
274+
\
275+
asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \
276+
"cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \
277+
CC_SET(z) \
278+
: CC_OUT(z) (success), \
279+
[var] "+m" (_var), \
280+
"+a" (old__.low), \
281+
"+d" (old__.high) \
282+
: "b" (new__.low), \
283+
"c" (new__.high), \
284+
"S" (&(_var)) \
285+
: "memory"); \
286+
if (unlikely(!success)) \
287+
*_oval = old__.var; \
288+
likely(success); \
289+
})
290+
291+
#define raw_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg64_op(8, , pcp, ovalp, nval)
292+
#define this_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg64_op(8, volatile, pcp, ovalp, nval)
240293
#endif
241294

242295
#ifdef CONFIG_X86_64
243296
#define raw_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval);
244297
#define this_cpu_cmpxchg64(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval);
245298

299+
#define raw_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, , pcp, ovalp, nval);
300+
#define this_cpu_try_cmpxchg64(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval);
301+
246302
#define percpu_cmpxchg128_op(size, qual, _var, _oval, _nval) \
247303
({ \
248304
union { \
@@ -255,20 +311,54 @@ do { \
255311
old__.var = _oval; \
256312
new__.var = _nval; \
257313
\
258-
asm qual (ALTERNATIVE("leaq %P[var], %%rsi; call this_cpu_cmpxchg16b_emu", \
314+
asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \
259315
"cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \
260316
: [var] "+m" (_var), \
261317
"+a" (old__.low), \
262318
"+d" (old__.high) \
263319
: "b" (new__.low), \
264-
"c" (new__.high) \
265-
: "memory", "rsi"); \
320+
"c" (new__.high), \
321+
"S" (&(_var)) \
322+
: "memory"); \
266323
\
267324
old__.var; \
268325
})
269326

270327
#define raw_cpu_cmpxchg128(pcp, oval, nval) percpu_cmpxchg128_op(16, , pcp, oval, nval)
271328
#define this_cpu_cmpxchg128(pcp, oval, nval) percpu_cmpxchg128_op(16, volatile, pcp, oval, nval)
329+
330+
#define percpu_try_cmpxchg128_op(size, qual, _var, _ovalp, _nval) \
331+
({ \
332+
bool success; \
333+
u128 *_oval = (u128 *)(_ovalp); \
334+
union { \
335+
u128 var; \
336+
struct { \
337+
u64 low, high; \
338+
}; \
339+
} old__, new__; \
340+
\
341+
old__.var = *_oval; \
342+
new__.var = _nval; \
343+
\
344+
asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \
345+
"cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \
346+
CC_SET(z) \
347+
: CC_OUT(z) (success), \
348+
[var] "+m" (_var), \
349+
"+a" (old__.low), \
350+
"+d" (old__.high) \
351+
: "b" (new__.low), \
352+
"c" (new__.high), \
353+
"S" (&(_var)) \
354+
: "memory"); \
355+
if (unlikely(!success)) \
356+
*_oval = old__.var; \
357+
likely(success); \
358+
})
359+
360+
#define raw_cpu_try_cmpxchg128(pcp, ovalp, nval) percpu_try_cmpxchg128_op(16, , pcp, ovalp, nval)
361+
#define this_cpu_try_cmpxchg128(pcp, ovalp, nval) percpu_try_cmpxchg128_op(16, volatile, pcp, ovalp, nval)
272362
#endif
273363

274364
/*
@@ -343,13 +433,19 @@ do { \
343433
#define raw_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, , pcp, oval, nval)
344434
#define raw_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, , pcp, oval, nval)
345435
#define raw_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, , pcp, oval, nval)
436+
#define raw_cpu_try_cmpxchg_1(pcp, ovalp, nval) percpu_try_cmpxchg_op(1, , pcp, ovalp, nval)
437+
#define raw_cpu_try_cmpxchg_2(pcp, ovalp, nval) percpu_try_cmpxchg_op(2, , pcp, ovalp, nval)
438+
#define raw_cpu_try_cmpxchg_4(pcp, ovalp, nval) percpu_try_cmpxchg_op(4, , pcp, ovalp, nval)
346439

347440
#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(1, volatile, pcp, val)
348441
#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(2, volatile, pcp, val)
349442
#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(4, volatile, pcp, val)
350443
#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(1, volatile, pcp, oval, nval)
351444
#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(2, volatile, pcp, oval, nval)
352445
#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(4, volatile, pcp, oval, nval)
446+
#define this_cpu_try_cmpxchg_1(pcp, ovalp, nval) percpu_try_cmpxchg_op(1, volatile, pcp, ovalp, nval)
447+
#define this_cpu_try_cmpxchg_2(pcp, ovalp, nval) percpu_try_cmpxchg_op(2, volatile, pcp, ovalp, nval)
448+
#define this_cpu_try_cmpxchg_4(pcp, ovalp, nval) percpu_try_cmpxchg_op(4, volatile, pcp, ovalp, nval)
353449

354450
/*
355451
* Per cpu atomic 64 bit operations are only available under 64 bit.
@@ -364,6 +460,7 @@ do { \
364460
#define raw_cpu_add_return_8(pcp, val) percpu_add_return_op(8, , pcp, val)
365461
#define raw_cpu_xchg_8(pcp, nval) raw_percpu_xchg_op(pcp, nval)
366462
#define raw_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, , pcp, oval, nval)
463+
#define raw_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, , pcp, ovalp, nval)
367464

368465
#define this_cpu_read_8(pcp) percpu_from_op(8, volatile, "mov", pcp)
369466
#define this_cpu_write_8(pcp, val) percpu_to_op(8, volatile, "mov", (pcp), val)
@@ -373,6 +470,7 @@ do { \
373470
#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(8, volatile, pcp, val)
374471
#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(8, volatile, pcp, nval)
375472
#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(8, volatile, pcp, oval, nval)
473+
#define this_cpu_try_cmpxchg_8(pcp, ovalp, nval) percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval)
376474
#endif
377475

378476
static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr,

arch/x86/include/asm/preempt.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,11 @@ static __always_inline void preempt_count_set(int pc)
3131
{
3232
int old, new;
3333

34+
old = raw_cpu_read_4(pcpu_hot.preempt_count);
3435
do {
35-
old = raw_cpu_read_4(pcpu_hot.preempt_count);
3636
new = (old & PREEMPT_NEED_RESCHED) |
3737
(pc & ~PREEMPT_NEED_RESCHED);
38-
} while (raw_cpu_cmpxchg_4(pcpu_hot.preempt_count, old, new) != old);
38+
} while (!raw_cpu_try_cmpxchg_4(pcpu_hot.preempt_count, &old, new));
3939
}
4040

4141
/*

arch/x86/lib/csum-wrappers_64.c

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,6 @@
1414
* @src: source address (user space)
1515
* @dst: destination address
1616
* @len: number of bytes to be copied.
17-
* @isum: initial sum that is added into the result (32bit unfolded)
18-
* @errp: set to -EFAULT for an bad source address.
1917
*
2018
* Returns an 32bit unfolded checksum of the buffer.
2119
* src and dst are best aligned to 64bits.
@@ -38,8 +36,6 @@ csum_and_copy_from_user(const void __user *src, void *dst, int len)
3836
* @src: source address
3937
* @dst: destination address (user space)
4038
* @len: number of bytes to be copied.
41-
* @isum: initial sum that is added into the result (32bit unfolded)
42-
* @errp: set to -EFAULT for an bad destination address.
4339
*
4440
* Returns an 32bit unfolded checksum of the buffer.
4541
* src and dst are best aligned to 64bits.
@@ -62,7 +58,6 @@ csum_and_copy_to_user(const void *src, void __user *dst, int len)
6258
* @src: source address
6359
* @dst: destination address
6460
* @len: number of bytes to be copied.
65-
* @sum: initial sum that is added into the result (32bit unfolded)
6661
*
6762
* Returns an 32bit unfolded checksum of the buffer.
6863
*/

arch/x86/lib/hweight.S

Lines changed: 6 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,12 @@ SYM_FUNC_START(__sw_hweight32)
3636
SYM_FUNC_END(__sw_hweight32)
3737
EXPORT_SYMBOL(__sw_hweight32)
3838

39-
SYM_FUNC_START(__sw_hweight64)
39+
/*
40+
* No 32-bit variant, because it's implemented as an inline wrapper
41+
* on top of __arch_hweight32():
42+
*/
4043
#ifdef CONFIG_X86_64
44+
SYM_FUNC_START(__sw_hweight64)
4145
pushq %rdi
4246
pushq %rdx
4347

@@ -66,18 +70,6 @@ SYM_FUNC_START(__sw_hweight64)
6670
popq %rdx
6771
popq %rdi
6872
RET
69-
#else /* CONFIG_X86_32 */
70-
/* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */
71-
pushl %ecx
72-
73-
call __sw_hweight32
74-
movl %eax, %ecx # stash away result
75-
movl %edx, %eax # second part of input
76-
call __sw_hweight32
77-
addl %ecx, %eax # result
78-
79-
popl %ecx
80-
RET
81-
#endif
8273
SYM_FUNC_END(__sw_hweight64)
8374
EXPORT_SYMBOL(__sw_hweight64)
75+
#endif

0 commit comments

Comments (0)