
Commit e76f69b

Merge tag 'x86-percpu-2024-05-13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 percpu updates from Ingo Molnar:

 - Expand the named address spaces optimizations down to GCC 9.1+.

 - Re-enable named address spaces with sanitizers for GCC 13.3+

 - Generate better this_percpu_xchg_op() code

 - Introduce raw_cpu_read_long() to reduce ifdeffery

 - Simplify the x86_this_cpu_test_bit() et al macros

 - Address Sparse warnings

 - Misc cleanups & fixes

* tag 'x86-percpu-2024-05-13' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/percpu: Introduce raw_cpu_read_long() to reduce ifdeffery
  x86/percpu: Rewrite x86_this_cpu_test_bit() and friends as macros
  x86/percpu: Fix x86_this_cpu_variable_test_bit() asm template
  x86/percpu: Re-enable named address spaces with sanitizers for GCC 13.3+
  x86/percpu: Use __force to cast from __percpu address space
  x86/percpu: Do not use this_cpu_read_stable_8() for 32-bit targets
  x86/percpu: Unify arch_raw_cpu_ptr() defines
  x86/percpu: Enable named address spaces for GCC 9.1+
  x86/percpu: Re-enable named address spaces with KASAN for GCC 13.3+
  x86/percpu: Move raw_percpu_xchg_op() to a better place
  x86/percpu: Convert this_percpu_xchg_op() from asm() to C code, to generate better code
2 parents eabb629 + 93cfa54 commit e76f69b

File tree: 4 files changed (+78, -99 lines)

arch/um/include/asm/cpufeature.h

Lines changed: 1 addition & 2 deletions
@@ -38,8 +38,7 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
 
 #define this_cpu_has(bit)						\
 	(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :	\
-	 x86_this_cpu_test_bit(bit,					\
-			       (unsigned long __percpu *)&cpu_info.x86_capability))
+	 x86_this_cpu_test_bit(bit, cpu_info.x86_capability))
 
 /*
  * This macro is for detection of features which need kernel

arch/x86/Kconfig

Lines changed: 8 additions & 6 deletions
@@ -2418,18 +2418,20 @@ source "kernel/livepatch/Kconfig"
 endmenu
 
 config CC_HAS_NAMED_AS
-	def_bool CC_IS_GCC && GCC_VERSION >= 120100
+	def_bool CC_IS_GCC && GCC_VERSION >= 90100
+
+config CC_HAS_NAMED_AS_FIXED_SANITIZERS
+	def_bool CC_IS_GCC && GCC_VERSION >= 130300
 
 config USE_X86_SEG_SUPPORT
 	def_bool y
 	depends on CC_HAS_NAMED_AS
 	#
-	# -fsanitize=kernel-address (KASAN) is at the moment incompatible
-	# with named address spaces - see GCC PR sanitizer/111736.
+	# -fsanitize=kernel-address (KASAN) and -fsanitize=thread
+	# (KCSAN) are incompatible with named address spaces with
+	# GCC < 13.3 - see GCC PR sanitizer/111736.
 	#
-	depends on !KASAN
-	# -fsanitize=thread (KCSAN) is also incompatible.
-	depends on !KCSAN
+	depends on !(KASAN || KCSAN) || CC_HAS_NAMED_AS_FIXED_SANITIZERS
 
 config CC_HAS_SLS
 	def_bool $(cc-option,-mharden-sls=all)
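
Editor's note, not part of the diff: CC_HAS_NAMED_AS gates GCC's x86 named address spaces, which USE_X86_SEG_SUPPORT relies on so per-CPU accesses can go directly through the %gs (or %fs) segment. A minimal sketch of the compiler feature, assuming an x86-64 GCC that accepts the __seg_gs qualifier (illustrative user-space C, not kernel code):

	/* Dereferencing a __seg_gs-qualified pointer is emitted as a
	 * %gs-relative access instead of address arithmetic through a
	 * temporary register. */
	unsigned long read_gs_relative(unsigned long __seg_gs *p)
	{
		return *p;	/* roughly: mov %gs:(%rdi), %rax */
	}

With GCC older than 13.3, KASAN/KCSAN instrumentation of such accesses was broken (GCC PR sanitizer/111736), which is what the new CC_HAS_NAMED_AS_FIXED_SANITIZERS symbol encodes.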

arch/x86/include/asm/cpufeature.h

Lines changed: 1 addition & 2 deletions
@@ -129,8 +129,7 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
 
 #define this_cpu_has(bit)						\
 	(__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 :	\
-	 x86_this_cpu_test_bit(bit,					\
-			       (unsigned long __percpu *)&cpu_info.x86_capability))
+	 x86_this_cpu_test_bit(bit, cpu_info.x86_capability))
 
 /*
  * This macro is for detection of features which need kernel
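
Editor's note, not part of the diff: callers of this_cpu_has() are unchanged by this cleanup; the __percpu cast simply moves into x86_this_cpu_test_bit() itself. A minimal usage sketch, where has_sse2() is a hypothetical helper (X86_FEATURE_XMM2 is a real feature bit):

	/* hypothetical helper, for illustration only */
	static bool has_sse2(void)
	{
		/* compile-time-constant bit: either folds via
		 * REQUIRED_MASK_BIT_SET() or tests this CPU's copy of
		 * cpu_info.x86_capability[] */
		return this_cpu_has(X86_FEATURE_XMM2);
	}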

arch/x86/include/asm/percpu.h

Lines changed: 68 additions & 89 deletions
@@ -59,36 +59,24 @@
 #define __force_percpu_prefix	"%%"__stringify(__percpu_seg)":"
 #define __my_cpu_offset		this_cpu_read(this_cpu_off)
 
-#ifdef CONFIG_USE_X86_SEG_SUPPORT
-/*
- * Efficient implementation for cases in which the compiler supports
- * named address spaces. Allows the compiler to perform additional
- * optimizations that can save more instructions.
- */
-#define arch_raw_cpu_ptr(ptr)						\
-({									\
-	unsigned long tcp_ptr__;					\
-	tcp_ptr__ = __raw_cpu_read(, this_cpu_off);			\
-									\
-	tcp_ptr__ += (unsigned long)(ptr);				\
-	(typeof(*(ptr)) __kernel __force *)tcp_ptr__;			\
-})
-#else /* CONFIG_USE_X86_SEG_SUPPORT */
 /*
  * Compared to the generic __my_cpu_offset version, the following
  * saves one instruction and avoids clobbering a temp register.
+ *
+ * arch_raw_cpu_ptr should not be used in 32-bit VDSO for a 64-bit
+ * kernel, because games are played with CONFIG_X86_64 there and
+ * sizeof(this_cpu_off) becames 4.
  */
-#define arch_raw_cpu_ptr(ptr)						\
+#ifndef BUILD_VDSO32_64
+#define arch_raw_cpu_ptr(_ptr)						\
 ({									\
-	unsigned long tcp_ptr__;					\
-	asm ("mov " __percpu_arg(1) ", %0"				\
-	     : "=r" (tcp_ptr__)						\
-	     : "m" (__my_cpu_var(this_cpu_off)));			\
-									\
-	tcp_ptr__ += (unsigned long)(ptr);				\
-	(typeof(*(ptr)) __kernel __force *)tcp_ptr__;			\
+	unsigned long tcp_ptr__ = raw_cpu_read_long(this_cpu_off);	\
+	tcp_ptr__ += (__force unsigned long)(_ptr);			\
+	(typeof(*(_ptr)) __kernel __force *)tcp_ptr__;			\
 })
-#endif /* CONFIG_USE_X86_SEG_SUPPORT */
+#else
+#define arch_raw_cpu_ptr(_ptr)	({ BUILD_BUG(); (typeof(_ptr))0; })
+#endif
 
 #define PER_CPU_VAR(var)	%__percpu_seg:(var)__percpu_rel
 
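Editor's note, not part of the diff: arch_raw_cpu_ptr() is what raw_cpu_ptr() and this_cpu_ptr() ultimately expand to on x86, so the unified definition above is exercised by ordinary per-CPU pointer users. A minimal sketch with a hypothetical per-CPU variable (assumes the caller already disabled preemption):

	#include <linux/percpu.h>

	static DEFINE_PER_CPU(unsigned long, demo_hits);	/* hypothetical */

	static void count_hit(void)
	{
		unsigned long *p = this_cpu_ptr(&demo_hits);

		(*p)++;	/* pointer computed once from raw_cpu_read_long(this_cpu_off) */
	}
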
@@ -102,8 +90,8 @@
 #endif /* CONFIG_SMP */
 
 #define __my_cpu_type(var)	typeof(var) __percpu_seg_override
-#define __my_cpu_ptr(ptr)	(__my_cpu_type(*ptr) *)(uintptr_t)(ptr)
-#define __my_cpu_var(var)	(*__my_cpu_ptr(&var))
+#define __my_cpu_ptr(ptr)	(__my_cpu_type(*(ptr))*)(__force uintptr_t)(ptr)
+#define __my_cpu_var(var)	(*__my_cpu_ptr(&(var)))
 #define __percpu_arg(x)		__percpu_prefix "%" #x
 #define __force_percpu_arg(x)	__force_percpu_prefix "%" #x
 
@@ -230,25 +218,26 @@ do { \
 })
 
 /*
- * xchg is implemented using cmpxchg without a lock prefix. xchg is
- * expensive due to the implied lock prefix. The processor cannot prefetch
- * cachelines if xchg is used.
+ * raw_cpu_xchg() can use a load-store since
+ * it is not required to be IRQ-safe.
  */
-#define percpu_xchg_op(size, qual, _var, _nval)				\
+#define raw_percpu_xchg_op(_var, _nval)					\
 ({									\
-	__pcpu_type_##size pxo_old__;					\
-	__pcpu_type_##size pxo_new__ = __pcpu_cast_##size(_nval);	\
-	asm qual (__pcpu_op2_##size("mov", __percpu_arg([var]),	\
-				    "%[oval]")				\
-		  "\n1:\t"						\
-		  __pcpu_op2_##size("cmpxchg", "%[nval]",		\
-				    __percpu_arg([var]))		\
-		  "\n\tjnz 1b"						\
-		  : [oval] "=&a" (pxo_old__),				\
-		    [var] "+m" (__my_cpu_var(_var))			\
-		  : [nval] __pcpu_reg_##size(, pxo_new__)		\
-		  : "memory");						\
-	(typeof(_var))(unsigned long) pxo_old__;			\
+	typeof(_var) pxo_old__ = raw_cpu_read(_var);			\
+	raw_cpu_write(_var, _nval);					\
+	pxo_old__;							\
+})
+
+/*
+ * this_cpu_xchg() is implemented using cmpxchg without a lock prefix.
+ * xchg is expensive due to the implied lock prefix. The processor
+ * cannot prefetch cachelines if xchg is used.
+ */
+#define this_percpu_xchg_op(_var, _nval)				\
+({									\
+	typeof(_var) pxo_old__ = this_cpu_read(_var);			\
+	do { } while (!this_cpu_try_cmpxchg(_var, &pxo_old__, _nval));	\
+	pxo_old__;							\
 })
 
 /*
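
Editor's note, not part of the diff: after the conversion above, a this_cpu_xchg() call compiles to a plain read followed by a cmpxchg retry loop, so no LOCK prefix is emitted while the operation stays safe against interrupts on the local CPU. A minimal sketch, with demo_flag and fetch_and_clear() as hypothetical names:

	static DEFINE_PER_CPU(int, demo_flag);	/* hypothetical */

	static int fetch_and_clear(void)
	{
		/* returns the previous value and stores 0 */
		return this_cpu_xchg(demo_flag, 0);
	}
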
@@ -428,10 +417,6 @@ do { \
  * actually per-thread variables implemented as per-CPU variables and
  * thus stable for the duration of the respective task.
  */
-#define this_cpu_read_stable_1(pcp)	percpu_stable_op(1, "mov", pcp)
-#define this_cpu_read_stable_2(pcp)	percpu_stable_op(2, "mov", pcp)
-#define this_cpu_read_stable_4(pcp)	percpu_stable_op(4, "mov", pcp)
-#define this_cpu_read_stable_8(pcp)	percpu_stable_op(8, "mov", pcp)
 #define this_cpu_read_stable(pcp)	__pcpu_size_call_return(this_cpu_read_stable_, pcp)
 
 #ifdef CONFIG_USE_X86_SEG_SUPPORT
@@ -500,6 +485,10 @@ do { \
 #define this_cpu_read_const(pcp)	({ BUILD_BUG(); (typeof(pcp))0; })
 #endif /* CONFIG_USE_X86_SEG_SUPPORT */
 
+#define this_cpu_read_stable_1(pcp)	percpu_stable_op(1, "mov", pcp)
+#define this_cpu_read_stable_2(pcp)	percpu_stable_op(2, "mov", pcp)
+#define this_cpu_read_stable_4(pcp)	percpu_stable_op(4, "mov", pcp)
+
 #define raw_cpu_add_1(pcp, val)		percpu_add_op(1, , (pcp), val)
 #define raw_cpu_add_2(pcp, val)		percpu_add_op(2, , (pcp), val)
 #define raw_cpu_add_4(pcp, val)		percpu_add_op(4, , (pcp), val)
@@ -509,18 +498,6 @@ do { \
 #define raw_cpu_or_1(pcp, val)		percpu_to_op(1, , "or", (pcp), val)
 #define raw_cpu_or_2(pcp, val)		percpu_to_op(2, , "or", (pcp), val)
 #define raw_cpu_or_4(pcp, val)		percpu_to_op(4, , "or", (pcp), val)
-
-/*
- * raw_cpu_xchg() can use a load-store since it is not required to be
- * IRQ-safe.
- */
-#define raw_percpu_xchg_op(var, nval)					\
-({									\
-	typeof(var) pxo_ret__ = raw_cpu_read(var);			\
-	raw_cpu_write(var, (nval));					\
-	pxo_ret__;							\
-})
-
 #define raw_cpu_xchg_1(pcp, val)	raw_percpu_xchg_op(pcp, val)
 #define raw_cpu_xchg_2(pcp, val)	raw_percpu_xchg_op(pcp, val)
 #define raw_cpu_xchg_4(pcp, val)	raw_percpu_xchg_op(pcp, val)
@@ -534,9 +511,9 @@ do { \
 #define this_cpu_or_1(pcp, val)		percpu_to_op(1, volatile, "or", (pcp), val)
 #define this_cpu_or_2(pcp, val)		percpu_to_op(2, volatile, "or", (pcp), val)
 #define this_cpu_or_4(pcp, val)		percpu_to_op(4, volatile, "or", (pcp), val)
-#define this_cpu_xchg_1(pcp, nval)	percpu_xchg_op(1, volatile, pcp, nval)
-#define this_cpu_xchg_2(pcp, nval)	percpu_xchg_op(2, volatile, pcp, nval)
-#define this_cpu_xchg_4(pcp, nval)	percpu_xchg_op(4, volatile, pcp, nval)
+#define this_cpu_xchg_1(pcp, nval)	this_percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_2(pcp, nval)	this_percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_4(pcp, nval)	this_percpu_xchg_op(pcp, nval)
 
 #define raw_cpu_add_return_1(pcp, val)	percpu_add_return_op(1, , pcp, val)
 #define raw_cpu_add_return_2(pcp, val)	percpu_add_return_op(2, , pcp, val)
@@ -563,6 +540,8 @@ do { \
  * 32 bit must fall back to generic operations.
  */
 #ifdef CONFIG_X86_64
+#define this_cpu_read_stable_8(pcp)	percpu_stable_op(8, "mov", pcp)
+
 #define raw_cpu_add_8(pcp, val)		percpu_add_op(8, , (pcp), val)
 #define raw_cpu_and_8(pcp, val)		percpu_to_op(8, , "and", (pcp), val)
 #define raw_cpu_or_8(pcp, val)		percpu_to_op(8, , "or", (pcp), val)
@@ -575,41 +554,41 @@ do { \
 #define this_cpu_and_8(pcp, val)	percpu_to_op(8, volatile, "and", (pcp), val)
 #define this_cpu_or_8(pcp, val)		percpu_to_op(8, volatile, "or", (pcp), val)
 #define this_cpu_add_return_8(pcp, val)	percpu_add_return_op(8, volatile, pcp, val)
-#define this_cpu_xchg_8(pcp, nval)	percpu_xchg_op(8, volatile, pcp, nval)
+#define this_cpu_xchg_8(pcp, nval)	this_percpu_xchg_op(pcp, nval)
 #define this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(8, volatile, pcp, oval, nval)
 #define this_cpu_try_cmpxchg_8(pcp, ovalp, nval)	percpu_try_cmpxchg_op(8, volatile, pcp, ovalp, nval)
-#endif
-
-static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr,
-			const unsigned long __percpu *addr)
-{
-	unsigned long __percpu *a =
-		(unsigned long __percpu *)addr + nr / BITS_PER_LONG;
 
-#ifdef CONFIG_X86_64
-	return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_8(*a)) != 0;
+#define raw_cpu_read_long(pcp)		raw_cpu_read_8(pcp)
 #else
-	return ((1UL << (nr % BITS_PER_LONG)) & raw_cpu_read_4(*a)) != 0;
-#endif
-}
+/* There is no generic 64 bit read stable operation for 32 bit targets. */
+#define this_cpu_read_stable_8(pcp)	({ BUILD_BUG(); (typeof(pcp))0; })
 
-static inline bool x86_this_cpu_variable_test_bit(int nr,
-			const unsigned long __percpu *addr)
-{
-	bool oldbit;
+#define raw_cpu_read_long(pcp)		raw_cpu_read_4(pcp)
+#endif
 
-	asm volatile("btl "__percpu_arg(2)",%1"
-			CC_SET(c)
-			: CC_OUT(c) (oldbit)
-			: "m" (*__my_cpu_ptr((unsigned long __percpu *)(addr))), "Ir" (nr));
+#define x86_this_cpu_constant_test_bit(_nr, _var)			\
+({									\
+	unsigned long __percpu *addr__ =				\
+		(unsigned long __percpu *)&(_var) + ((_nr) / BITS_PER_LONG); \
+	!!((1UL << ((_nr) % BITS_PER_LONG)) & raw_cpu_read(*addr__));	\
+})
 
-	return oldbit;
-}
+#define x86_this_cpu_variable_test_bit(_nr, _var)			\
+({									\
+	bool oldbit;							\
+									\
+	asm volatile("btl %[nr], " __percpu_arg([var])			\
+		     CC_SET(c)						\
+		     : CC_OUT(c) (oldbit)				\
+		     : [var] "m" (__my_cpu_var(_var)),			\
+		       [nr] "rI" (_nr));				\
+	oldbit;								\
+})
 
-#define x86_this_cpu_test_bit(nr, addr)			\
-	(__builtin_constant_p((nr))			\
-	 ? x86_this_cpu_constant_test_bit((nr), (addr))	\
-	 : x86_this_cpu_variable_test_bit((nr), (addr)))
+#define x86_this_cpu_test_bit(_nr, _var)				\
+	(__builtin_constant_p(_nr)					\
+	 ? x86_this_cpu_constant_test_bit(_nr, _var)			\
+	 : x86_this_cpu_variable_test_bit(_nr, _var))
 
 
 #include <asm-generic/percpu.h>
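
Editor's note, not part of the diff: with both helpers rewritten as macros, x86_this_cpu_test_bit() now takes the per-CPU capability array itself rather than a casted pointer, and __builtin_constant_p() selects the variant at compile time. A minimal sketch, where demo_test() is a hypothetical helper and nr is a runtime value:

	/* hypothetical helper, for illustration only */
	static bool demo_test(unsigned int nr)
	{
		/* constant bit number: pure C, folds to raw_cpu_read() plus a mask */
		bool c = x86_this_cpu_test_bit(3, cpu_info.x86_capability);

		/* runtime bit number: a single "btl %[nr], %gs:..." on the local copy */
		bool v = x86_this_cpu_test_bit(nr, cpu_info.x86_capability);

		return c || v;
	}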
