Skip to content

Commit 2d352ec

Browse files
ubizjak authored and Ingo Molnar committed
x86/locking: Use asm_inline for {,try_}cmpxchg{64,128} emulations
According to: https://gcc.gnu.org/onlinedocs/gcc/Size-of-an-asm.html the usage of asm pseudo directives in the asm template can confuse the compiler to wrongly estimate the size of the generated code. The ALTERNATIVE macro expands to several asm pseudo directives, so its usage in {,try_}cmpxchg{64,128} causes the instruction length estimate to fail by an order of magnitude (the specially instrumented compiler reports the estimated length of these asm templates to be more than 20 instructions long). This incorrect estimate further causes suboptimal inlining decisions, suboptimal instruction scheduling and suboptimal code block alignments for functions that use these locking primitives. Use asm_inline instead: https://gcc.gnu.org/pipermail/gcc-patches/2018-December/512349.html which is a feature that makes GCC pretend some inline assembler code is tiny (while it would think it is huge), instead of just asm. For code size estimation, the size of the asm is then taken as the minimum size of one instruction, ignoring how many instructions the compiler thinks it is. The effect of this patch on the x86_64 target is minor, since 128-bit functions are rarely used on this target. The code size of the resulting defconfig object file stays the same: text data bss dec hex filename 27456612 4638523 814148 32909283 1f627e3 vmlinux-old.o 27456612 4638523 814148 32909283 1f627e3 vmlinux-new.o but the patch has a minor effect on code layout due to the different scheduling decisions in functions containing changed macros. There is no effect on the x86_32 target; the code size of the resulting defconfig object file and the code layout stay the same: text data bss dec hex filename 18883870 2679275 1707916 23271061 1631695 vmlinux-old.o 18883870 2679275 1707916 23271061 1631695 vmlinux-new.o Signed-off-by: Uros Bizjak <[email protected]> Signed-off-by: Ingo Molnar <[email protected]> Cc: Linus Torvalds <[email protected]> Link: https://lore.kernel.org/r/[email protected]
1 parent 4087e16 commit 2d352ec

File tree

2 files changed

+55
-54
lines changed

2 files changed

+55
-54
lines changed

arch/x86/include/asm/cmpxchg_32.h

Lines changed: 18 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -91,12 +91,14 @@ static __always_inline bool __try_cmpxchg64_local(volatile u64 *ptr, u64 *oldp,
9191
union __u64_halves o = { .full = (_old), }, \
9292
n = { .full = (_new), }; \
9393
\
94-
asm volatile(ALTERNATIVE(_lock_loc \
95-
"call cmpxchg8b_emu", \
96-
_lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \
97-
: ALT_OUTPUT_SP("+a" (o.low), "+d" (o.high)) \
98-
: "b" (n.low), "c" (n.high), [ptr] "S" (_ptr) \
99-
: "memory"); \
94+
asm_inline volatile( \
95+
ALTERNATIVE(_lock_loc \
96+
"call cmpxchg8b_emu", \
97+
_lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \
98+
: ALT_OUTPUT_SP("+a" (o.low), "+d" (o.high)) \
99+
: "b" (n.low), "c" (n.high), \
100+
[ptr] "S" (_ptr) \
101+
: "memory"); \
100102
\
101103
o.full; \
102104
})
@@ -119,14 +121,16 @@ static __always_inline u64 arch_cmpxchg64_local(volatile u64 *ptr, u64 old, u64
119121
n = { .full = (_new), }; \
120122
bool ret; \
121123
\
122-
asm volatile(ALTERNATIVE(_lock_loc \
123-
"call cmpxchg8b_emu", \
124-
_lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \
125-
CC_SET(e) \
126-
: ALT_OUTPUT_SP(CC_OUT(e) (ret), \
127-
"+a" (o.low), "+d" (o.high)) \
128-
: "b" (n.low), "c" (n.high), [ptr] "S" (_ptr) \
129-
: "memory"); \
124+
asm_inline volatile( \
125+
ALTERNATIVE(_lock_loc \
126+
"call cmpxchg8b_emu", \
127+
_lock "cmpxchg8b %a[ptr]", X86_FEATURE_CX8) \
128+
CC_SET(e) \
129+
: ALT_OUTPUT_SP(CC_OUT(e) (ret), \
130+
"+a" (o.low), "+d" (o.high)) \
131+
: "b" (n.low), "c" (n.high), \
132+
[ptr] "S" (_ptr) \
133+
: "memory"); \
130134
\
131135
if (unlikely(!ret)) \
132136
*(_oldp) = o.full; \

arch/x86/include/asm/percpu.h

Lines changed: 37 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -348,15 +348,14 @@ do { \
348348
old__.var = _oval; \
349349
new__.var = _nval; \
350350
\
351-
asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \
352-
"cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \
353-
: ALT_OUTPUT_SP([var] "+m" (__my_cpu_var(_var)), \
354-
"+a" (old__.low), \
355-
"+d" (old__.high)) \
356-
: "b" (new__.low), \
357-
"c" (new__.high), \
358-
"S" (&(_var)) \
359-
: "memory"); \
351+
asm_inline qual ( \
352+
ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \
353+
"cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \
354+
: ALT_OUTPUT_SP([var] "+m" (__my_cpu_var(_var)), \
355+
"+a" (old__.low), "+d" (old__.high)) \
356+
: "b" (new__.low), "c" (new__.high), \
357+
"S" (&(_var)) \
358+
: "memory"); \
360359
\
361360
old__.var; \
362361
})
@@ -378,17 +377,16 @@ do { \
378377
old__.var = *_oval; \
379378
new__.var = _nval; \
380379
\
381-
asm qual (ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \
382-
"cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \
383-
CC_SET(z) \
384-
: ALT_OUTPUT_SP(CC_OUT(z) (success), \
385-
[var] "+m" (__my_cpu_var(_var)), \
386-
"+a" (old__.low), \
387-
"+d" (old__.high)) \
388-
: "b" (new__.low), \
389-
"c" (new__.high), \
390-
"S" (&(_var)) \
391-
: "memory"); \
380+
asm_inline qual ( \
381+
ALTERNATIVE("call this_cpu_cmpxchg8b_emu", \
382+
"cmpxchg8b " __percpu_arg([var]), X86_FEATURE_CX8) \
383+
CC_SET(z) \
384+
: ALT_OUTPUT_SP(CC_OUT(z) (success), \
385+
[var] "+m" (__my_cpu_var(_var)), \
386+
"+a" (old__.low), "+d" (old__.high)) \
387+
: "b" (new__.low), "c" (new__.high), \
388+
"S" (&(_var)) \
389+
: "memory"); \
392390
if (unlikely(!success)) \
393391
*_oval = old__.var; \
394392
\
@@ -419,15 +417,14 @@ do { \
419417
old__.var = _oval; \
420418
new__.var = _nval; \
421419
\
422-
asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \
423-
"cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \
424-
: ALT_OUTPUT_SP([var] "+m" (__my_cpu_var(_var)), \
425-
"+a" (old__.low), \
426-
"+d" (old__.high)) \
427-
: "b" (new__.low), \
428-
"c" (new__.high), \
429-
"S" (&(_var)) \
430-
: "memory"); \
420+
asm_inline qual ( \
421+
ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \
422+
"cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \
423+
: ALT_OUTPUT_SP([var] "+m" (__my_cpu_var(_var)), \
424+
"+a" (old__.low), "+d" (old__.high)) \
425+
: "b" (new__.low), "c" (new__.high), \
426+
"S" (&(_var)) \
427+
: "memory"); \
431428
\
432429
old__.var; \
433430
})
@@ -449,19 +446,19 @@ do { \
449446
old__.var = *_oval; \
450447
new__.var = _nval; \
451448
\
452-
asm qual (ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \
453-
"cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \
454-
CC_SET(z) \
455-
: ALT_OUTPUT_SP(CC_OUT(z) (success), \
456-
[var] "+m" (__my_cpu_var(_var)), \
457-
"+a" (old__.low), \
458-
"+d" (old__.high)) \
459-
: "b" (new__.low), \
460-
"c" (new__.high), \
461-
"S" (&(_var)) \
462-
: "memory"); \
449+
asm_inline qual ( \
450+
ALTERNATIVE("call this_cpu_cmpxchg16b_emu", \
451+
"cmpxchg16b " __percpu_arg([var]), X86_FEATURE_CX16) \
452+
CC_SET(z) \
453+
: ALT_OUTPUT_SP(CC_OUT(z) (success), \
454+
[var] "+m" (__my_cpu_var(_var)), \
455+
"+a" (old__.low), "+d" (old__.high)) \
456+
: "b" (new__.low), "c" (new__.high), \
457+
"S" (&(_var)) \
458+
: "memory"); \
463459
if (unlikely(!success)) \
464460
*_oval = old__.var; \
461+
\
465462
likely(success); \
466463
})
467464

0 commit comments

Comments
 (0)