Skip to content

Commit ed766c2

Browse files
committed
Merge tag 'x86-entry-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 entry updates from Ingo Molnar:

 - Make IA32_EMULATION boot time configurable with the new ia32_emulation=<bool> boot option

 - Clean up fast syscall return validation code: convert it to C and refactor the code

 - As part of this, optimize the canonical RIP test code

* tag 'x86-entry-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/entry/32: Clean up syscall fast exit tests
  x86/entry/64: Use TASK_SIZE_MAX for canonical RIP test
  x86/entry/64: Convert SYSRET validation tests to C
  x86/entry/32: Remove SEP test for SYSEXIT
  x86/entry/32: Convert do_fast_syscall_32() to bool return type
  x86/entry/compat: Combine return value test from syscall handler
  x86/entry/64: Remove obsolete comment on tracing vs. SYSRET
  x86: Make IA32_EMULATION boot time configurable
  x86/entry: Make IA32 syscalls' availability depend on ia32_enabled()
  x86/elf: Make loading of 32bit processes depend on ia32_enabled()
  x86/entry: Compile entry_SYSCALL32_ignore() unconditionally
  x86/entry: Rename ignore_sysret()
  x86: Introduce ia32_enabled()
2 parents 5780e39 + 1a09a27 commit ed766c2

File tree

13 files changed

+155
-132
lines changed

13 files changed

+155
-132
lines changed

Documentation/admin-guide/kernel-parameters.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1893,6 +1893,12 @@
18931893
0 -- machine default
18941894
1 -- force brightness inversion
18951895

1896+
ia32_emulation= [X86-64]
1897+
Format: <bool>
1898+
When true, allows loading 32-bit programs and executing 32-bit
1899+
syscalls, essentially overriding IA32_EMULATION_DEFAULT_DISABLED at
1900+
boot time. When false, unconditionally disables IA32 emulation.
1901+
18961902
icn= [HW,ISDN]
18971903
Format: <io>[,<membase>[,<icn_id>[,<icn_id2>]]]
18981904

arch/x86/Kconfig

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2955,6 +2955,15 @@ config IA32_EMULATION
29552955
64-bit kernel. You should likely turn this on, unless you're
29562956
100% sure that you don't have any 32-bit programs left.
29572957

2958+
config IA32_EMULATION_DEFAULT_DISABLED
2959+
bool "IA32 emulation disabled by default"
2960+
default n
2961+
depends on IA32_EMULATION
2962+
help
2963+
Make IA32 emulation disabled by default. This prevents loading 32-bit
2964+
processes and access to 32-bit syscalls. If unsure, leave it to its
2965+
default value.
2966+
29582967
config X86_X32_ABI
29592968
bool "x32 ABI for 64-bit mode"
29602969
depends on X86_64

arch/x86/entry/common.c

Lines changed: 80 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include <linux/nospec.h>
2020
#include <linux/syscalls.h>
2121
#include <linux/uaccess.h>
22+
#include <linux/init.h>
2223

2324
#ifdef CONFIG_XEN_PV
2425
#include <xen/xen-ops.h>
@@ -70,7 +71,8 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
7071
return false;
7172
}
7273

73-
__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
74+
/* Returns true to return using SYSRET, or false to use IRET */
75+
__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
7476
{
7577
add_random_kstack_offset();
7678
nr = syscall_enter_from_user_mode(regs, nr);
@@ -84,6 +86,46 @@ __visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
8486

8587
instrumentation_end();
8688
syscall_exit_to_user_mode(regs);
89+
90+
/*
91+
* Check that the register state is valid for using SYSRET to exit
92+
* to userspace. Otherwise use the slower but fully capable IRET
93+
* exit path.
94+
*/
95+
96+
/* XEN PV guests always use the IRET path */
97+
if (cpu_feature_enabled(X86_FEATURE_XENPV))
98+
return false;
99+
100+
/* SYSRET requires RCX == RIP and R11 == EFLAGS */
101+
if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
102+
return false;
103+
104+
/* CS and SS must match the values set in MSR_STAR */
105+
if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
106+
return false;
107+
108+
/*
109+
* On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
110+
* in kernel space. This essentially lets the user take over
111+
* the kernel, since userspace controls RSP.
112+
*
113+
* TASK_SIZE_MAX covers all user-accessible addresses other than
114+
* the deprecated vsyscall page.
115+
*/
116+
if (unlikely(regs->ip >= TASK_SIZE_MAX))
117+
return false;
118+
119+
/*
120+
* SYSRET cannot restore RF. It can restore TF, but unlike IRET,
121+
* restoring TF results in a trap from userspace immediately after
122+
* SYSRET.
123+
*/
124+
if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
125+
return false;
126+
127+
/* Use SYSRET to exit to userspace */
128+
return true;
87129
}
88130
#endif
89131

@@ -96,6 +138,16 @@ static __always_inline int syscall_32_enter(struct pt_regs *regs)
96138
return (int)regs->orig_ax;
97139
}
98140

141+
#ifdef CONFIG_IA32_EMULATION
142+
bool __ia32_enabled __ro_after_init = !IS_ENABLED(CONFIG_IA32_EMULATION_DEFAULT_DISABLED);
143+
144+
static int ia32_emulation_override_cmdline(char *arg)
145+
{
146+
return kstrtobool(arg, &__ia32_enabled);
147+
}
148+
early_param("ia32_emulation", ia32_emulation_override_cmdline);
149+
#endif
150+
99151
/*
100152
* Invoke a 32-bit syscall. Called with IRQs on in CONTEXT_KERNEL.
101153
*/
@@ -182,8 +234,8 @@ static noinstr bool __do_fast_syscall_32(struct pt_regs *regs)
182234
return true;
183235
}
184236

185-
/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
186-
__visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
237+
/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
238+
__visible noinstr bool do_fast_syscall_32(struct pt_regs *regs)
187239
{
188240
/*
189241
* Called using the internal vDSO SYSENTER/SYSCALL32 calling
@@ -201,41 +253,36 @@ __visible noinstr long do_fast_syscall_32(struct pt_regs *regs)
201253

202254
/* Invoke the syscall. If it failed, keep it simple: use IRET. */
203255
if (!__do_fast_syscall_32(regs))
204-
return 0;
256+
return false;
205257

206-
#ifdef CONFIG_X86_64
207258
/*
208-
* Opportunistic SYSRETL: if possible, try to return using SYSRETL.
209-
* SYSRETL is available on all 64-bit CPUs, so we don't need to
210-
* bother with SYSEXIT.
211-
*
212-
* Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
213-
* because the ECX fixup above will ensure that this is essentially
214-
* never the case.
215-
*/
216-
return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
217-
regs->ip == landing_pad &&
218-
(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
219-
#else
220-
/*
221-
* Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
222-
*
223-
* Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
224-
* because the ECX fixup above will ensure that this is essentially
225-
* never the case.
226-
*
227-
* We don't allow syscalls at all from VM86 mode, but we still
228-
* need to check VM, because we might be returning from sys_vm86.
259+
* Check that the register state is valid for using SYSRETL/SYSEXIT
260+
* to exit to userspace. Otherwise use the slower but fully capable
261+
* IRET exit path.
229262
*/
230-
return static_cpu_has(X86_FEATURE_SEP) &&
231-
regs->cs == __USER_CS && regs->ss == __USER_DS &&
232-
regs->ip == landing_pad &&
233-
(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
234-
#endif
263+
264+
/* XEN PV guests always use the IRET path */
265+
if (cpu_feature_enabled(X86_FEATURE_XENPV))
266+
return false;
267+
268+
/* EIP must point to the VDSO landing pad */
269+
if (unlikely(regs->ip != landing_pad))
270+
return false;
271+
272+
/* CS and SS must match the values set in MSR_STAR */
273+
if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS))
274+
return false;
275+
276+
/* If the TF, RF, or VM flags are set, use IRET */
277+
if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)))
278+
return false;
279+
280+
/* Use SYSRETL/SYSEXIT to exit to userspace */
281+
return true;
235282
}
236283

237-
/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */
238-
__visible noinstr long do_SYSENTER_32(struct pt_regs *regs)
284+
/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
285+
__visible noinstr bool do_SYSENTER_32(struct pt_regs *regs)
239286
{
240287
/* SYSENTER loses RSP, but the vDSO saved it in RBP. */
241288
regs->sp = regs->bp;

arch/x86/entry/entry_32.S

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -837,7 +837,7 @@ SYM_FUNC_START(entry_SYSENTER_32)
837837

838838
movl %esp, %eax
839839
call do_SYSENTER_32
840-
testl %eax, %eax
840+
testb %al, %al
841841
jz .Lsyscall_32_done
842842

843843
STACKLEAK_ERASE

arch/x86/entry/entry_64.S

Lines changed: 4 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -126,70 +126,8 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
126126
* In the Xen PV case we must use iret anyway.
127127
*/
128128

129-
ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \
130-
X86_FEATURE_XENPV
131-
132-
movq RCX(%rsp), %rcx
133-
movq RIP(%rsp), %r11
134-
135-
cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */
136-
jne swapgs_restore_regs_and_return_to_usermode
137-
138-
/*
139-
* On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
140-
* in kernel space. This essentially lets the user take over
141-
* the kernel, since userspace controls RSP.
142-
*
143-
* If width of "canonical tail" ever becomes variable, this will need
144-
* to be updated to remain correct on both old and new CPUs.
145-
*
146-
* Change top bits to match most significant bit (47th or 56th bit
147-
* depending on paging mode) in the address.
148-
*/
149-
#ifdef CONFIG_X86_5LEVEL
150-
ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
151-
"shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
152-
#else
153-
shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
154-
sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
155-
#endif
156-
157-
/* If this changed %rcx, it was not canonical */
158-
cmpq %rcx, %r11
159-
jne swapgs_restore_regs_and_return_to_usermode
160-
161-
cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */
162-
jne swapgs_restore_regs_and_return_to_usermode
163-
164-
movq R11(%rsp), %r11
165-
cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */
166-
jne swapgs_restore_regs_and_return_to_usermode
167-
168-
/*
169-
* SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
170-
* restore RF properly. If the slowpath sets it for whatever reason, we
171-
* need to restore it correctly.
172-
*
173-
* SYSRET can restore TF, but unlike IRET, restoring TF results in a
174-
* trap from userspace immediately after SYSRET. This would cause an
175-
* infinite loop whenever #DB happens with register state that satisfies
176-
* the opportunistic SYSRET conditions. For example, single-stepping
177-
* this user code:
178-
*
179-
* movq $stuck_here, %rcx
180-
* pushfq
181-
* popq %r11
182-
* stuck_here:
183-
*
184-
* would never get past 'stuck_here'.
185-
*/
186-
testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
187-
jnz swapgs_restore_regs_and_return_to_usermode
188-
189-
/* nothing to check for RSP */
190-
191-
cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */
192-
jne swapgs_restore_regs_and_return_to_usermode
129+
ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
130+
"jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
193131

194132
/*
195133
* We win! This label is here just for ease of understanding
@@ -1509,18 +1447,16 @@ nmi_restore:
15091447
iretq
15101448
SYM_CODE_END(asm_exc_nmi)
15111449

1512-
#ifndef CONFIG_IA32_EMULATION
15131450
/*
15141451
* This handles SYSCALL from 32-bit code. There is no way to program
15151452
* MSRs to fully disable 32-bit SYSCALL.
15161453
*/
1517-
SYM_CODE_START(ignore_sysret)
1454+
SYM_CODE_START(entry_SYSCALL32_ignore)
15181455
UNWIND_HINT_END_OF_STACK
15191456
ENDBR
15201457
mov $-ENOSYS, %eax
15211458
sysretl
1522-
SYM_CODE_END(ignore_sysret)
1523-
#endif
1459+
SYM_CODE_END(entry_SYSCALL32_ignore)
15241460

15251461
.pushsection .text, "ax"
15261462
__FUNC_ALIGN

arch/x86/entry/entry_64_compat.S

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -118,9 +118,6 @@ SYM_INNER_LABEL(entry_SYSENTER_compat_after_hwframe, SYM_L_GLOBAL)
118118

119119
movq %rsp, %rdi
120120
call do_SYSENTER_32
121-
/* XEN PV guests always use IRET path */
122-
ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \
123-
"jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
124121
jmp sysret32_from_system_call
125122

126123
.Lsysenter_fix_flags:
@@ -212,13 +209,15 @@ SYM_INNER_LABEL(entry_SYSCALL_compat_after_hwframe, SYM_L_GLOBAL)
212209

213210
movq %rsp, %rdi
214211
call do_fast_syscall_32
212+
213+
sysret32_from_system_call:
215214
/* XEN PV guests always use IRET path */
216-
ALTERNATIVE "testl %eax, %eax; jz swapgs_restore_regs_and_return_to_usermode", \
215+
ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
217216
"jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
218217

219-
/* Opportunistic SYSRET */
220-
sysret32_from_system_call:
221218
/*
219+
* Opportunistic SYSRET
220+
*
222221
* We are not going to return to userspace from the trampoline
223222
* stack. So let's erase the thread stack right now.
224223
*/

arch/x86/include/asm/elf.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
*/
88
#include <linux/thread_info.h>
99

10+
#include <asm/ia32.h>
1011
#include <asm/ptrace.h>
1112
#include <asm/user.h>
1213
#include <asm/auxvec.h>
@@ -149,7 +150,7 @@ do { \
149150
((x)->e_machine == EM_X86_64)
150151

151152
#define compat_elf_check_arch(x) \
152-
(elf_check_arch_ia32(x) || \
153+
((elf_check_arch_ia32(x) && ia32_enabled()) || \
153154
(IS_ENABLED(CONFIG_X86_X32_ABI) && (x)->e_machine == EM_X86_64))
154155

155156
static inline void elf_common_init(struct thread_struct *t,

arch/x86/include/asm/ia32.h

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,20 @@ extern void ia32_pick_mmap_layout(struct mm_struct *mm);
6868

6969
#endif
7070

71-
#endif /* CONFIG_IA32_EMULATION */
71+
extern bool __ia32_enabled;
72+
73+
static inline bool ia32_enabled(void)
74+
{
75+
return __ia32_enabled;
76+
}
77+
78+
#else /* !CONFIG_IA32_EMULATION */
79+
80+
static inline bool ia32_enabled(void)
81+
{
82+
return IS_ENABLED(CONFIG_X86_32);
83+
}
84+
85+
#endif
7286

7387
#endif /* _ASM_X86_IA32_H */

arch/x86/include/asm/processor.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -399,7 +399,7 @@ static inline unsigned long cpu_kernelmode_gs_base(int cpu)
399399
return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu);
400400
}
401401

402-
extern asmlinkage void ignore_sysret(void);
402+
extern asmlinkage void entry_SYSCALL32_ignore(void);
403403

404404
/* Save actual FS/GS selectors and bases to current->thread */
405405
void current_save_fsgs(void);

arch/x86/include/asm/proto.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,9 @@ void entry_INT80_compat(void);
3636
#ifdef CONFIG_XEN_PV
3737
void xen_entry_INT80_compat(void);
3838
#endif
39+
#else /* !CONFIG_IA32_EMULATION */
40+
#define entry_SYSCALL_compat NULL
41+
#define entry_SYSENTER_compat NULL
3942
#endif
4043

4144
void x86_configure_nx(void);

0 commit comments

Comments
 (0)