Skip to content

Commit 70f43ea

Browse files
committed
Merge tag 'x86-mm-2024-09-17' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 memory management updates from Thomas Gleixner:

 - Make LAM enablement safe vs. kernel threads which temporarily use a process mm. Switching back to the process would not update CR3 and therefore would not enable LAM, causing faults in user space when using tagged pointers. Cure it by synchronizing LAM enablement via IPIs to all CPUs which use the related mm.

 - Cure a harmless LAM inconsistency between CR3 and the state during context switch. It is both confusing and prone to lead to real bugs.

 - Fix alternate signal stack handling for threads which run with a non-zero protection key. The non-zero key prevents the kernel from accessing the alternate stack. Cure it by temporarily enabling all protection keys for the alternate stack setup/restore operations.

 - Provide an EFI config table identity mapping for the kexec kernel to prevent kexec failures because the new kernel cannot access the config table array.

 - Use GB pages only when a full GB is mapped in the identity map, as otherwise the CPU can speculate into reserved areas after the end of memory, which causes malfunction on UV systems.

 - Remove the noisy and pointless SRAT table dump during boot.

 - Use is_ioremap_addr() for iounmap() address range checks instead of high_memory. is_ioremap_addr() is more precise.

* tag 'x86-mm-2024-09-17' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/ioremap: Improve iounmap() address range checks
  x86/mm: Remove duplicate check from build_cr3()
  x86/mm: Remove unused NX related declarations
  x86/mm: Remove unused CR3_HW_ASID_BITS
  x86/mm: Don't print out SRAT table information
  x86/mm/ident_map: Use gbpages only where full GB page should be mapped.
  x86/kexec: Add EFI config table identity mapping for kexec kernel
  selftests/mm: Add new testcases for pkeys
  x86/pkeys: Restore altstack access in sigreturn()
  x86/pkeys: Update PKRU to enable all pkeys before XSAVE
  x86/pkeys: Add helper functions to update PKRU on the sigframe
  x86/pkeys: Add PKRU as a parameter in signal handling functions
  x86/mm: Cleanup prctl_enable_tagged_addr() nr_bits error checking
  x86/mm: Fix LAM inconsistency during context switch
  x86/mm: Use IPIs to synchronize LAM enablement
2 parents b136021 + 50c6dbd commit 70f43ea

File tree

20 files changed

+664
-60
lines changed

20 files changed

+664
-60
lines changed

arch/x86/include/asm/fpu/signal.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame,
2929

3030
unsigned long fpu__get_fpstate_size(void);
3131

32-
extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size);
32+
extern bool copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size, u32 pkru);
3333
extern void fpu__clear_user_states(struct fpu *fpu);
3434
extern bool fpu__restore_sig(void __user *buf, int ia32_frame);
3535

arch/x86/include/asm/mmu_context.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,13 @@ static inline void switch_ldt(struct mm_struct *prev, struct mm_struct *next)
8888
#ifdef CONFIG_ADDRESS_MASKING
8989
static inline unsigned long mm_lam_cr3_mask(struct mm_struct *mm)
9090
{
91-
return mm->context.lam_cr3_mask;
91+
/*
92+
* When switch_mm_irqs_off() is called for a kthread, it may race with
93+
* LAM enablement. switch_mm_irqs_off() uses the LAM mask to do two
94+
* things: populate CR3 and populate 'cpu_tlbstate.lam'. Make sure it
95+
* reads a single value for both.
96+
*/
97+
return READ_ONCE(mm->context.lam_cr3_mask);
9298
}
9399

94100
static inline void dup_lam(struct mm_struct *oldmm, struct mm_struct *mm)

arch/x86/include/asm/pgtable_types.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -517,8 +517,6 @@ typedef struct page *pgtable_t;
517517

518518
extern pteval_t __supported_pte_mask;
519519
extern pteval_t __default_kernel_pte_mask;
520-
extern void set_nx(void);
521-
extern int nx_enabled;
522520

523521
#define pgprot_writecombine pgprot_writecombine
524522
extern pgprot_t pgprot_writecombine(pgprot_t prot);

arch/x86/include/asm/tlbflush.h

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -399,11 +399,10 @@ static inline u64 tlbstate_lam_cr3_mask(void)
399399
return lam << X86_CR3_LAM_U57_BIT;
400400
}
401401

402-
static inline void set_tlbstate_lam_mode(struct mm_struct *mm)
402+
static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask)
403403
{
404-
this_cpu_write(cpu_tlbstate.lam,
405-
mm->context.lam_cr3_mask >> X86_CR3_LAM_U57_BIT);
406-
this_cpu_write(tlbstate_untag_mask, mm->context.untag_mask);
404+
this_cpu_write(cpu_tlbstate.lam, lam >> X86_CR3_LAM_U57_BIT);
405+
this_cpu_write(tlbstate_untag_mask, untag_mask);
407406
}
408407

409408
#else
@@ -413,7 +412,7 @@ static inline u64 tlbstate_lam_cr3_mask(void)
413412
return 0;
414413
}
415414

416-
static inline void set_tlbstate_lam_mode(struct mm_struct *mm)
415+
static inline void cpu_tlbstate_update_lam(unsigned long lam, u64 untag_mask)
417416
{
418417
}
419418
#endif

arch/x86/kernel/fpu/signal.c

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,16 @@ static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf,
6363
return true;
6464
}
6565

66+
/*
67+
* Update the value of PKRU register that was already pushed onto the signal frame.
68+
*/
69+
static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u32 pkru)
70+
{
71+
if (unlikely(!cpu_feature_enabled(X86_FEATURE_OSPKE)))
72+
return 0;
73+
return __put_user(pkru, (unsigned int __user *)get_xsave_addr_user(buf, XFEATURE_PKRU));
74+
}
75+
6676
/*
6777
* Signal frame handlers.
6878
*/
@@ -156,10 +166,17 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame,
156166
return !err;
157167
}
158168

159-
static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
169+
static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf, u32 pkru)
160170
{
161-
if (use_xsave())
162-
return xsave_to_user_sigframe(buf);
171+
int err = 0;
172+
173+
if (use_xsave()) {
174+
err = xsave_to_user_sigframe(buf);
175+
if (!err)
176+
err = update_pkru_in_sigframe(buf, pkru);
177+
return err;
178+
}
179+
163180
if (use_fxsr())
164181
return fxsave_to_user_sigframe((struct fxregs_state __user *) buf);
165182
else
@@ -185,7 +202,7 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf)
185202
* For [f]xsave state, update the SW reserved fields in the [f]xsave frame
186203
* indicating the absence/presence of the extended state to the user.
187204
*/
188-
bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
205+
bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size, u32 pkru)
189206
{
190207
struct task_struct *tsk = current;
191208
struct fpstate *fpstate = tsk->thread.fpu.fpstate;
@@ -228,7 +245,7 @@ bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size)
228245
fpregs_restore_userregs();
229246

230247
pagefault_disable();
231-
ret = copy_fpregs_to_sigframe(buf_fx);
248+
ret = copy_fpregs_to_sigframe(buf_fx, pkru);
232249
pagefault_enable();
233250
fpregs_unlock();
234251

arch/x86/kernel/fpu/xstate.c

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -999,6 +999,19 @@ void *get_xsave_addr(struct xregs_state *xsave, int xfeature_nr)
999999
}
10001000
EXPORT_SYMBOL_GPL(get_xsave_addr);
10011001

1002+
/*
1003+
* Given an xstate feature nr, calculate where in the xsave buffer the state is.
1004+
* The xsave buffer should be in standard format, not compacted (e.g. user mode
1005+
* signal frames).
1006+
*/
1007+
void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr)
1008+
{
1009+
if (WARN_ON_ONCE(!xfeature_enabled(xfeature_nr)))
1010+
return NULL;
1011+
1012+
return (void __user *)xsave + xstate_offsets[xfeature_nr];
1013+
}
1014+
10021015
#ifdef CONFIG_ARCH_HAS_PKEYS
10031016

10041017
/*

arch/x86/kernel/fpu/xstate.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@ extern int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void
5454
extern void fpu__init_cpu_xstate(void);
5555
extern void fpu__init_system_xstate(unsigned int legacy_size);
5656

57+
extern void __user *get_xsave_addr_user(struct xregs_state __user *xsave, int xfeature_nr);
58+
5759
static inline u64 xfeatures_mask_supervisor(void)
5860
{
5961
return fpu_kernel_cfg.max_features & XFEATURE_MASK_SUPERVISOR_SUPPORTED;

arch/x86/kernel/machine_kexec_64.c

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include <asm/setup.h>
2929
#include <asm/set_memory.h>
3030
#include <asm/cpu.h>
31+
#include <asm/efi.h>
3132

3233
#ifdef CONFIG_ACPI
3334
/*
@@ -87,6 +88,8 @@ map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
8788
{
8889
#ifdef CONFIG_EFI
8990
unsigned long mstart, mend;
91+
void *kaddr;
92+
int ret;
9093

9194
if (!efi_enabled(EFI_BOOT))
9295
return 0;
@@ -102,6 +105,30 @@ map_efi_systab(struct x86_mapping_info *info, pgd_t *level4p)
102105
if (!mstart)
103106
return 0;
104107

108+
ret = kernel_ident_mapping_init(info, level4p, mstart, mend);
109+
if (ret)
110+
return ret;
111+
112+
kaddr = memremap(mstart, mend - mstart, MEMREMAP_WB);
113+
if (!kaddr) {
114+
pr_err("Could not map UEFI system table\n");
115+
return -ENOMEM;
116+
}
117+
118+
mstart = efi_config_table;
119+
120+
if (efi_enabled(EFI_64BIT)) {
121+
efi_system_table_64_t *stbl = (efi_system_table_64_t *)kaddr;
122+
123+
mend = mstart + sizeof(efi_config_table_64_t) * stbl->nr_tables;
124+
} else {
125+
efi_system_table_32_t *stbl = (efi_system_table_32_t *)kaddr;
126+
127+
mend = mstart + sizeof(efi_config_table_32_t) * stbl->nr_tables;
128+
}
129+
130+
memunmap(kaddr);
131+
105132
return kernel_ident_mapping_init(info, level4p, mstart, mend);
106133
#endif
107134
return 0;

arch/x86/kernel/process_64.c

Lines changed: 32 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -798,6 +798,32 @@ static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
798798

799799
#define LAM_U57_BITS 6
800800

801+
static void enable_lam_func(void *__mm)
802+
{
803+
struct mm_struct *mm = __mm;
804+
unsigned long lam;
805+
806+
if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) {
807+
lam = mm_lam_cr3_mask(mm);
808+
write_cr3(__read_cr3() | lam);
809+
cpu_tlbstate_update_lam(lam, mm_untag_mask(mm));
810+
}
811+
}
812+
813+
static void mm_enable_lam(struct mm_struct *mm)
814+
{
815+
mm->context.lam_cr3_mask = X86_CR3_LAM_U57;
816+
mm->context.untag_mask = ~GENMASK(62, 57);
817+
818+
/*
819+
* Even though the process must still be single-threaded at this
820+
* point, kernel threads may be using the mm. IPI those kernel
821+
* threads if they exist.
822+
*/
823+
on_each_cpu_mask(mm_cpumask(mm), enable_lam_func, mm, true);
824+
set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);
825+
}
826+
801827
static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
802828
{
803829
if (!cpu_feature_enabled(X86_FEATURE_LAM))
@@ -814,25 +840,21 @@ static int prctl_enable_tagged_addr(struct mm_struct *mm, unsigned long nr_bits)
814840
if (mmap_write_lock_killable(mm))
815841
return -EINTR;
816842

843+
/*
844+
* MM_CONTEXT_LOCK_LAM is set on clone. Prevent LAM from
845+
* being enabled unless the process is single threaded:
846+
*/
817847
if (test_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags)) {
818848
mmap_write_unlock(mm);
819849
return -EBUSY;
820850
}
821851

822-
if (!nr_bits) {
823-
mmap_write_unlock(mm);
824-
return -EINVAL;
825-
} else if (nr_bits <= LAM_U57_BITS) {
826-
mm->context.lam_cr3_mask = X86_CR3_LAM_U57;
827-
mm->context.untag_mask = ~GENMASK(62, 57);
828-
} else {
852+
if (!nr_bits || nr_bits > LAM_U57_BITS) {
829853
mmap_write_unlock(mm);
830854
return -EINVAL;
831855
}
832856

833-
write_cr3(__read_cr3() | mm->context.lam_cr3_mask);
834-
set_tlbstate_lam_mode(mm);
835-
set_bit(MM_CONTEXT_LOCK_LAM, &mm->context.flags);
857+
mm_enable_lam(mm);
836858

837859
mmap_write_unlock(mm);
838860

arch/x86/kernel/signal.c

Lines changed: 28 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,24 @@ static inline int is_x32_frame(struct ksignal *ksig)
6060
ksig->ka.sa.sa_flags & SA_X32_ABI;
6161
}
6262

63+
/*
64+
* Enable all pkeys temporarily, so as to ensure that both the current
65+
* execution stack as well as the alternate signal stack are writeable.
66+
* The application can use any of the available pkeys to protect the
67+
* alternate signal stack, and we don't know which one it is, so enable
68+
* all. The PKRU register will be reset to init_pkru later in the flow,
69+
* in fpu__clear_user_states(), and it is the application's responsibility
70+
* to enable the appropriate pkey as the first step in the signal handler
71+
* so that the handler does not segfault.
72+
*/
73+
static inline u32 sig_prepare_pkru(void)
74+
{
75+
u32 orig_pkru = read_pkru();
76+
77+
write_pkru(0);
78+
return orig_pkru;
79+
}
80+
6381
/*
6482
* Set up a signal frame.
6583
*/
@@ -84,6 +102,7 @@ get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size,
84102
unsigned long math_size = 0;
85103
unsigned long sp = regs->sp;
86104
unsigned long buf_fx = 0;
105+
u32 pkru;
87106

88107
/* redzone */
89108
if (!ia32_frame)
@@ -138,9 +157,17 @@ get_sigframe(struct ksignal *ksig, struct pt_regs *regs, size_t frame_size,
138157
return (void __user *)-1L;
139158
}
140159

160+
/* Update PKRU to enable access to the alternate signal stack. */
161+
pkru = sig_prepare_pkru();
141162
/* save i387 and extended state */
142-
if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size))
163+
if (!copy_fpstate_to_sigframe(*fpstate, (void __user *)buf_fx, math_size, pkru)) {
164+
/*
165+
* Restore PKRU to the original, user-defined value; disable
166+
* extra pkeys enabled for the alternate signal stack, if any.
167+
*/
168+
write_pkru(pkru);
143169
return (void __user *)-1L;
170+
}
144171

145172
return (void __user *)sp;
146173
}

0 commit comments

Comments
 (0)