
Commit 5b7f723

Merge tag 'x86-boot-2025-01-21' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 boot updates from Ingo Molnar:

 - A large and involved preparatory series to pave the way to add
   exception handling for relocate_kernel - which will be a debugging
   facility that has aided in the field to debug an exceptionally
   hard-to-debug early boot bug. Plus assorted cleanups and fixes that
   were discovered along the way, by David Woodhouse:

      - Clean up and document register use in relocate_kernel_64.S
      - Use named labels in swap_pages in relocate_kernel_64.S
      - Only swap pages for ::preserve_context mode
      - Allocate PGD for x86_64 transition page tables separately
      - Copy control page into place in machine_kexec_prepare()
      - Invoke copy of relocate_kernel() instead of the original
      - Move relocate_kernel to kernel .data section
      - Add data section to relocate_kernel
      - Drop page_list argument from relocate_kernel()
      - Eliminate writes through kernel mapping of relocate_kernel page
      - Clean up register usage in relocate_kernel()
      - Mark relocate_kernel page as ROX instead of RWX
      - Disable global pages before writing to control page
      - Ensure preserve_context flag is set on return to kernel
      - Use correct swap page in swap_pages function
      - Fix stack and handling of re-entry point for ::preserve_context
      - Mark machine_kexec() with __nocfi
      - Cope with relocate_kernel() not being at the start of the page
      - Use typedef for relocate_kernel_fn function prototype
      - Fix location of relocate_kernel with -ffunction-sections (fix by
        Nathan Chancellor)

 - A series to remove the last remaining absolute symbol references from
   .head.text, and enforce this at build time, by Ard Biesheuvel:

      - Avoid WARN()s and panic()s in early boot code
      - Don't hang but terminate on failure to remap SVSM CA
      - Determine VA/PA offset before entering C code
      - Avoid intentional absolute symbol references in .head.text
      - Disable UBSAN in early boot code
      - Move ENTRY_TEXT to the start of the image
      - Move .head.text into its own output section
      - Reject absolute references in .head.text

 - The above build-time enforcement uncovered a handful of bugs in
   essentially non-working code, and a workaround for a toolchain bug,
   fixed by Ard Biesheuvel as well:

      - Fix spurious undefined reference when CONFIG_X86_5LEVEL=n, on
        GCC-12
      - Disable UBSAN on SEV code that may execute very early
      - Disable ftrace branch profiling in SEV startup code

 - And miscellaneous cleanups:

      - kexec_core: Add and update comments regarding the KEXEC_JUMP
        flow (Rafael J. Wysocki)
      - x86/sysfs: Constify 'struct bin_attribute' (Thomas Weißschuh)

* tag 'x86-boot-2025-01-21' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (33 commits)
  x86/sev: Disable ftrace branch profiling in SEV startup code
  x86/kexec: Use typedef for relocate_kernel_fn function prototype
  x86/kexec: Cope with relocate_kernel() not being at the start of the page
  kexec_core: Add and update comments regarding the KEXEC_JUMP flow
  x86/kexec: Mark machine_kexec() with __nocfi
  x86/kexec: Fix location of relocate_kernel with -ffunction-sections
  x86/kexec: Fix stack and handling of re-entry point for ::preserve_context
  x86/kexec: Use correct swap page in swap_pages function
  x86/kexec: Ensure preserve_context flag is set on return to kernel
  x86/kexec: Disable global pages before writing to control page
  x86/sev: Don't hang but terminate on failure to remap SVSM CA
  x86/sev: Disable UBSAN on SEV code that may execute very early
  x86/boot/64: Fix spurious undefined reference when CONFIG_X86_5LEVEL=n, on GCC-12
  x86/sysfs: Constify 'struct bin_attribute'
  x86/kexec: Mark relocate_kernel page as ROX instead of RWX
  x86/kexec: Clean up register usage in relocate_kernel()
  x86/kexec: Eliminate writes through kernel mapping of relocate_kernel page
  x86/kexec: Drop page_list argument from relocate_kernel()
  x86/kexec: Add data section to relocate_kernel
  x86/kexec: Move relocate_kernel to kernel .data section
  ...
2 parents 7685b33 + cf4ca80 commit 5b7f723

File tree: 18 files changed, +317 additions, -224 deletions


arch/x86/coco/sev/Makefile

Lines changed: 3 additions & 0 deletions
@@ -13,3 +13,6 @@ KCOV_INSTRUMENT_core.o := n
 # With some compiler versions the generated code results in boot hangs, caused
 # by several compilation units. To be safe, disable all instrumentation.
 KCSAN_SANITIZE := n
+
+# Clang 14 and older may fail to respect __no_sanitize_undefined when inlining
+UBSAN_SANITIZE := n

arch/x86/coco/sev/core.c

Lines changed: 6 additions & 9 deletions
@@ -9,6 +9,8 @@
 
 #define pr_fmt(fmt)	"SEV: " fmt
 
+#define DISABLE_BRANCH_PROFILING
+
 #include <linux/sched/debug.h>	/* For show_regs() */
 #include <linux/percpu-defs.h>
 #include <linux/cc_platform.h>
@@ -787,15 +789,10 @@ early_set_pages_state(unsigned long vaddr, unsigned long paddr,
 
 		val = sev_es_rd_ghcb_msr();
 
-		if (WARN(GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP,
-			 "Wrong PSC response code: 0x%x\n",
-			 (unsigned int)GHCB_RESP_CODE(val)))
+		if (GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP)
 			goto e_term;
 
-		if (WARN(GHCB_MSR_PSC_RESP_VAL(val),
-			 "Failed to change page state to '%s' paddr 0x%lx error 0x%llx\n",
-			 op == SNP_PAGE_STATE_PRIVATE ? "private" : "shared",
-			 paddr, GHCB_MSR_PSC_RESP_VAL(val)))
+		if (GHCB_MSR_PSC_RESP_VAL(val))
 			goto e_term;
 
 		/* Page validation must be performed after changing to private */
@@ -831,7 +828,7 @@ void __head early_snp_set_memory_private(unsigned long vaddr, unsigned long padd
 	early_set_pages_state(vaddr, paddr, npages, SNP_PAGE_STATE_PRIVATE);
 }
 
-void __init early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
+void __head early_snp_set_memory_shared(unsigned long vaddr, unsigned long paddr,
 					unsigned long npages)
 {
 	/*
@@ -2423,7 +2420,7 @@ static __head void svsm_setup(struct cc_blob_sev_info *cc_info)
 	call.rcx = pa;
 	ret = svsm_perform_call_protocol(&call);
 	if (ret)
-		panic("Can't remap the SVSM CA, ret=%d, rax_out=0x%llx\n", ret, call.rax_out);
+		sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_SVSM_CA_REMAP_FAIL);
 
 	RIP_REL_REF(boot_svsm_caa) = (struct svsm_ca *)pa;
 	RIP_REL_REF(boot_svsm_caa_pa) = pa;
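
Dropping WARN() and panic() here is deliberate: this code can run from .head.text via the early identity mapping, where the printk machinery (and the absolute symbol references it drags in) is not usable, so the only safe failure mode is terminating the guest through the GHCB protocol. A minimal sketch of the resulting pattern, using the helpers visible in this diff (the wrapper function itself is hypothetical):

	/*
	 * Hypothetical wrapper illustrating the early-boot failure pattern
	 * above: no WARN() or printk(), just a GHCB-protocol termination
	 * request. GHCB_TERM_PSC is the existing page-state-change
	 * termination reason code.
	 */
	static void __head early_psc_check(u64 val)
	{
		if (GHCB_RESP_CODE(val) != GHCB_MSR_PSC_RESP ||
		    GHCB_MSR_PSC_RESP_VAL(val))
			sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PSC);
	}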

arch/x86/coco/sev/shared.c

Lines changed: 9 additions & 7 deletions
@@ -498,7 +498,7 @@ static const struct snp_cpuid_table *snp_cpuid_get_table(void)
  *
  * Return: XSAVE area size on success, 0 otherwise.
  */
-static u32 snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted)
+static u32 __head snp_cpuid_calc_xsave_size(u64 xfeatures_en, bool compacted)
 {
 	const struct snp_cpuid_table *cpuid_table = snp_cpuid_get_table();
 	u64 xfeatures_found = 0;
@@ -576,8 +576,9 @@ static void snp_cpuid_hv(struct ghcb *ghcb, struct es_em_ctxt *ctxt, struct cpui
 		sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_CPUID_HV);
 }
 
-static int snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
-				 struct cpuid_leaf *leaf)
+static int __head
+snp_cpuid_postprocess(struct ghcb *ghcb, struct es_em_ctxt *ctxt,
+		      struct cpuid_leaf *leaf)
 {
 	struct cpuid_leaf leaf_hv = *leaf;
 
@@ -1253,7 +1254,7 @@ static void svsm_pval_terminate(struct svsm_pvalidate_call *pc, int ret, u64 svs
 	__pval_terminate(pfn, action, page_size, ret, svsm_ret);
 }
 
-static void svsm_pval_4k_page(unsigned long paddr, bool validate)
+static void __head svsm_pval_4k_page(unsigned long paddr, bool validate)
 {
 	struct svsm_pvalidate_call *pc;
 	struct svsm_call call = {};
@@ -1285,12 +1286,13 @@ static void svsm_pval_4k_page(unsigned long paddr, bool validate)
 
 	ret = svsm_perform_call_protocol(&call);
 	if (ret)
-		svsm_pval_terminate(pc, ret, call.rax_out);
+		sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
 
 	native_local_irq_restore(flags);
 }
 
-static void pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, bool validate)
+static void __head pvalidate_4k_page(unsigned long vaddr, unsigned long paddr,
+				     bool validate)
 {
 	int ret;
 
@@ -1303,7 +1305,7 @@ static void pvalidate_4k_page(unsigned long vaddr, unsigned long paddr, bool val
 	} else {
 		ret = pvalidate(vaddr, RMP_PG_SIZE_4K, validate);
 		if (ret)
-			__pval_terminate(PHYS_PFN(paddr), validate, RMP_PG_SIZE_4K, ret, 0);
+			sev_es_terminate(SEV_TERM_SET_LINUX, GHCB_TERM_PVALIDATE);
 	}
 }
 

arch/x86/include/asm/init.h

Lines changed: 1 addition & 1 deletion
@@ -2,7 +2,7 @@
 #ifndef _ASM_X86_INIT_H
 #define _ASM_X86_INIT_H
 
-#define __head	__section(".head.text")
+#define __head	__section(".head.text") __no_sanitize_undefined
 
 struct x86_mapping_info {
 	void *(*alloc_pgt_page)(void *); /* allocate buf for page table */
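
Folding __no_sanitize_undefined into __head disables UBSAN per function rather than per object file: anything placed in .head.text runs from the identity mapping before relocation, and instrumentation there would emit absolute references to the UBSAN handlers. A hedged illustration of how the macro is used (the function below is hypothetical):

	#include <asm/init.h>

	/*
	 * Hypothetical example: code tagged __head lands in .head.text and
	 * is now automatically excluded from UBSAN instrumentation, so it
	 * cannot pick up absolute references to the UBSAN runtime.
	 */
	static void __head early_fixup_example(unsigned long *entry,
					       unsigned long delta)
	{
		*entry += delta;	/* position-independent access only */
	}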

arch/x86/include/asm/kexec.h

Lines changed: 31 additions & 23 deletions
@@ -8,14 +8,9 @@
 # define PA_PGD			2
 # define PA_SWAP_PAGE		3
 # define PAGES_NR		4
-#else
-# define PA_CONTROL_PAGE	0
-# define VA_CONTROL_PAGE	1
-# define PA_TABLE_PAGE		2
-# define PA_SWAP_PAGE		3
-# define PAGES_NR		4
 #endif
 
+# define KEXEC_CONTROL_PAGE_SIZE	4096
 # define KEXEC_CONTROL_CODE_MAX_SIZE	2048
 
 #ifndef __ASSEMBLY__
@@ -43,7 +38,6 @@ struct kimage;
 /* Maximum address we can use for the control code buffer */
 # define KEXEC_CONTROL_MEMORY_LIMIT TASK_SIZE
 
-# define KEXEC_CONTROL_PAGE_SIZE	4096
 
 /* The native architecture */
 # define KEXEC_ARCH KEXEC_ARCH_386
@@ -58,11 +52,12 @@ struct kimage;
 /* Maximum address we can use for the control pages */
 # define KEXEC_CONTROL_MEMORY_LIMIT     (MAXMEM-1)
 
-/* Allocate one page for the pdp and the second for the code */
-# define KEXEC_CONTROL_PAGE_SIZE  (4096UL + 4096UL)
-
 /* The native architecture */
 # define KEXEC_ARCH KEXEC_ARCH_X86_64
+
+extern unsigned long kexec_va_control_page;
+extern unsigned long kexec_pa_table_page;
+extern unsigned long kexec_pa_swap_page;
 #endif
 
 /*
@@ -116,21 +111,21 @@ static inline void crash_setup_regs(struct pt_regs *newregs,
 }
 
 #ifdef CONFIG_X86_32
-asmlinkage unsigned long
-relocate_kernel(unsigned long indirection_page,
-		unsigned long control_page,
-		unsigned long start_address,
-		unsigned int has_pae,
-		unsigned int preserve_context);
+typedef asmlinkage unsigned long
+relocate_kernel_fn(unsigned long indirection_page,
+		   unsigned long control_page,
+		   unsigned long start_address,
+		   unsigned int has_pae,
+		   unsigned int preserve_context);
 #else
-unsigned long
-relocate_kernel(unsigned long indirection_page,
-		unsigned long page_list,
-		unsigned long start_address,
-		unsigned int preserve_context,
-		unsigned int host_mem_enc_active);
+typedef unsigned long
+relocate_kernel_fn(unsigned long indirection_page,
+		   unsigned long pa_control_page,
+		   unsigned long start_address,
+		   unsigned int preserve_context,
+		   unsigned int host_mem_enc_active);
 #endif
-
+extern relocate_kernel_fn relocate_kernel;
 #define ARCH_HAS_KIMAGE_ARCH
 
 #ifdef CONFIG_X86_32
@@ -145,6 +140,19 @@ struct kimage_arch {
 };
 #else
 struct kimage_arch {
+	/*
+	 * This is a kimage control page, as it must not overlap with either
+	 * source or destination address ranges.
+	 */
+	pgd_t *pgd;
+	/*
+	 * The virtual mapping of the control code page itself is used only
+	 * during the transition, while the current kernel's pages are all
+	 * in place. Thus the intermediate page table pages used to map it
+	 * are not control pages, but instead just normal pages obtained
+	 * with get_zeroed_page(). And have to be tracked (below) so that
+	 * they can be freed.
+	 */
 	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
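
Turning the prototype into the relocate_kernel_fn typedef lets callers treat relocate_kernel - and, more importantly, the copy of it placed in the kexec control page - as a correctly-typed function pointer. A hedged sketch of the x86_64 call pattern; the offset arithmetic and local names are illustrative, not a quote of the real machine_kexec():

	/*
	 * Illustrative sketch: call the copy of relocate_kernel() that was
	 * placed in the kexec control page. The copy is assumed to keep the
	 * function's offset within the copied region, measured from
	 * __relocate_kernel_start; names are for illustration only.
	 */
	relocate_kernel_fn *reloc;

	reloc = (relocate_kernel_fn *)(kexec_va_control_page +
			((unsigned long)relocate_kernel -
			 (unsigned long)__relocate_kernel_start));

	image->start = reloc(image->head, pa_control_page, image->start,
			     image->preserve_context, host_mem_enc_active);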

arch/x86/include/asm/sections.h

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@
 #include <asm-generic/sections.h>
 #include <asm/extable.h>
 
+extern char __relocate_kernel_start[], __relocate_kernel_end[];
 extern char __brk_base[], __brk_limit[];
 extern char __end_rodata_aligned[];

arch/x86/include/asm/setup.h

Lines changed: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ extern unsigned long saved_video_mode;
 
 extern void reserve_standard_io_resources(void);
 extern void i386_reserve_resources(void);
-extern unsigned long __startup_64(unsigned long physaddr, struct boot_params *bp);
+extern unsigned long __startup_64(unsigned long p2v_offset, struct boot_params *bp);
 extern void startup_64_setup_gdt_idt(void);
 extern void early_setup_idt(void);
 extern void __init do_early_exception(struct pt_regs *regs, int trapnr);

arch/x86/include/asm/sev-common.h

Lines changed: 1 addition & 0 deletions
@@ -207,6 +207,7 @@ struct snp_psc_desc {
 #define GHCB_TERM_SVSM_VMPL0		8	/* SVSM is present but has set VMPL to 0 */
 #define GHCB_TERM_SVSM_CAA		9	/* SVSM is present but CAA is not page aligned */
 #define GHCB_TERM_SECURE_TSC		10	/* Secure TSC initialization failed */
+#define GHCB_TERM_SVSM_CA_REMAP_FAIL	11	/* SVSM is present but CA could not be remapped */
 
 #define GHCB_RESP_CODE(v)		((v) & GHCB_MSR_INFO_MASK)

arch/x86/kernel/callthunks.c

Lines changed: 6 additions & 0 deletions
@@ -139,9 +139,15 @@ static bool skip_addr(void *dest)
 		return true;
 #endif
 #ifdef CONFIG_KEXEC_CORE
+# ifdef CONFIG_X86_64
+	if (dest >= (void *)__relocate_kernel_start &&
+	    dest < (void *)__relocate_kernel_end)
+		return true;
+# else
 	if (dest >= (void *)relocate_kernel &&
 	    dest < (void*)relocate_kernel + KEXEC_CONTROL_CODE_MAX_SIZE)
 		return true;
+# endif
 #endif
 	return false;
 }
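
On x86_64 the exclusion now keys off the __relocate_kernel_start/__relocate_kernel_end section bounds declared in sections.h above, since relocate_kernel is no longer assumed to sit at the start of a single page. The underlying idiom, sketched standalone (the helper is hypothetical; the symbols are the ones added by this series):

	#include <linux/types.h>

	extern char __relocate_kernel_start[], __relocate_kernel_end[];

	/*
	 * Hypothetical helper: linker-provided section bounds declared as
	 * char[] arrays compare naturally as addresses.
	 */
	static bool in_relocate_kernel(void *dest)
	{
		return dest >= (void *)__relocate_kernel_start &&
		       dest < (void *)__relocate_kernel_end;
	}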

arch/x86/kernel/head64.c

Lines changed: 24 additions & 16 deletions
@@ -91,9 +91,11 @@ static inline bool check_la57_support(void)
 	return true;
 }
 
-static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdval_t *pmd)
+static unsigned long __head sme_postprocess_startup(struct boot_params *bp,
+						    pmdval_t *pmd,
+						    unsigned long p2v_offset)
 {
-	unsigned long vaddr, vaddr_end;
+	unsigned long paddr, paddr_end;
 	int i;
 
 	/* Encrypt the kernel and related (if SME is active) */
@@ -106,10 +108,10 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv
 	 * attribute.
 	 */
 	if (sme_get_me_mask()) {
-		vaddr = (unsigned long)__start_bss_decrypted;
-		vaddr_end = (unsigned long)__end_bss_decrypted;
+		paddr = (unsigned long)&RIP_REL_REF(__start_bss_decrypted);
+		paddr_end = (unsigned long)&RIP_REL_REF(__end_bss_decrypted);
 
-		for (; vaddr < vaddr_end; vaddr += PMD_SIZE) {
+		for (; paddr < paddr_end; paddr += PMD_SIZE) {
 			/*
 			 * On SNP, transition the page to shared in the RMP table so that
 			 * it is consistent with the page table attribute change.
@@ -118,11 +120,11 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv
 			 * mapping (kernel .text). PVALIDATE, by way of
 			 * early_snp_set_memory_shared(), requires a valid virtual
 			 * address but the kernel is currently running off of the identity
-			 * mapping so use __pa() to get a *currently* valid virtual address.
+			 * mapping so use the PA to get a *currently* valid virtual address.
 			 */
-			early_snp_set_memory_shared(__pa(vaddr), __pa(vaddr), PTRS_PER_PMD);
+			early_snp_set_memory_shared(paddr, paddr, PTRS_PER_PMD);
 
-			i = pmd_index(vaddr);
+			i = pmd_index(paddr - p2v_offset);
 			pmd[i] -= sme_get_me_mask();
 		}
 	}
@@ -138,12 +140,15 @@ static unsigned long __head sme_postprocess_startup(struct boot_params *bp, pmdv
  * doesn't have to generate PC-relative relocations when accessing globals from
  * that function. Clang actually does not generate them, which leads to
  * boot-time crashes. To work around this problem, every global pointer must
- * be accessed using RIP_REL_REF().
+ * be accessed using RIP_REL_REF(). Kernel virtual addresses can be determined
+ * by subtracting p2v_offset from the RIP-relative address.
  */
-unsigned long __head __startup_64(unsigned long physaddr,
+unsigned long __head __startup_64(unsigned long p2v_offset,
 				  struct boot_params *bp)
 {
 	pmd_t (*early_pgts)[PTRS_PER_PMD] = RIP_REL_REF(early_dynamic_pgts);
+	unsigned long physaddr = (unsigned long)&RIP_REL_REF(_text);
+	unsigned long va_text, va_end;
 	unsigned long pgtable_flags;
 	unsigned long load_delta;
 	pgdval_t *pgd;
@@ -163,13 +168,16 @@ unsigned long __head __startup_64(unsigned long physaddr,
 	 * Compute the delta between the address I am compiled to run at
 	 * and the address I am actually running at.
 	 */
-	load_delta = physaddr - (unsigned long)(_text - __START_KERNEL_map);
+	load_delta = __START_KERNEL_map + p2v_offset;
 	RIP_REL_REF(phys_base) = load_delta;
 
 	/* Is the address not 2M aligned? */
 	if (load_delta & ~PMD_MASK)
 		for (;;);
 
+	va_text = physaddr - p2v_offset;
+	va_end = (unsigned long)&RIP_REL_REF(_end) - p2v_offset;
+
 	/* Include the SME encryption mask in the fixup value */
 	load_delta += sme_get_me_mask();
 
@@ -178,7 +186,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
 	pgd = &RIP_REL_REF(early_top_pgt)->pgd;
 	pgd[pgd_index(__START_KERNEL_map)] += load_delta;
 
-	if (la57) {
+	if (IS_ENABLED(CONFIG_X86_5LEVEL) && la57) {
 		p4d = (p4dval_t *)&RIP_REL_REF(level4_kernel_pgt);
 		p4d[MAX_PTRS_PER_P4D - 1] += load_delta;
 
@@ -230,7 +238,7 @@ unsigned long __head __startup_64(unsigned long physaddr,
 	pmd_entry += sme_get_me_mask();
 	pmd_entry += physaddr;
 
-	for (i = 0; i < DIV_ROUND_UP(_end - _text, PMD_SIZE); i++) {
+	for (i = 0; i < DIV_ROUND_UP(va_end - va_text, PMD_SIZE); i++) {
 		int idx = i + (physaddr >> PMD_SHIFT);
 
 		pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE;
@@ -255,19 +263,19 @@ unsigned long __head __startup_64(unsigned long physaddr,
 	pmd = &RIP_REL_REF(level2_kernel_pgt)->pmd;
 
 	/* invalidate pages before the kernel image */
-	for (i = 0; i < pmd_index((unsigned long)_text); i++)
+	for (i = 0; i < pmd_index(va_text); i++)
 		pmd[i] &= ~_PAGE_PRESENT;
 
 	/* fixup pages that are part of the kernel image */
-	for (; i <= pmd_index((unsigned long)_end); i++)
+	for (; i <= pmd_index(va_end); i++)
 		if (pmd[i] & _PAGE_PRESENT)
 			pmd[i] += load_delta;
 
 	/* invalidate pages after the kernel image */
 	for (; i < PTRS_PER_PMD; i++)
 		pmd[i] &= ~_PAGE_PRESENT;
 
-	return sme_postprocess_startup(bp, pmd);
+	return sme_postprocess_startup(bp, pmd, p2v_offset);
 }
 
 /* Wipe all early page tables except for the kernel symbol map */
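
The sign convention matters here: p2v_offset is physical minus virtual, so subtracting it from a RIP-relative (physical) address recovers the link-time virtual address, and load_delta falls out as __START_KERNEL_map + p2v_offset. A tiny standalone model of the arithmetic, with made-up addresses (values and names are illustrative only, not kernel code):

	#include <stdint.h>
	#include <stdio.h>

	/* Stand-in for the real __START_KERNEL_map. */
	#define START_KERNEL_MAP 0xffffffff80000000ULL

	int main(void)
	{
		uint64_t pa_text    = 0x10000000ULL;              /* made-up load address */
		uint64_t p2v_offset = pa_text - START_KERNEL_MAP; /* phys - virt, wraps mod 2^64 */
		uint64_t load_delta = START_KERNEL_MAP + p2v_offset;
		uint64_t va_text    = pa_text - p2v_offset;

		/*
		 * Prints 0x10000000 and 0xffffffff80000000: the physical load
		 * address and the recovered link-time VA, with no absolute
		 * reference to _text needed anywhere.
		 */
		printf("load_delta = %#llx\n", (unsigned long long)load_delta);
		printf("va_text    = %#llx\n", (unsigned long long)va_text);
		return 0;
	}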
