Skip to content

Commit 8f849ff

Browse files
KAGA-KOKObp3tk0v
authored and committed
x86/microcode: Handle "offline" CPUs correctly
Offline CPUs need to be parked in a safe loop when microcode update is in progress on the primary CPU. Currently, offline CPUs are parked in mwait_play_dead(), and for Intel CPUs, its not a safe instruction, because the MWAIT instruction can be patched in the new microcode update that can cause instability. - Add a new microcode state 'UCODE_OFFLINE' to report status on per-CPU basis. - Force NMI on the offline CPUs. Wake up offline CPUs while the update is in progress and then return them back to mwait_play_dead() after microcode update is complete. Signed-off-by: Thomas Gleixner <[email protected]> Signed-off-by: Borislav Petkov (AMD) <[email protected]> Link: https://lore.kernel.org/r/[email protected]
1 parent 9cab5fb commit 8f849ff

File tree

4 files changed

+113
-6
lines changed

4 files changed

+113
-6
lines changed

arch/x86/include/asm/microcode.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ static inline u32 intel_get_microcode_revision(void)
7373
#endif /* !CONFIG_CPU_SUP_INTEL */
7474

7575
bool microcode_nmi_handler(void);
76+
void microcode_offline_nmi_handler(void);
7677

7778
#ifdef CONFIG_MICROCODE_LATE_LOADING
7879
DECLARE_STATIC_KEY_FALSE(microcode_nmi_handler_enable);

arch/x86/kernel/cpu/microcode/core.c

Lines changed: 107 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -272,8 +272,9 @@ struct microcode_ctrl {
272272

273273
DEFINE_STATIC_KEY_FALSE(microcode_nmi_handler_enable);
274274
static DEFINE_PER_CPU(struct microcode_ctrl, ucode_ctrl);
275+
static atomic_t late_cpus_in, offline_in_nmi;
275276
static unsigned int loops_per_usec;
276-
static atomic_t late_cpus_in;
277+
static cpumask_t cpu_offline_mask;
277278

278279
static noinstr bool wait_for_cpus(atomic_t *cnt)
279280
{
@@ -381,7 +382,7 @@ static noinstr void load_secondary(unsigned int cpu)
381382
instrumentation_end();
382383
}
383384

384-
static void load_primary(unsigned int cpu)
385+
static void __load_primary(unsigned int cpu)
385386
{
386387
struct cpumask *secondaries = topology_sibling_cpumask(cpu);
387388
enum sibling_ctrl ctrl;
@@ -416,6 +417,67 @@ static void load_primary(unsigned int cpu)
416417
}
417418
}
418419

420+
static bool kick_offline_cpus(unsigned int nr_offl)
421+
{
422+
unsigned int cpu, timeout;
423+
424+
for_each_cpu(cpu, &cpu_offline_mask) {
425+
/* Enable the rendezvous handler and send NMI */
426+
per_cpu(ucode_ctrl.nmi_enabled, cpu) = true;
427+
apic_send_nmi_to_offline_cpu(cpu);
428+
}
429+
430+
/* Wait for them to arrive */
431+
for (timeout = 0; timeout < (USEC_PER_SEC / 2); timeout++) {
432+
if (atomic_read(&offline_in_nmi) == nr_offl)
433+
return true;
434+
udelay(1);
435+
}
436+
/* Let the others time out */
437+
return false;
438+
}
439+
440+
static void release_offline_cpus(void)
441+
{
442+
unsigned int cpu;
443+
444+
for_each_cpu(cpu, &cpu_offline_mask)
445+
per_cpu(ucode_ctrl.ctrl, cpu) = SCTRL_DONE;
446+
}
447+
448+
static void load_primary(unsigned int cpu)
449+
{
450+
unsigned int nr_offl = cpumask_weight(&cpu_offline_mask);
451+
bool proceed = true;
452+
453+
/* Kick soft-offlined SMT siblings if required */
454+
if (!cpu && nr_offl)
455+
proceed = kick_offline_cpus(nr_offl);
456+
457+
/* If the soft-offlined CPUs did not respond, abort */
458+
if (proceed)
459+
__load_primary(cpu);
460+
461+
/* Unconditionally release soft-offlined SMT siblings if required */
462+
if (!cpu && nr_offl)
463+
release_offline_cpus();
464+
}
465+
466+
/*
 * Minimal stub rendezvous handler for soft-offlined CPUs which participate
 * in the NMI rendezvous to protect against a concurrent NMI on affected
 * CPUs.
 *
 * noinstr: runs in NMI context on a CPU that is logically offline, so no
 * instrumentation and only raw per-CPU / raw atomic accessors are used.
 */
void noinstr microcode_offline_nmi_handler(void)
{
	/* Ignore NMIs which are not part of a microcode rendezvous */
	if (!raw_cpu_read(ucode_ctrl.nmi_enabled))
		return;
	/* One-shot: disarm before signalling arrival */
	raw_cpu_write(ucode_ctrl.nmi_enabled, false);
	/* Report per-CPU status so the control CPU can account for this CPU */
	raw_cpu_write(ucode_ctrl.result, UCODE_OFFLINE);
	/* Tell the control CPU this CPU has arrived in the rendezvous */
	raw_atomic_inc(&offline_in_nmi);
	/*
	 * Park here until released by the control CPU (release_offline_cpus()
	 * sets this CPU's ctrl to SCTRL_DONE).
	 */
	wait_for_ctrl();
}
480+
419481
static noinstr bool microcode_update_handler(void)
420482
{
421483
unsigned int cpu = raw_smp_processor_id();
@@ -472,13 +534,15 @@ static int load_cpus_stopped(void *unused)
472534
static int load_late_stop_cpus(void)
473535
{
474536
unsigned int cpu, updated = 0, failed = 0, timedout = 0, siblings = 0;
537+
unsigned int nr_offl, offline = 0;
475538
int old_rev = boot_cpu_data.microcode;
476539
struct cpuinfo_x86 prev_info;
477540

478541
pr_err("Attempting late microcode loading - it is dangerous and taints the kernel.\n");
479542
pr_err("You should switch to early loading, if possible.\n");
480543

481544
atomic_set(&late_cpus_in, num_online_cpus());
545+
atomic_set(&offline_in_nmi, 0);
482546
loops_per_usec = loops_per_jiffy / (TICK_NSEC / 1000);
483547

484548
/*
@@ -501,6 +565,7 @@ static int load_late_stop_cpus(void)
501565
case UCODE_UPDATED: updated++; break;
502566
case UCODE_TIMEOUT: timedout++; break;
503567
case UCODE_OK: siblings++; break;
568+
case UCODE_OFFLINE: offline++; break;
504569
default: failed++; break;
505570
}
506571
}
@@ -512,6 +577,13 @@ static int load_late_stop_cpus(void)
512577
/* Nothing changed. */
513578
if (!failed && !timedout)
514579
return 0;
580+
581+
nr_offl = cpumask_weight(&cpu_offline_mask);
582+
if (offline < nr_offl) {
583+
pr_warn("%u offline siblings did not respond.\n",
584+
nr_offl - atomic_read(&offline_in_nmi));
585+
return -EIO;
586+
}
515587
pr_err("update failed: %u CPUs failed %u CPUs timed out\n",
516588
failed, timedout);
517589
return -EIO;
@@ -545,19 +617,49 @@ static int load_late_stop_cpus(void)
545617
* modern CPUs uses MWAIT, which is also not guaranteed to be safe
546618
* against a microcode update which affects MWAIT.
547619
*
548-
* 2) Initialize the per CPU control structure
620+
* As soft-offlined CPUs still react on NMIs, the SMT sibling
621+
* restriction can be lifted when the vendor driver signals to use NMI
622+
* for rendezvous and the APIC provides a mechanism to send an NMI to a
623+
* soft-offlined CPU. The soft-offlined CPUs are then able to
624+
* participate in the rendezvous in a trivial stub handler.
625+
*
626+
* 2) Initialize the per CPU control structure and create a cpumask
627+
* which contains "offline"; secondary threads, so they can be handled
628+
* correctly by a control CPU.
549629
*/
550630
static bool setup_cpus(void)
551631
{
552632
struct microcode_ctrl ctrl = { .ctrl = SCTRL_WAIT, .result = -1, };
633+
bool allow_smt_offline;
553634
unsigned int cpu;
554635

636+
allow_smt_offline = microcode_ops->nmi_safe ||
637+
(microcode_ops->use_nmi && apic->nmi_to_offline_cpu);
638+
639+
cpumask_clear(&cpu_offline_mask);
640+
555641
for_each_cpu_and(cpu, cpu_present_mask, &cpus_booted_once_mask) {
642+
/*
643+
* Offline CPUs sit in one of the play_dead() functions
644+
* with interrupts disabled, but they still react on NMIs
645+
* and execute arbitrary code. Also MWAIT being updated
646+
* while the offline CPU sits there is not necessarily safe
647+
* on all CPU variants.
648+
*
649+
* Mark them in the offline_cpus mask which will be handled
650+
* by CPU0 later in the update process.
651+
*
652+
* Ensure that the primary thread is online so that it is
653+
* guaranteed that all cores are updated.
654+
*/
556655
if (!cpu_online(cpu)) {
557-
if (topology_is_primary_thread(cpu) || !microcode_ops->nmi_safe) {
558-
pr_err("CPU %u not online\n", cpu);
656+
if (topology_is_primary_thread(cpu) || !allow_smt_offline) {
657+
pr_err("CPU %u not online, loading aborted\n", cpu);
559658
return false;
560659
}
660+
cpumask_set_cpu(cpu, &cpu_offline_mask);
661+
per_cpu(ucode_ctrl, cpu) = ctrl;
662+
continue;
561663
}
562664

563665
/*

arch/x86/kernel/cpu/microcode/internal.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ enum ucode_state {
1717
UCODE_NFOUND,
1818
UCODE_ERROR,
1919
UCODE_TIMEOUT,
20+
UCODE_OFFLINE,
2021
};
2122

2223
struct microcode_ops {

arch/x86/kernel/nmi.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -502,8 +502,11 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
502502
if (IS_ENABLED(CONFIG_NMI_CHECK_CPU))
503503
raw_atomic_long_inc(&nsp->idt_calls);
504504

505-
if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id()))
505+
if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id())) {
506+
if (microcode_nmi_handler_enabled())
507+
microcode_offline_nmi_handler();
506508
return;
509+
}
507510

508511
if (this_cpu_read(nmi_state) != NMI_NOT_RUNNING) {
509512
this_cpu_write(nmi_state, NMI_LATCHED);

0 commit comments

Comments
 (0)