Skip to content

Commit 1a3ea61

Browse files
committed
x86/nmi: Accumulate NMI-progress evidence in exc_nmi()
CPUs ignoring NMIs is often a sign of those CPUs going bad, but there are quite a few other reasons why a CPU might ignore NMIs. Therefore, accumulate evidence within exc_nmi() as to what might be preventing a given CPU from responding to an NMI. [ paulmck: Apply Peter Zijlstra feedback. ] Signed-off-by: Paul E. McKenney <[email protected]> Cc: Thomas Gleixner <[email protected]> Cc: Ingo Molnar <[email protected]> Cc: Borislav Petkov <[email protected]> Cc: Dave Hansen <[email protected]> Cc: "H. Peter Anvin" <[email protected]> Cc: <[email protected]> Reviewed-by: Ingo Molnar <[email protected]>
1 parent 1b929c0 commit 1a3ea61

File tree

2 files changed

+45
-1
lines changed

2 files changed

+45
-1
lines changed

arch/x86/kernel/nmi.c

Lines changed: 34 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,15 @@ struct nmi_stats {
6969
unsigned int unknown;
7070
unsigned int external;
7171
unsigned int swallow;
72+
unsigned long recv_jiffies;
73+
unsigned long idt_seq;
74+
unsigned long idt_nmi_seq;
75+
unsigned long idt_ignored;
76+
atomic_long_t idt_calls;
77+
unsigned long idt_seq_snap;
78+
unsigned long idt_nmi_seq_snap;
79+
unsigned long idt_ignored_snap;
80+
long idt_calls_snap;
7281
};
7382

7483
static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);
@@ -479,12 +488,15 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7);
479488
DEFINE_IDTENTRY_RAW(exc_nmi)
480489
{
481490
irqentry_state_t irq_state;
491+
struct nmi_stats *nsp = this_cpu_ptr(&nmi_stats);
482492

483493
/*
484494
* Re-enable NMIs right here when running as an SEV-ES guest. This might
485495
* cause nested NMIs, but those can be handled safely.
486496
*/
487497
sev_es_nmi_complete();
498+
if (IS_ENABLED(CONFIG_NMI_CHECK_CPU))
499+
arch_atomic_long_inc(&nsp->idt_calls);
488500

489501
if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id()))
490502
return;
@@ -495,6 +507,11 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
495507
}
496508
this_cpu_write(nmi_state, NMI_EXECUTING);
497509
this_cpu_write(nmi_cr2, read_cr2());
510+
if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
511+
WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1);
512+
WARN_ON_ONCE(!(nsp->idt_seq & 0x1));
513+
WRITE_ONCE(nsp->recv_jiffies, jiffies);
514+
}
498515
nmi_restart:
499516

500517
/*
@@ -509,8 +526,19 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
509526

510527
inc_irq_stat(__nmi_count);
511528

512-
if (!ignore_nmis)
529+
if (IS_ENABLED(CONFIG_NMI_CHECK_CPU) && ignore_nmis) {
530+
WRITE_ONCE(nsp->idt_ignored, nsp->idt_ignored + 1);
531+
} else if (!ignore_nmis) {
532+
if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
533+
WRITE_ONCE(nsp->idt_nmi_seq, nsp->idt_nmi_seq + 1);
534+
WARN_ON_ONCE(!(nsp->idt_nmi_seq & 0x1));
535+
}
513536
default_do_nmi(regs);
537+
if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
538+
WRITE_ONCE(nsp->idt_nmi_seq, nsp->idt_nmi_seq + 1);
539+
WARN_ON_ONCE(nsp->idt_nmi_seq & 0x1);
540+
}
541+
}
514542

515543
irqentry_nmi_exit(regs, irq_state);
516544

@@ -525,6 +553,11 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
525553

526554
if (user_mode(regs))
527555
mds_user_clear_cpu_buffers();
556+
if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
557+
WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1);
558+
WARN_ON_ONCE(nsp->idt_seq & 0x1);
559+
WRITE_ONCE(nsp->recv_jiffies, jiffies);
560+
}
528561
}
529562

530563
#if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)

lib/Kconfig.debug

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1552,6 +1552,17 @@ config TRACE_IRQFLAGS_NMI
15521552
depends on TRACE_IRQFLAGS
15531553
depends on TRACE_IRQFLAGS_NMI_SUPPORT
15541554

1555+
config NMI_CHECK_CPU
1556+
bool "Debugging for CPUs failing to respond to backtrace requests"
1557+
depends on DEBUG_KERNEL
1558+
depends on X86
1559+
default n
1560+
help
1561+
Enables debug prints when a CPU fails to respond to a given
1562+
backtrace NMI. These prints provide some reasons why a CPU
1563+
might legitimately be failing to respond, for example, if it
1564+
is offline or if ignore_nmis is set.
1565+
15551566
config DEBUG_IRQFLAGS
15561567
bool "Debug IRQ flag manipulation"
15571568
help

0 commit comments

Comments
 (0)