@@ -69,6 +69,15 @@ struct nmi_stats {
 	unsigned int unknown;
 	unsigned int external;
 	unsigned int swallow;
+	unsigned long recv_jiffies;
+	unsigned long idt_seq;
+	unsigned long idt_nmi_seq;
+	unsigned long idt_ignored;
+	atomic_long_t idt_calls;
+	unsigned long idt_seq_snap;
+	unsigned long idt_nmi_seq_snap;
+	unsigned long idt_ignored_snap;
+	long idt_calls_snap;
 };
 
 static DEFINE_PER_CPU(struct nmi_stats, nmi_stats);
@@ -479,12 +488,15 @@ static DEFINE_PER_CPU(unsigned long, nmi_dr7);
 DEFINE_IDTENTRY_RAW(exc_nmi)
 {
 	irqentry_state_t irq_state;
+	struct nmi_stats *nsp = this_cpu_ptr(&nmi_stats);
 
 	/*
 	 * Re-enable NMIs right here when running as an SEV-ES guest. This might
 	 * cause nested NMIs, but those can be handled safely.
 	 */
 	sev_es_nmi_complete();
+	if (IS_ENABLED(CONFIG_NMI_CHECK_CPU))
+		arch_atomic_long_inc(&nsp->idt_calls);
 
 	if (IS_ENABLED(CONFIG_SMP) && arch_cpu_is_offline(smp_processor_id()))
 		return;
@@ -495,6 +507,11 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
 	}
 	this_cpu_write(nmi_state, NMI_EXECUTING);
 	this_cpu_write(nmi_cr2, read_cr2());
+	if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
+		WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1);
+		WARN_ON_ONCE(!(nsp->idt_seq & 0x1));
+		WRITE_ONCE(nsp->recv_jiffies, jiffies);
+	}
 nmi_restart:
 
 	/*
@@ -509,8 +526,19 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
 
 	inc_irq_stat(__nmi_count);
 
-	if (!ignore_nmis)
+	if (IS_ENABLED(CONFIG_NMI_CHECK_CPU) && ignore_nmis) {
+		WRITE_ONCE(nsp->idt_ignored, nsp->idt_ignored + 1);
+	} else if (!ignore_nmis) {
+		if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
+			WRITE_ONCE(nsp->idt_nmi_seq, nsp->idt_nmi_seq + 1);
+			WARN_ON_ONCE(!(nsp->idt_nmi_seq & 0x1));
+		}
 		default_do_nmi(regs);
+		if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
+			WRITE_ONCE(nsp->idt_nmi_seq, nsp->idt_nmi_seq + 1);
+			WARN_ON_ONCE(nsp->idt_nmi_seq & 0x1);
+		}
+	}
 
 	irqentry_nmi_exit(regs, irq_state);
 
@@ -525,6 +553,11 @@ DEFINE_IDTENTRY_RAW(exc_nmi)
 
 	if (user_mode(regs))
 		mds_user_clear_cpu_buffers();
+	if (IS_ENABLED(CONFIG_NMI_CHECK_CPU)) {
+		WRITE_ONCE(nsp->idt_seq, nsp->idt_seq + 1);
+		WARN_ON_ONCE(nsp->idt_seq & 0x1);
+		WRITE_ONCE(nsp->recv_jiffies, jiffies);
+	}
 }
 
 #if defined(CONFIG_X86_64) && IS_ENABLED(CONFIG_KVM_INTEL)
@@ -537,6 +570,79 @@ DEFINE_IDTENTRY_RAW(exc_nmi_noist)
 EXPORT_SYMBOL_GPL(asm_exc_nmi_noist);
 #endif
 
+#ifdef CONFIG_NMI_CHECK_CPU
+
+static char *nmi_check_stall_msg[] = {
+/*                                                                      */
+/* +--------- nsp->idt_seq_snap & 0x1: CPU is in NMI handler.           */
+/* | +------ cpu_is_offline(cpu)                                        */
+/* | | +--- nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls):   */
+/* | | |       NMI handler has been invoked.                            */
+/* | | |                                                                */
+/* V V V                                                                */
+/* 0 0 0 */ "NMIs are not reaching exc_nmi() handler",
+/* 0 0 1 */ "exc_nmi() handler is ignoring NMIs",
+/* 0 1 0 */ "CPU is offline and NMIs are not reaching exc_nmi() handler",
+/* 0 1 1 */ "CPU is offline and exc_nmi() handler is legitimately ignoring NMIs",
+/* 1 0 0 */ "CPU is in exc_nmi() handler and no further NMIs are reaching handler",
+/* 1 0 1 */ "CPU is in exc_nmi() handler which is legitimately ignoring NMIs",
+/* 1 1 0 */ "CPU is offline in exc_nmi() handler and no more NMIs are reaching exc_nmi() handler",
+/* 1 1 1 */ "CPU is offline in exc_nmi() handler which is legitimately ignoring NMIs",
+};
+
+void nmi_backtrace_stall_snap(const struct cpumask *btp)
+{
+	int cpu;
+	struct nmi_stats *nsp;
+
+	for_each_cpu(cpu, btp) {
+		nsp = per_cpu_ptr(&nmi_stats, cpu);
+		nsp->idt_seq_snap = READ_ONCE(nsp->idt_seq);
+		nsp->idt_nmi_seq_snap = READ_ONCE(nsp->idt_nmi_seq);
+		nsp->idt_ignored_snap = READ_ONCE(nsp->idt_ignored);
+		nsp->idt_calls_snap = atomic_long_read(&nsp->idt_calls);
+	}
+}
+
+void nmi_backtrace_stall_check(const struct cpumask *btp)
+{
+	int cpu;
+	int idx;
+	unsigned long nmi_seq;
+	unsigned long j = jiffies;
+	char *modp;
+	char *msgp;
+	char *msghp;
+	struct nmi_stats *nsp;
+
+	for_each_cpu(cpu, btp) {
+		nsp = per_cpu_ptr(&nmi_stats, cpu);
+		modp = "";
+		msghp = "";
+		nmi_seq = READ_ONCE(nsp->idt_nmi_seq);
+		if (nsp->idt_nmi_seq_snap + 1 == nmi_seq && (nmi_seq & 0x1)) {
+			msgp = "CPU entered NMI handler function, but has not exited";
+		} else if ((nsp->idt_nmi_seq_snap & 0x1) != (nmi_seq & 0x1)) {
+			msgp = "CPU is handling NMIs";
+		} else {
+			idx = ((nsp->idt_seq_snap & 0x1) << 2) |
+			      (cpu_is_offline(cpu) << 1) |
+			      (nsp->idt_calls_snap != atomic_long_read(&nsp->idt_calls));
+			msgp = nmi_check_stall_msg[idx];
+			if (nsp->idt_ignored_snap != READ_ONCE(nsp->idt_ignored) && (idx & 0x1))
+				modp = ", but OK because ignore_nmis was set";
+			if (nmi_seq & ~0x1)
+				msghp = " (CPU currently in NMI handler function)";
+			else if (nsp->idt_nmi_seq_snap + 1 == nmi_seq)
+				msghp = " (CPU exited one NMI handler function)";
+		}
+		pr_alert("%s: CPU %d: %s%s%s, last activity: %lu jiffies ago.\n",
+			 __func__, cpu, msgp, modp, msghp, j - READ_ONCE(nsp->recv_jiffies));
+	}
+}
+
+#endif
+
 void stop_nmi(void)
 {
 	ignore_nmis++;