Skip to content

Commit 0dc9c63

Browse files
stellarhopper authored and KAGA-KOKO committed
x86/mce: Make the MCE notifier a blocking one
The NFIT MCE handler callback (for handling media errors on NVDIMMs)
takes a mutex to add the location of a memory error to a list. But since
the notifier call chain for machine checks (x86_mce_decoder_chain) is
atomic, we get a lockdep splat like:

  BUG: sleeping function called from invalid context at kernel/locking/mutex.c:620
  in_atomic(): 1, irqs_disabled(): 0, pid: 4, name: kworker/0:0
  [..]
  Call Trace:
   dump_stack
   ___might_sleep
   __might_sleep
   mutex_lock_nested
   ? __lock_acquire
   nfit_handle_mce
   notifier_call_chain
   atomic_notifier_call_chain
   ? atomic_notifier_call_chain
   mce_gen_pool_process

Convert the notifier to a blocking one which gets to run only in process
context.

Boris: remove the notifier call in atomic context in print_mce(). For
now, let's print the MCE on the atomic path so that we can make sure
they go out and get logged at least.

Fixes: 6839a6d ("nfit: do an ARS scrub on hitting a latent media error")
Reported-by: Ross Zwisler <[email protected]>
Signed-off-by: Vishal Verma <[email protected]>
Acked-by: Tony Luck <[email protected]>
Cc: Dan Williams <[email protected]>
Cc: linux-edac <[email protected]>
Cc: x86-ml <[email protected]>
Cc: <[email protected]>
Link: http://lkml.kernel.org/r/[email protected]
Signed-off-by: Borislav Petkov <[email protected]>
Signed-off-by: Thomas Gleixner <[email protected]>
1 parent 4f7d029 commit 0dc9c63

File tree

3 files changed

+5
-16
lines changed

3 files changed

+5
-16
lines changed

arch/x86/kernel/cpu/mcheck/mce-genpool.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -85,7 +85,7 @@ void mce_gen_pool_process(struct work_struct *__unused)
8585
head = llist_reverse_order(head);
8686
llist_for_each_entry_safe(node, tmp, head, llnode) {
8787
mce = &node->mce;
88-
atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
88+
blocking_notifier_call_chain(&x86_mce_decoder_chain, 0, mce);
8989
gen_pool_free(mce_evt_pool, (unsigned long)node, sizeof(*node));
9090
}
9191
}

arch/x86/kernel/cpu/mcheck/mce-internal.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@ enum severity_level {
1313
MCE_PANIC_SEVERITY,
1414
};
1515

16-
extern struct atomic_notifier_head x86_mce_decoder_chain;
16+
extern struct blocking_notifier_head x86_mce_decoder_chain;
1717

1818
#define ATTR_LEN 16
1919
#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */

arch/x86/kernel/cpu/mcheck/mce.c

Lines changed: 3 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -123,7 +123,7 @@ static void (*quirk_no_way_out)(int bank, struct mce *m, struct pt_regs *regs);
123123
* CPU/chipset specific EDAC code can register a notifier call here to print
124124
* MCE errors in a human-readable form.
125125
*/
126-
ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
126+
BLOCKING_NOTIFIER_HEAD(x86_mce_decoder_chain);
127127

128128
/* Do initial initialization of a struct mce */
129129
void mce_setup(struct mce *m)
@@ -220,15 +220,15 @@ void mce_register_decode_chain(struct notifier_block *nb)
220220

221221
WARN_ON(nb->priority > MCE_PRIO_LOWEST && nb->priority < MCE_PRIO_EDAC);
222222

223-
atomic_notifier_chain_register(&x86_mce_decoder_chain, nb);
223+
blocking_notifier_chain_register(&x86_mce_decoder_chain, nb);
224224
}
225225
EXPORT_SYMBOL_GPL(mce_register_decode_chain);
226226

227227
void mce_unregister_decode_chain(struct notifier_block *nb)
228228
{
229229
atomic_dec(&num_notifiers);
230230

231-
atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
231+
blocking_notifier_chain_unregister(&x86_mce_decoder_chain, nb);
232232
}
233233
EXPORT_SYMBOL_GPL(mce_unregister_decode_chain);
234234

@@ -321,18 +321,7 @@ static void __print_mce(struct mce *m)
321321

322322
static void print_mce(struct mce *m)
323323
{
324-
int ret = 0;
325-
326324
__print_mce(m);
327-
328-
/*
329-
* Print out human-readable details about the MCE error,
330-
* (if the CPU has an implementation for that)
331-
*/
332-
ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
333-
if (ret == NOTIFY_STOP)
334-
return;
335-
336325
pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
337326
}
338327

0 commit comments

Comments
 (0)