Skip to content

Commit d8ecca4

Browse files
aegl authored and suryasaimadhu committed
x86/mce/dev-mcelog: Dynamically allocate space for machine check records
We have had a hard-coded limit of 32 machine check records since the dawn of time. But as the number of cores increases, it is possible for more than 32 errors to be reported before a user process reads from /dev/mcelog. In this case the additional errors are lost.

Keep 32 as the minimum. But tune the maximum value up based on the number of processors.

Signed-off-by: Tony Luck <[email protected]>
Signed-off-by: Borislav Petkov <[email protected]>
Link: https://lkml.kernel.org/r/[email protected]
1 parent 2976908 commit d8ecca4

File tree

2 files changed

+30
-23
lines changed

2 files changed

+30
-23
lines changed

arch/x86/include/asm/mce.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,7 @@
102102

103103
#define MCE_OVERFLOW 0 /* bit 0 in flags means overflow */
104104

105-
#define MCE_LOG_LEN 32
105+
#define MCE_LOG_MIN_LEN 32U
106106
#define MCE_LOG_SIGNATURE "MACHINECHECK"
107107

108108
/* AMD Scalable MCA */
@@ -135,11 +135,11 @@
135135
*/
136136
struct mce_log_buffer {
137137
char signature[12]; /* "MACHINECHECK" */
138-
unsigned len; /* = MCE_LOG_LEN */
138+
unsigned len; /* = number of elements in .entry[] */
139139
unsigned next;
140140
unsigned flags;
141141
unsigned recordlen; /* length of struct mce */
142-
struct mce entry[MCE_LOG_LEN];
142+
struct mce entry[];
143143
};
144144

145145
enum mce_notifier_prios {

arch/x86/kernel/cpu/mce/dev-mcelog.c

Lines changed: 27 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,7 @@ static char *mce_helper_argv[2] = { mce_helper, NULL };
2929
* separate MCEs from kernel messages to avoid bogus bug reports.
3030
*/
3131

32-
static struct mce_log_buffer mcelog = {
33-
.signature = MCE_LOG_SIGNATURE,
34-
.len = MCE_LOG_LEN,
35-
.recordlen = sizeof(struct mce),
36-
};
32+
static struct mce_log_buffer *mcelog;
3733

3834
static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
3935

@@ -45,21 +41,21 @@ static int dev_mce_log(struct notifier_block *nb, unsigned long val,
4541

4642
mutex_lock(&mce_chrdev_read_mutex);
4743

48-
entry = mcelog.next;
44+
entry = mcelog->next;
4945

5046
/*
5147
* When the buffer fills up discard new entries. Assume that the
5248
* earlier errors are the more interesting ones:
5349
*/
54-
if (entry >= MCE_LOG_LEN) {
55-
set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags);
50+
if (entry >= mcelog->len) {
51+
set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog->flags);
5652
goto unlock;
5753
}
5854

59-
mcelog.next = entry + 1;
55+
mcelog->next = entry + 1;
6056

61-
memcpy(mcelog.entry + entry, mce, sizeof(struct mce));
62-
mcelog.entry[entry].finished = 1;
57+
memcpy(mcelog->entry + entry, mce, sizeof(struct mce));
58+
mcelog->entry[entry].finished = 1;
6359

6460
/* wake processes polling /dev/mcelog */
6561
wake_up_interruptible(&mce_chrdev_wait);
@@ -214,21 +210,21 @@ static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
214210

215211
/* Only supports full reads right now */
216212
err = -EINVAL;
217-
if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
213+
if (*off != 0 || usize < mcelog->len * sizeof(struct mce))
218214
goto out;
219215

220-
next = mcelog.next;
216+
next = mcelog->next;
221217
err = 0;
222218

223219
for (i = 0; i < next; i++) {
224-
struct mce *m = &mcelog.entry[i];
220+
struct mce *m = &mcelog->entry[i];
225221

226222
err |= copy_to_user(buf, m, sizeof(*m));
227223
buf += sizeof(*m);
228224
}
229225

230-
memset(mcelog.entry, 0, next * sizeof(struct mce));
231-
mcelog.next = 0;
226+
memset(mcelog->entry, 0, next * sizeof(struct mce));
227+
mcelog->next = 0;
232228

233229
if (err)
234230
err = -EFAULT;
@@ -242,7 +238,7 @@ static ssize_t mce_chrdev_read(struct file *filp, char __user *ubuf,
242238
static __poll_t mce_chrdev_poll(struct file *file, poll_table *wait)
243239
{
244240
poll_wait(file, &mce_chrdev_wait, wait);
245-
if (READ_ONCE(mcelog.next))
241+
if (READ_ONCE(mcelog->next))
246242
return EPOLLIN | EPOLLRDNORM;
247243
if (!mce_apei_read_done && apei_check_mce())
248244
return EPOLLIN | EPOLLRDNORM;
@@ -261,13 +257,13 @@ static long mce_chrdev_ioctl(struct file *f, unsigned int cmd,
261257
case MCE_GET_RECORD_LEN:
262258
return put_user(sizeof(struct mce), p);
263259
case MCE_GET_LOG_LEN:
264-
return put_user(MCE_LOG_LEN, p);
260+
return put_user(mcelog->len, p);
265261
case MCE_GETCLEAR_FLAGS: {
266262
unsigned flags;
267263

268264
do {
269-
flags = mcelog.flags;
270-
} while (cmpxchg(&mcelog.flags, flags, 0) != flags);
265+
flags = mcelog->flags;
266+
} while (cmpxchg(&mcelog->flags, flags, 0) != flags);
271267

272268
return put_user(flags, p);
273269
}
@@ -339,8 +335,18 @@ static struct miscdevice mce_chrdev_device = {
339335

340336
static __init int dev_mcelog_init_device(void)
341337
{
338+
int mce_log_len;
342339
int err;
343340

341+
mce_log_len = max(MCE_LOG_MIN_LEN, num_online_cpus());
342+
mcelog = kzalloc(sizeof(*mcelog) + mce_log_len * sizeof(struct mce), GFP_KERNEL);
343+
if (!mcelog)
344+
return -ENOMEM;
345+
346+
strncpy(mcelog->signature, MCE_LOG_SIGNATURE, sizeof(mcelog->signature));
347+
mcelog->len = mce_log_len;
348+
mcelog->recordlen = sizeof(struct mce);
349+
344350
/* register character device /dev/mcelog */
345351
err = misc_register(&mce_chrdev_device);
346352
if (err) {
@@ -350,6 +356,7 @@ static __init int dev_mcelog_init_device(void)
350356
else
351357
pr_err("Unable to init device /dev/mcelog (rc: %d)\n", err);
352358

359+
kfree(mcelog);
353360
return err;
354361
}
355362

0 commit comments

Comments
 (0)