Skip to content

Commit ccb5ecd

Browse files
Xiaofei Tan authored and rafaeljw committed
ACPI: APEI: fix synchronous external aborts in user-mode
Before commit 8fcc4ae ("arm64: acpi: Make apei_claim_sea() synchronise with APEI's irq work"), do_sea() would unconditionally signal the affected task from the arch code. Since that change, the GHES driver sends the signals.

This exposes a problem: errors that the GHES driver doesn't understand or doesn't handle effectively are silently ignored. This causes the errors to be taken again and to circulate endlessly, leaving the user-space task stuck in this loop.

Existing firmware on Kunpeng9xx systems reports cache errors using 'ARM Processor Error' CPER records. Do memory failure handling for the ARM Processor Error Section just as is done for the Memory Error Section.

Fixes: 8fcc4ae ("arm64: acpi: Make apei_claim_sea() synchronise with APEI's irq work")
Signed-off-by: Xiaofei Tan <[email protected]>
Reviewed-by: James Morse <[email protected]>
[ rjw: Subject edit ]
Signed-off-by: Rafael J. Wysocki <[email protected]>
1 parent b7a732a commit ccb5ecd

File tree

1 file changed

+64
-17
lines changed

1 file changed

+64
-17
lines changed

drivers/acpi/apei/ghes.c

Lines changed: 64 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -441,43 +441,92 @@ static void ghes_kick_task_work(struct callback_head *head)
441441
gen_pool_free(ghes_estatus_pool, (unsigned long)estatus_node, node_len);
442442
}
443443

444-
static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
445-
int sev)
444+
static bool ghes_do_memory_failure(u64 physical_addr, int flags)
446445
{
447446
unsigned long pfn;
448-
int flags = -1;
449-
int sec_sev = ghes_severity(gdata->error_severity);
450-
struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
451447

452448
if (!IS_ENABLED(CONFIG_ACPI_APEI_MEMORY_FAILURE))
453449
return false;
454450

455-
if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
456-
return false;
457-
458-
pfn = mem_err->physical_addr >> PAGE_SHIFT;
451+
pfn = PHYS_PFN(physical_addr);
459452
if (!pfn_valid(pfn)) {
460453
pr_warn_ratelimited(FW_WARN GHES_PFX
461454
"Invalid address in generic error data: %#llx\n",
462-
mem_err->physical_addr);
455+
physical_addr);
463456
return false;
464457
}
465458

459+
memory_failure_queue(pfn, flags);
460+
return true;
461+
}
462+
463+
static bool ghes_handle_memory_failure(struct acpi_hest_generic_data *gdata,
464+
int sev)
465+
{
466+
int flags = -1;
467+
int sec_sev = ghes_severity(gdata->error_severity);
468+
struct cper_sec_mem_err *mem_err = acpi_hest_get_payload(gdata);
469+
470+
if (!(mem_err->validation_bits & CPER_MEM_VALID_PA))
471+
return false;
472+
466473
/* iff following two events can be handled properly by now */
467474
if (sec_sev == GHES_SEV_CORRECTED &&
468475
(gdata->flags & CPER_SEC_ERROR_THRESHOLD_EXCEEDED))
469476
flags = MF_SOFT_OFFLINE;
470477
if (sev == GHES_SEV_RECOVERABLE && sec_sev == GHES_SEV_RECOVERABLE)
471478
flags = 0;
472479

473-
if (flags != -1) {
474-
memory_failure_queue(pfn, flags);
475-
return true;
476-
}
480+
if (flags != -1)
481+
return ghes_do_memory_failure(mem_err->physical_addr, flags);
477482

478483
return false;
479484
}
480485

486+
static bool ghes_handle_arm_hw_error(struct acpi_hest_generic_data *gdata, int sev)
487+
{
488+
struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
489+
bool queued = false;
490+
int sec_sev, i;
491+
char *p;
492+
493+
log_arm_hw_error(err);
494+
495+
sec_sev = ghes_severity(gdata->error_severity);
496+
if (sev != GHES_SEV_RECOVERABLE || sec_sev != GHES_SEV_RECOVERABLE)
497+
return false;
498+
499+
p = (char *)(err + 1);
500+
for (i = 0; i < err->err_info_num; i++) {
501+
struct cper_arm_err_info *err_info = (struct cper_arm_err_info *)p;
502+
bool is_cache = (err_info->type == CPER_ARM_CACHE_ERROR);
503+
bool has_pa = (err_info->validation_bits & CPER_ARM_INFO_VALID_PHYSICAL_ADDR);
504+
const char *error_type = "unknown error";
505+
506+
/*
507+
* The field (err_info->error_info & BIT(26)) is fixed to set to
508+
* 1 in some old firmware of HiSilicon Kunpeng920. We assume that
509+
* firmware won't mix corrected errors in an uncorrected section,
510+
* and don't filter out 'corrected' error here.
511+
*/
512+
if (is_cache && has_pa) {
513+
queued = ghes_do_memory_failure(err_info->physical_fault_addr, 0);
514+
p += err_info->length;
515+
continue;
516+
}
517+
518+
if (err_info->type < ARRAY_SIZE(cper_proc_error_type_strs))
519+
error_type = cper_proc_error_type_strs[err_info->type];
520+
521+
pr_warn_ratelimited(FW_WARN GHES_PFX
522+
"Unhandled processor error type: %s\n",
523+
error_type);
524+
p += err_info->length;
525+
}
526+
527+
return queued;
528+
}
529+
481530
/*
482531
* PCIe AER errors need to be sent to the AER driver for reporting and
483532
* recovery. The GHES severities map to the following AER severities and
@@ -605,9 +654,7 @@ static bool ghes_do_proc(struct ghes *ghes,
605654
ghes_handle_aer(gdata);
606655
}
607656
else if (guid_equal(sec_type, &CPER_SEC_PROC_ARM)) {
608-
struct cper_sec_proc_arm *err = acpi_hest_get_payload(gdata);
609-
610-
log_arm_hw_error(err);
657+
queued = ghes_handle_arm_hw_error(gdata, sev);
611658
} else {
612659
void *err = acpi_hest_get_payload(gdata);
613660

0 commit comments

Comments
 (0)