Skip to content

Commit 338f741

Browse files
Xiang Liu authored and alexdeucher committed
drm/amdgpu: Decode deferred error type in gfx aca bank parser
When injecting an uncorrected error with a background workload, the deferred errors among the uncorrected errors need to be identified by checking the deferred and poison bits of the status register. v2: refine checking for deferred error v2: log possible DEs among CEs v2: generate CPER records for DEs among UEs Signed-off-by: Xiang Liu <[email protected]> Reviewed-by: Hawking Zhang <[email protected]> Signed-off-by: Alex Deucher <[email protected]>
1 parent 2ec0a7c commit 338f741

File tree

3 files changed

+36
-10
lines changed

3 files changed

+36
-10
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_aca.c

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,7 @@ static void aca_banks_generate_cper(struct amdgpu_device *adev,
391391
{
392392
struct aca_bank_node *node;
393393
struct aca_bank *bank;
394+
int r;
394395

395396
if (!adev->cper.enabled)
396397
return;
@@ -402,11 +403,27 @@ static void aca_banks_generate_cper(struct amdgpu_device *adev,
402403

403404
/* UEs must be encoded into separate CPER entries */
404405
if (type == ACA_SMU_TYPE_UE) {
406+
struct aca_banks de_banks;
407+
408+
aca_banks_init(&de_banks);
405409
list_for_each_entry(node, &banks->list, node) {
406410
bank = &node->bank;
407-
if (amdgpu_cper_generate_ue_record(adev, bank))
408-
dev_warn(adev->dev, "fail to generate ue cper records\n");
411+
if (bank->aca_err_type == ACA_ERROR_TYPE_DEFERRED) {
412+
r = aca_banks_add_bank(&de_banks, bank);
413+
if (r)
414+
dev_warn(adev->dev, "fail to add de banks, ret = %d\n", r);
415+
} else {
416+
if (amdgpu_cper_generate_ue_record(adev, bank))
417+
dev_warn(adev->dev, "fail to generate ue cper records\n");
418+
}
419+
}
420+
421+
if (!list_empty(&de_banks.list)) {
422+
if (amdgpu_cper_generate_ce_records(adev, &de_banks, de_banks.nr_banks))
423+
dev_warn(adev->dev, "fail to generate de cper records\n");
409424
}
425+
426+
aca_banks_release(&de_banks);
410427
} else {
411428
/*
412429
* SMU_TYPE_CE banks are combined into 1 CPER entries,
@@ -541,6 +558,10 @@ static int __aca_get_error_data(struct amdgpu_device *adev, struct aca_handle *h
541558
if (ret)
542559
return ret;
543560

561+
/* DEs may contain in CEs or UEs */
562+
if (type != ACA_ERROR_TYPE_DEFERRED)
563+
aca_log_aca_error(handle, ACA_ERROR_TYPE_DEFERRED, err_data);
564+
544565
return aca_log_aca_error(handle, type, err_data);
545566
}
546567

drivers/gpu/drm/amd/amdgpu/amdgpu_aca.h

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -76,11 +76,17 @@ struct ras_query_context;
7676
#define mmSMNAID_XCD1_MCA_SMU 0x38430400 /* SMN AID XCD1 */
7777
#define mmSMNXCD_XCD0_MCA_SMU 0x40430400 /* SMN XCD XCD0 */
7878

79-
#define ACA_BANK_ERR_CE_DE_DECODE(bank) \
80-
((ACA_REG__STATUS__POISON((bank)->regs[ACA_REG_IDX_STATUS]) || \
81-
ACA_REG__STATUS__DEFERRED((bank)->regs[ACA_REG_IDX_STATUS])) ? \
82-
ACA_ERROR_TYPE_DEFERRED : \
83-
ACA_ERROR_TYPE_CE)
79+
#define ACA_BANK_ERR_IS_DEFFERED(bank) \
80+
(ACA_REG__STATUS__POISON((bank)->regs[ACA_REG_IDX_STATUS]) || \
81+
ACA_REG__STATUS__DEFERRED((bank)->regs[ACA_REG_IDX_STATUS]))
82+
83+
#define ACA_BANK_ERR_CE_DE_DECODE(bank) \
84+
(ACA_BANK_ERR_IS_DEFFERED(bank) ? ACA_ERROR_TYPE_DEFERRED : \
85+
ACA_ERROR_TYPE_CE)
86+
87+
#define ACA_BANK_ERR_UE_DE_DECODE(bank) \
88+
(ACA_BANK_ERR_IS_DEFFERED(bank) ? ACA_ERROR_TYPE_DEFERRED : \
89+
ACA_ERROR_TYPE_UE)
8490

8591
enum aca_reg_idx {
8692
ACA_REG_IDX_CTL = 0,

drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -867,9 +867,8 @@ static int gfx_v9_4_3_aca_bank_parser(struct aca_handle *handle,
867867

868868
switch (type) {
869869
case ACA_SMU_TYPE_UE:
870-
bank->aca_err_type = ACA_ERROR_TYPE_UE;
871-
ret = aca_error_cache_log_bank_error(handle, &info,
872-
ACA_ERROR_TYPE_UE, 1ULL);
870+
bank->aca_err_type = ACA_BANK_ERR_UE_DE_DECODE(bank);
871+
ret = aca_error_cache_log_bank_error(handle, &info, bank->aca_err_type, 1ULL);
873872
break;
874873
case ACA_SMU_TYPE_CE:
875874
bank->aca_err_type = ACA_BANK_ERR_CE_DE_DECODE(bank);

0 commit comments

Comments
 (0)