Skip to content

Commit cfce8f4

Browse files
ganglxie and alexdeucher
authored and committed
drm/amdgpu: refine ras error injection when eeprom initialization failed
When EEPROM initialization fails, we still support RAS error injection and reserve bad pages, but do not save bad pages to the EEPROM.
Signed-off-by: ganglxie <[email protected]>
Reviewed-by: Tao Zhou <[email protected]>
Signed-off-by: Alex Deucher <[email protected]>
1 parent 0b7f135 commit cfce8f4

File tree

2 files changed

+18
-6
lines changed

2 files changed

+18
-6
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3006,6 +3006,15 @@ int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev,
30063006
return 0;
30073007
}
30083008

3009+
if (!con->eeprom_control.is_eeprom_valid) {
3010+
dev_warn(adev->dev,
3011+
"Failed to save EEPROM table data because of EEPROM data corruption!");
3012+
if (new_cnt)
3013+
*new_cnt = 0;
3014+
3015+
return 0;
3016+
}
3017+
30093018
mutex_lock(&con->recovery_lock);
30103019
control = &con->eeprom_control;
30113020
data = con->eh_data;
@@ -3491,8 +3500,7 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
34913500

34923501
control = &con->eeprom_control;
34933502
ret = amdgpu_ras_eeprom_init(control);
3494-
if (ret)
3495-
return ret;
3503+
control->is_eeprom_valid = !ret;
34963504

34973505
if (!adev->umc.ras || !adev->umc.ras->convert_ras_err_addr)
34983506
control->ras_num_pa_recs = control->ras_num_recs;
@@ -3501,10 +3509,12 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
35013509
adev->umc.ras->get_retire_flip_bits)
35023510
adev->umc.ras->get_retire_flip_bits(adev);
35033511

3504-
if (control->ras_num_recs) {
3512+
if (control->ras_num_recs && control->is_eeprom_valid) {
35053513
ret = amdgpu_ras_load_bad_pages(adev);
3506-
if (ret)
3507-
return ret;
3514+
if (ret) {
3515+
control->is_eeprom_valid = false;
3516+
return 0;
3517+
}
35083518

35093519
amdgpu_dpm_send_hbm_bad_pages_num(
35103520
adev, control->ras_num_bad_pages);
@@ -3523,7 +3533,7 @@ int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
35233533
dev_warn(adev->dev, "Failed to format RAS EEPROM data in V3 version!\n");
35243534
}
35253535

3526-
return ret;
3536+
return 0;
35273537
}
35283538

35293539
int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info)

drivers/gpu/drm/amd/amdgpu/amdgpu_ras_eeprom.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,8 @@ struct amdgpu_ras_eeprom_control {
114114
/* Record channel info which occurred bad pages
115115
*/
116116
u32 bad_channel_bitmap;
117+
118+
bool is_eeprom_valid;
117119
};
118120

119121
/*

0 commit comments

Comments
 (0)