Skip to content

Commit c047069

Browse files
YiPeng Chai authored and alexdeucher committed
drm/amdgpu: flush all cached ras bad pages to eeprom
Before uninstalling the GPU driver, flush all cached RAS bad pages to EEPROM. v2: Put the same code into a function and reuse the function. Signed-off-by: YiPeng Chai <[email protected]> Reviewed-by: Tao Zhou <[email protected]> Signed-off-by: Alex Deucher <[email protected]>
1 parent c393857 commit c047069

File tree

1 file changed

+29
-6
lines changed

1 file changed

+29
-6
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
124124

125125
#define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms
126126

127+
/*
 * Initial value of the retry counter for the flush loop in
 * amdgpu_ras_recovery_fini(): the loop keeps flushing and rescheduling the
 * page retirement delayed work until no newly detected pages remain or this
 * counter runs out.
 */
#define MAX_FLUSH_RETIRE_DWORK_TIMES 100
128+
127129
enum amdgpu_ras_retire_page_reservation {
128130
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
129131
AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -2907,6 +2909,23 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
29072909
ecc_log->prev_de_queried_count = 0;
29082910
}
29092911

2912+
/*
 * Schedule the page retirement delayed work if the UMC ECC log still holds
 * entries tagged UMC_ECC_NEW_DETECTED_TAG.
 *
 * @con:        RAS context owning umc_ecc_log and page_retirement_dwork.
 * @delayed_ms: delay, in milliseconds, passed (converted to jiffies) to
 *              schedule_delayed_work().
 *
 * Returns true if at least one tagged entry was found (and a schedule
 * request was therefore issued), false otherwise. The return value reflects
 * only the tag lookup, not the result of schedule_delayed_work().
 */
static bool amdgpu_ras_schedule_retirement_dwork(struct amdgpu_ras *con,
2913+
uint32_t delayed_ms)
2914+
{
2915+
int ret;
2916+
2917+
/* Query de_page_tree for the tag while holding umc_ecc_log.lock. */
mutex_lock(&con->umc_ecc_log.lock);
2918+
ret = radix_tree_tagged(&con->umc_ecc_log.de_page_tree,
2919+
UMC_ECC_NEW_DETECTED_TAG);
2920+
mutex_unlock(&con->umc_ecc_log.lock);
2921+
2922+
/* The work is queued after the lock has been released. */
if (ret)
2923+
schedule_delayed_work(&con->page_retirement_dwork,
2924+
msecs_to_jiffies(delayed_ms));
2925+
2926+
return ret ? true : false;
2927+
}
2928+
29102929
static void amdgpu_ras_do_page_retirement(struct work_struct *work)
29112930
{
29122931
struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
@@ -2928,12 +2947,8 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
29282947
if (err_cnt && con->is_rma)
29292948
amdgpu_ras_reset_gpu(adev);
29302949

2931-
mutex_lock(&con->umc_ecc_log.lock);
2932-
if (radix_tree_tagged(&con->umc_ecc_log.de_page_tree,
2933-
UMC_ECC_NEW_DETECTED_TAG))
2934-
schedule_delayed_work(&con->page_retirement_dwork,
2935-
msecs_to_jiffies(AMDGPU_RAS_RETIRE_PAGE_INTERVAL));
2936-
mutex_unlock(&con->umc_ecc_log.lock);
2950+
amdgpu_ras_schedule_retirement_dwork(con,
2951+
AMDGPU_RAS_RETIRE_PAGE_INTERVAL);
29372952
}
29382953

29392954
static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
@@ -3237,11 +3252,19 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
32373252
{
32383253
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
32393254
struct ras_err_handler_data *data = con->eh_data;
3255+
int max_flush_timeout = MAX_FLUSH_RETIRE_DWORK_TIMES;
3256+
bool ret;
32403257

32413258
/* recovery_init failed to init it, fini is useless */
32423259
if (!data)
32433260
return 0;
32443261

3262+
/* Save all cached bad pages to eeprom */
3263+
do {
3264+
flush_delayed_work(&con->page_retirement_dwork);
3265+
ret = amdgpu_ras_schedule_retirement_dwork(con, 0);
3266+
} while (ret && max_flush_timeout--);
3267+
32453268
if (con->page_retirement_thread)
32463269
kthread_stop(con->page_retirement_thread);
32473270

0 commit comments

Comments
 (0)