@@ -124,6 +124,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
 
 #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100  //ms
 
+#define MAX_FLUSH_RETIRE_DWORK_TIMES  100
+
 enum amdgpu_ras_retire_page_reservation {
 	AMDGPU_RAS_RETIRE_PAGE_RESERVED,
 	AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -2907,6 +2909,23 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
 	ecc_log->prev_de_queried_count = 0;
 }
 
+static bool amdgpu_ras_schedule_retirement_dwork(struct amdgpu_ras *con,
+				uint32_t delayed_ms)
+{
+	int ret;
+
+	mutex_lock(&con->umc_ecc_log.lock);
+	ret = radix_tree_tagged(&con->umc_ecc_log.de_page_tree,
+			UMC_ECC_NEW_DETECTED_TAG);
+	mutex_unlock(&con->umc_ecc_log.lock);
+
+	if (ret)
+		schedule_delayed_work(&con->page_retirement_dwork,
+			msecs_to_jiffies(delayed_ms));
+
+	return ret ? true : false;
+}
+
 static void amdgpu_ras_do_page_retirement(struct work_struct *work)
 {
 	struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
@@ -2928,12 +2947,8 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
 	if (err_cnt && con->is_rma)
 		amdgpu_ras_reset_gpu(adev);
 
-	mutex_lock(&con->umc_ecc_log.lock);
-	if (radix_tree_tagged(&con->umc_ecc_log.de_page_tree,
-			UMC_ECC_NEW_DETECTED_TAG))
-		schedule_delayed_work(&con->page_retirement_dwork,
-			msecs_to_jiffies(AMDGPU_RAS_RETIRE_PAGE_INTERVAL));
-	mutex_unlock(&con->umc_ecc_log.lock);
+	amdgpu_ras_schedule_retirement_dwork(con,
+			AMDGPU_RAS_RETIRE_PAGE_INTERVAL);
 }
 
 static int amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
@@ -3237,11 +3252,19 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data *data = con->eh_data;
+	int max_flush_timeout = MAX_FLUSH_RETIRE_DWORK_TIMES;
+	bool ret;
 
 	/* recovery_init failed to init it, fini is useless */
 	if (!data)
 		return 0;
 
+	/* Save all cached bad pages to eeprom */
+	do {
+		flush_delayed_work(&con->page_retirement_dwork);
+		ret = amdgpu_ras_schedule_retirement_dwork(con, 0);
+	} while (ret && max_flush_timeout--);
+
 	if (con->page_retirement_thread)
 		kthread_stop(con->page_retirement_thread);
 
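
For context, here is a minimal userspace C sketch of the bounded drain pattern the fini hunk introduces: keep flushing the retirement work and re-checking for newly tagged entries, and give up after a fixed number of passes so teardown cannot spin forever. The pending_pages counter, flush_once() and reschedule_if_pending() are illustrative stand-ins for the delayed work and the tagged radix tree, not kernel APIs.

#include <stdbool.h>
#include <stdio.h>

#define MAX_FLUSH_RETRIES 100	/* mirrors MAX_FLUSH_RETIRE_DWORK_TIMES */

/* Illustrative stand-in for the ECC log: pages still waiting to be retired. */
static int pending_pages = 250;

/* Stand-in for flush_delayed_work(): retire up to one batch of pages. */
static void flush_once(void)
{
	int batch = pending_pages < 16 ? pending_pages : 16;

	pending_pages -= batch;
	printf("flushed %d pages, %d remaining\n", batch, pending_pages);
}

/* Stand-in for amdgpu_ras_schedule_retirement_dwork(): report whether
 * more work is still queued (and would therefore be rescheduled).
 */
static bool reschedule_if_pending(void)
{
	return pending_pages > 0;
}

int main(void)
{
	int budget = MAX_FLUSH_RETRIES;
	bool more;

	/* Bounded drain loop, same shape as in amdgpu_ras_recovery_fini(). */
	do {
		flush_once();
		more = reschedule_if_pending();
	} while (more && budget--);

	printf("drain %s, %d pages left\n",
	       more ? "gave up" : "completed", pending_pages);
	return 0;
}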