@@ -122,6 +122,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
122
122
123
123
#define MAX_UMC_POISON_POLLING_TIME_ASYNC 100 //ms
124
124
125
+ #define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms
126
+
125
127
enum amdgpu_ras_retire_page_reservation {
126
128
AMDGPU_RAS_RETIRE_PAGE_RESERVED ,
127
129
AMDGPU_RAS_RETIRE_PAGE_PENDING ,
@@ -2776,6 +2778,30 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
2776
2778
ecc_log -> de_updated = false;
2777
2779
}
2778
2780
2781
/*
 * Delayed-work handler that retires UMC pages recorded as bad.
 *
 * Scheduled via con->page_retirement_dwork (see the INIT_DELAYED_WORK in
 * amdgpu_ras_recovery_init and the kick in amdgpu_ras_poison_creation_handler).
 * Bails out early while a GPU reset or RAS recovery is in flight, since
 * bad-page handling during recovery is owned by that path.
 */
static void amdgpu_ras_do_page_retirement(struct work_struct *work)
{
	struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
					      page_retirement_dwork.work);
	struct amdgpu_device *adev = con->adev;
	struct ras_err_data err_data;

	/* Skip while reset/recovery owns the hardware. */
	if (amdgpu_in_reset(adev) || atomic_read(&con->in_recovery))
		return;

	/* err_data is a scratch structure for this pass; init/fini pair
	 * brackets the actual bad-page processing. */
	amdgpu_ras_error_data_init(&err_data);

	amdgpu_umc_handle_bad_pages(adev, &err_data);

	amdgpu_ras_error_data_fini(&err_data);

	/*
	 * Re-arm ourselves if new deferred-error pages were tagged in the
	 * ECC log while (or after) we processed — the tag check is done
	 * under the log lock so it cannot race with the producer side.
	 * NOTE(review): assumes UMC_ECC_NEW_DETECTED_TAG is set by the
	 * detection path under the same lock — confirm against the
	 * umc_ecc_log producer.
	 */
	mutex_lock(&con->umc_ecc_log.lock);
	if (radix_tree_tagged(&con->umc_ecc_log.de_page_tree,
			      UMC_ECC_NEW_DETECTED_TAG))
		schedule_delayed_work(&con->page_retirement_dwork,
				      msecs_to_jiffies(AMDGPU_RAS_RETIRE_PAGE_INTERVAL));
	mutex_unlock(&con->umc_ecc_log.lock);
}
2804
+
2779
2805
static int amdgpu_ras_query_ecc_status (struct amdgpu_device * adev ,
2780
2806
enum amdgpu_ras_block ras_block , uint32_t timeout_ms )
2781
2807
{
@@ -2814,7 +2840,12 @@ static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev,
2814
2840
static void amdgpu_ras_poison_creation_handler (struct amdgpu_device * adev ,
2815
2841
uint32_t timeout )
2816
2842
{
2817
- amdgpu_ras_query_ecc_status (adev , AMDGPU_RAS_BLOCK__UMC , timeout );
2843
+ struct amdgpu_ras * con = amdgpu_ras_get_context (adev );
2844
+ int ret ;
2845
+
2846
+ ret = amdgpu_ras_query_ecc_status (adev , AMDGPU_RAS_BLOCK__UMC , timeout );
2847
+ if (!ret )
2848
+ schedule_delayed_work (& con -> page_retirement_dwork , 0 );
2818
2849
}
2819
2850
2820
2851
static int amdgpu_ras_page_retirement_thread (void * param )
@@ -2929,6 +2960,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
2929
2960
dev_warn (adev -> dev , "Failed to create umc_page_retirement thread!!!\n" );
2930
2961
}
2931
2962
2963
+ INIT_DELAYED_WORK (& con -> page_retirement_dwork , amdgpu_ras_do_page_retirement );
2932
2964
amdgpu_ras_ecc_log_init (& con -> umc_ecc_log );
2933
2965
#ifdef CONFIG_X86_MCE_AMD
2934
2966
if ((adev -> asic_type == CHIP_ALDEBARAN ) &&
@@ -2974,6 +3006,8 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
2974
3006
2975
3007
cancel_work_sync (& con -> recovery_work );
2976
3008
3009
+ cancel_delayed_work_sync (& con -> page_retirement_dwork );
3010
+
2977
3011
amdgpu_ras_ecc_log_fini (& con -> umc_ecc_log );
2978
3012
2979
3013
mutex_lock (& con -> recovery_lock );
0 commit comments