Skip to content

Commit 2cf8e50

Browse files
YiPeng Chai authored and alexdeucher committed
drm/amdgpu: Add delay work to retire bad pages
Add delay work to retire bad pages.

Signed-off-by: YiPeng Chai <[email protected]>
Reviewed-by: Tao Zhou <[email protected]>
Signed-off-by: Alex Deucher <[email protected]>
1 parent f27defc commit 2cf8e50

File tree

4 files changed

+40
-2
lines changed

4 files changed

+40
-2
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,8 @@ const char *get_ras_block_str(struct ras_common_if *ras_block)
122122

123123
#define MAX_UMC_POISON_POLLING_TIME_ASYNC 100 //ms
124124

125+
#define AMDGPU_RAS_RETIRE_PAGE_INTERVAL 100 //ms
126+
125127
enum amdgpu_ras_retire_page_reservation {
126128
AMDGPU_RAS_RETIRE_PAGE_RESERVED,
127129
AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -2776,6 +2778,30 @@ static void amdgpu_ras_ecc_log_fini(struct ras_ecc_log_info *ecc_log)
27762778
ecc_log->de_updated = false;
27772779
}
27782780

2781+
/*
 * amdgpu_ras_do_page_retirement - delayed-work handler that retires bad pages.
 * @work: the work_struct embedded in amdgpu_ras.page_retirement_dwork; the
 *        owning struct amdgpu_ras is recovered from it with container_of().
 *
 * Returns immediately when amdgpu_in_reset() reports true or when
 * con->in_recovery is non-zero.  Otherwise it initialises a local
 * ras_err_data, passes it to amdgpu_umc_handle_bad_pages(), and finalises
 * it again.  Afterwards, while holding umc_ecc_log.lock, it checks whether
 * de_page_tree still carries UMC_ECC_NEW_DETECTED_TAG and, if so, queues
 * itself again after AMDGPU_RAS_RETIRE_PAGE_INTERVAL milliseconds.
 *
 * NOTE(review): the early-return path does not re-arm the work, so entries
 * that are still tagged at that point wait until something else schedules
 * page_retirement_dwork (e.g. amdgpu_ras_poison_creation_handler()) --
 * confirm this is intended.
 */
static void amdgpu_ras_do_page_retirement(struct work_struct *work)
2782+
{
2783+
struct amdgpu_ras *con = container_of(work, struct amdgpu_ras,
2784+
page_retirement_dwork.work);
2785+
struct amdgpu_device *adev = con->adev;
2786+
struct ras_err_data err_data;
2787+
2788+
if (amdgpu_in_reset(adev) || atomic_read(&con->in_recovery))
2789+
return;
2790+
2791+
amdgpu_ras_error_data_init(&err_data);
2792+
2793+
amdgpu_umc_handle_bad_pages(adev, &err_data);
2794+
2795+
amdgpu_ras_error_data_fini(&err_data);
2796+
2797+
mutex_lock(&con->umc_ecc_log.lock);
2798+
if (radix_tree_tagged(&con->umc_ecc_log.de_page_tree,
2799+
UMC_ECC_NEW_DETECTED_TAG))
2800+
schedule_delayed_work(&con->page_retirement_dwork,
2801+
msecs_to_jiffies(AMDGPU_RAS_RETIRE_PAGE_INTERVAL));
2802+
mutex_unlock(&con->umc_ecc_log.lock);
2803+
}
2804+
27792805
static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev,
27802806
enum amdgpu_ras_block ras_block, uint32_t timeout_ms)
27812807
{
@@ -2814,7 +2840,12 @@ static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev,
28142840
/*
 * amdgpu_ras_poison_creation_handler - query UMC ECC status and kick off
 * bad-page retirement.
 * @adev:    device whose RAS context is looked up via amdgpu_ras_get_context()
 * @timeout: forwarded unchanged to amdgpu_ras_query_ecc_status()
 *
 * Shown here as a diff: the statement following the "2817-" marker is the
 * pre-commit body (status query with its result ignored); the "+" lines are
 * its replacement.  In the new version the result of the status query is
 * kept, and when it is 0 the page_retirement_dwork is scheduled with a
 * delay of 0.  A non-zero result schedules nothing.
 */
static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
28152841
uint32_t timeout)
28162842
{
2817-
amdgpu_ras_query_ecc_status(adev, AMDGPU_RAS_BLOCK__UMC, timeout);
2843+
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
2844+
int ret;
2845+
2846+
ret = amdgpu_ras_query_ecc_status(adev, AMDGPU_RAS_BLOCK__UMC, timeout);
2847+
if (!ret)
2848+
schedule_delayed_work(&con->page_retirement_dwork, 0);
28182849
}
28192850

28202851
static int amdgpu_ras_page_retirement_thread(void *param)
@@ -2929,6 +2960,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
29292960
dev_warn(adev->dev, "Failed to create umc_page_retirement thread!!!\n");
29302961
}
29312962

2963+
INIT_DELAYED_WORK(&con->page_retirement_dwork, amdgpu_ras_do_page_retirement);
29322964
amdgpu_ras_ecc_log_init(&con->umc_ecc_log);
29332965
#ifdef CONFIG_X86_MCE_AMD
29342966
if ((adev->asic_type == CHIP_ALDEBARAN) &&
@@ -2974,6 +3006,8 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
29743006

29753007
cancel_work_sync(&con->recovery_work);
29763008

3009+
cancel_delayed_work_sync(&con->page_retirement_dwork);
3010+
29773011
amdgpu_ras_ecc_log_fini(&con->umc_ecc_log);
29783012

29793013
mutex_lock(&con->recovery_lock);

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -537,6 +537,7 @@ struct amdgpu_ras {
537537
struct mutex page_rsv_lock;
538538
DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128);
539539
struct ras_ecc_log_info umc_ecc_log;
540+
struct delayed_work page_retirement_dwork;
540541

541542
/* Fatal error detected flag */
542543
atomic_t fed;

drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -89,7 +89,7 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
8989
return ret;
9090
}
9191

92-
static void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
92+
void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
9393
void *ras_error_status)
9494
{
9595
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;

drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,4 +133,7 @@ int amdgpu_umc_build_pages_hash(struct amdgpu_device *adev,
133133
uint64_t *pfns, int len, uint64_t *val);
134134
int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev,
135135
struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err);
136+
137+
void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
138+
void *ras_error_status);
136139
#endif

0 commit comments

Comments
 (0)