Skip to content

Commit 314c38c

Browse files
YiPeng Chai authored and alexdeucher committed
drm/amdgpu: retire bad pages for umc v12_0
Retire bad pages for umc v12_0. Signed-off-by: YiPeng Chai <[email protected]> Reviewed-by: Tao Zhou <[email protected]> Signed-off-by: Alex Deucher <[email protected]>
1 parent e74313b commit 314c38c

File tree

1 file changed

+57
-2
lines changed

1 file changed

+57
-2
lines changed

drivers/gpu/drm/amd/amdgpu/umc_v12_0.c

Lines changed: 57 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,8 @@
2828
#include "umc/umc_12_0_0_sh_mask.h"
2929
#include "mp/mp_13_0_6_sh_mask.h"
3030

31+
/*
 * Capacity of the on-stack entry array in umc_v12_0_query_ras_ecc_err_addr();
 * it is also the maximum number of tagged entries requested from the radix
 * tree in one lookup.
 */
#define MAX_ECC_NUM_PER_RETIREMENT 32
32+
3133
static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
3234
uint32_t node_inst,
3335
uint32_t umc_inst,
@@ -374,6 +376,7 @@ static int umc_v12_0_err_cnt_init_per_channel(struct amdgpu_device *adev,
374376
return 0;
375377
}
376378

379+
#ifdef TO_BE_REMOVED
377380
static void umc_v12_0_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
378381
void *ras_error_status)
379382
{
@@ -442,6 +445,7 @@ static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *ade
442445
}
443446
}
444447
}
448+
#endif
445449

446450
static bool umc_v12_0_check_ecc_err_status(struct amdgpu_device *adev,
447451
enum amdgpu_mca_error_type type, void *ras_error_status)
@@ -633,15 +637,66 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
633637
return 0;
634638
}
635639

640+
static int umc_v12_0_fill_error_record(struct amdgpu_device *adev,
641+
struct ras_ecc_err *ecc_err, void *ras_error_status)
642+
{
643+
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
644+
uint32_t i = 0;
645+
int ret = 0;
646+
647+
if (!err_data || !ecc_err)
648+
return -EINVAL;
649+
650+
for (i = 0; i < ecc_err->err_pages.count; i++) {
651+
ret = amdgpu_umc_fill_error_record(err_data,
652+
ecc_err->addr,
653+
ecc_err->err_pages.pfn[i] << AMDGPU_GPU_PAGE_SHIFT,
654+
MCA_IPID_2_UMC_CH(ecc_err->ipid),
655+
MCA_IPID_2_UMC_INST(ecc_err->ipid));
656+
if (ret)
657+
break;
658+
}
659+
660+
err_data->de_count++;
661+
662+
return ret;
663+
}
664+
665+
static void umc_v12_0_query_ras_ecc_err_addr(struct amdgpu_device *adev,
666+
void *ras_error_status)
667+
{
668+
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
669+
struct ras_ecc_err *entries[MAX_ECC_NUM_PER_RETIREMENT];
670+
struct radix_tree_root *ecc_tree;
671+
int new_detected, ret, i;
672+
673+
ecc_tree = &con->umc_ecc_log.de_page_tree;
674+
675+
mutex_lock(&con->umc_ecc_log.lock);
676+
new_detected = radix_tree_gang_lookup_tag(ecc_tree, (void **)entries,
677+
0, ARRAY_SIZE(entries), UMC_ECC_NEW_DETECTED_TAG);
678+
for (i = 0; i < new_detected; i++) {
679+
if (!entries[i])
680+
continue;
681+
682+
ret = umc_v12_0_fill_error_record(adev, entries[i], ras_error_status);
683+
if (ret) {
684+
dev_err(adev->dev, "Fail to fill umc error record, ret:%d\n", ret);
685+
break;
686+
}
687+
radix_tree_tag_clear(ecc_tree, entries[i]->hash_index, UMC_ECC_NEW_DETECTED_TAG);
688+
}
689+
mutex_unlock(&con->umc_ecc_log.lock);
690+
}
691+
636692
/*
 * UMC v12.0 RAS descriptor: binds the RAS block hooks (hw_ops, ras_late_init)
 * and the UMC error-handling callbacks.
 *
 * This hunk is shown in diff form: the two rows marked '-' are the
 * initializers the commit removes, and the row marked '+' is the replacement
 * that points .ecc_info_query_ras_error_address at
 * umc_v12_0_query_ras_ecc_err_addr.
 */
struct amdgpu_umc_ras umc_v12_0_ras = {
637693
.ras_block = {
638694
.hw_ops = &umc_v12_0_ras_hw_ops,
639695
.ras_late_init = umc_v12_0_ras_late_init,
640696
},
641697
.err_cnt_init = umc_v12_0_err_cnt_init,
642698
.query_ras_poison_mode = umc_v12_0_query_ras_poison_mode,
643-
.ecc_info_query_ras_error_count = umc_v12_0_ecc_info_query_ras_error_count,
644-
.ecc_info_query_ras_error_address = umc_v12_0_ecc_info_query_ras_error_address,
699+
.ecc_info_query_ras_error_address = umc_v12_0_query_ras_ecc_err_addr,
645700
.check_ecc_err_status = umc_v12_0_check_ecc_err_status,
646701
.update_ecc_status = umc_v12_0_update_ecc_status,
647702
};

0 commit comments

Comments
 (0)