Skip to content

Commit f27defc

Browse files
YiPeng Chai authored and alexdeucher committed
drm/amdgpu: umc v12_0 logs ecc errors
1. umc v12_0 logs ecc errors. 2. Reserve newly detected ecc error pages. 3. Add tag for bad pages, so that they can be retired later. Signed-off-by: YiPeng Chai <[email protected]> Reviewed-by: Tao Zhou <[email protected]> Signed-off-by: Alex Deucher <[email protected]>
1 parent b2aa6b1 commit f27defc

File tree

3 files changed

+113
-2
lines changed

3 files changed

+113
-2
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,13 @@
2121
*
2222
*/
2323

24+
#include <linux/sort.h>
2425
#include "amdgpu.h"
2526
#include "umc_v6_7.h"
2627
#define MAX_UMC_POISON_POLLING_TIME_SYNC 20 //ms
2728

29+
#define MAX_UMC_HASH_STRING_SIZE 256
30+
2831
static int amdgpu_umc_convert_error_address(struct amdgpu_device *adev,
2932
struct ras_err_data *err_data, uint64_t err_addr,
3033
uint32_t ch_inst, uint32_t umc_inst)
@@ -446,3 +449,67 @@ int amdgpu_umc_update_ecc_status(struct amdgpu_device *adev,
446449
status, ipid, addr);
447450
return 0;
448451
}
452+
453+
/*
 * Three-way comparison of two uint64_t values, usable as a sort() callback.
 *
 * Returns 1 when *a > *b, -1 when *a < *b and 0 when they are equal.
 */
static int amdgpu_umc_uint64_cmp(const void *a, const void *b)
{
	const uint64_t lhs = *(const uint64_t *)a;
	const uint64_t rhs = *(const uint64_t *)b;

	/* Each comparison yields 0 or 1, so the difference is -1, 0 or 1. */
	return (lhs > rhs) - (lhs < rhs);
}
465+
466+
/* Use string hash to avoid logging the same bad pages repeatedly */
467+
int amdgpu_umc_build_pages_hash(struct amdgpu_device *adev,
468+
uint64_t *pfns, int len, uint64_t *val)
469+
{
470+
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
471+
char buf[MAX_UMC_HASH_STRING_SIZE] = {0};
472+
int offset = 0, i = 0;
473+
uint64_t hash_val;
474+
475+
if (!pfns || !len)
476+
return -EINVAL;
477+
478+
sort(pfns, len, sizeof(uint64_t), amdgpu_umc_uint64_cmp, NULL);
479+
480+
for (i = 0; i < len; i++)
481+
offset += snprintf(&buf[offset], sizeof(buf) - offset, "%llx", pfns[i]);
482+
483+
hash_val = siphash(buf, offset, &con->umc_ecc_log.ecc_key);
484+
485+
*val = hash_val;
486+
487+
return 0;
488+
}
489+
490+
int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev,
491+
struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err)
492+
{
493+
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
494+
struct ras_ecc_log_info *ecc_log;
495+
int ret;
496+
497+
ecc_log = &con->umc_ecc_log;
498+
499+
mutex_lock(&ecc_log->lock);
500+
ret = radix_tree_insert(ecc_tree, ecc_err->hash_index, ecc_err);
501+
if (!ret) {
502+
struct ras_err_pages *err_pages = &ecc_err->err_pages;
503+
int i;
504+
505+
/* Reserve memory */
506+
for (i = 0; i < err_pages->count; i++)
507+
amdgpu_ras_reserve_page(adev, err_pages->pfn[i]);
508+
509+
radix_tree_tag_set(ecc_tree,
510+
ecc_err->hash_index, UMC_ECC_NEW_DETECTED_TAG);
511+
}
512+
mutex_unlock(&ecc_log->lock);
513+
514+
return ret;
515+
}

drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,8 @@
5252
#define LOOP_UMC_EACH_NODE_INST_AND_CH(node_inst, umc_inst, ch_inst) \
5353
LOOP_UMC_NODE_INST((node_inst)) LOOP_UMC_INST_AND_CH((umc_inst), (ch_inst))
5454

55+
/* Page retirement tag */
56+
#define UMC_ECC_NEW_DETECTED_TAG 0x1
5557

5658
typedef int (*umc_func)(struct amdgpu_device *adev, uint32_t node_inst,
5759
uint32_t umc_inst, uint32_t ch_inst, void *data);
@@ -127,5 +129,8 @@ int amdgpu_umc_bad_page_polling_timeout(struct amdgpu_device *adev,
127129

128130
int amdgpu_umc_update_ecc_status(struct amdgpu_device *adev,
129131
uint64_t status, uint64_t ipid, uint64_t addr);
130-
132+
int amdgpu_umc_build_pages_hash(struct amdgpu_device *adev,
133+
uint64_t *pfns, int len, uint64_t *val);
134+
int amdgpu_umc_logs_ecc_err(struct amdgpu_device *adev,
135+
struct radix_tree_root *ecc_tree, struct ras_ecc_err *ecc_err);
131136
#endif

drivers/gpu/drm/amd/amdgpu/umc_v12_0.c

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -546,8 +546,10 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
546546
uint16_t hwid, mcatype;
547547
struct ta_ras_query_address_input addr_in;
548548
uint64_t page_pfn[UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL];
549-
uint64_t err_addr;
549+
uint64_t err_addr, hash_val = 0;
550+
struct ras_ecc_err *ecc_err;
550551
int count;
552+
int ret;
551553

552554
hwid = REG_GET_FIELD(ipid, MCMP1_IPIDT0, HardwareID);
553555
mcatype = REG_GET_FIELD(ipid, MCMP1_IPIDT0, McaType);
@@ -589,6 +591,43 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
589591
return 0;
590592
}
591593

594+
ret = amdgpu_umc_build_pages_hash(adev,
595+
page_pfn, count, &hash_val);
596+
if (ret) {
597+
dev_err(adev->dev, "Fail to build error pages hash\n");
598+
return ret;
599+
}
600+
601+
ecc_err = kzalloc(sizeof(*ecc_err), GFP_KERNEL);
602+
if (!ecc_err)
603+
return -ENOMEM;
604+
605+
ecc_err->err_pages.pfn = kcalloc(count, sizeof(*ecc_err->err_pages.pfn), GFP_KERNEL);
606+
if (!ecc_err->err_pages.pfn) {
607+
kfree(ecc_err);
608+
return -ENOMEM;
609+
}
610+
611+
memcpy(ecc_err->err_pages.pfn, page_pfn, count * sizeof(*ecc_err->err_pages.pfn));
612+
ecc_err->err_pages.count = count;
613+
614+
ecc_err->hash_index = hash_val;
615+
ecc_err->status = status;
616+
ecc_err->ipid = ipid;
617+
ecc_err->addr = addr;
618+
619+
ret = amdgpu_umc_logs_ecc_err(adev, &con->umc_ecc_log.de_page_tree, ecc_err);
620+
if (ret) {
621+
if (ret == -EEXIST)
622+
con->umc_ecc_log.de_updated = true;
623+
else
624+
dev_err(adev->dev, "Fail to log ecc error! ret:%d\n", ret);
625+
626+
kfree(ecc_err->err_pages.pfn);
627+
kfree(ecc_err);
628+
return ret;
629+
}
630+
592631
con->umc_ecc_log.de_updated = true;
593632

594633
return 0;

0 commit comments

Comments
 (0)