|
28 | 28 | #include "umc/umc_12_0_0_sh_mask.h"
|
29 | 29 | #include "mp/mp_13_0_6_sh_mask.h"
|
30 | 30 |
|
| 31 | +#define MAX_ECC_NUM_PER_RETIREMENT 32 |
| 32 | + |
31 | 33 | static inline uint64_t get_umc_v12_0_reg_offset(struct amdgpu_device *adev,
|
32 | 34 | uint32_t node_inst,
|
33 | 35 | uint32_t umc_inst,
|
@@ -374,6 +376,7 @@ static int umc_v12_0_err_cnt_init_per_channel(struct amdgpu_device *adev,
|
374 | 376 | return 0;
|
375 | 377 | }
|
376 | 378 |
|
| 379 | +#ifdef TO_BE_REMOVED |
377 | 380 | static void umc_v12_0_ecc_info_query_ras_error_count(struct amdgpu_device *adev,
|
378 | 381 | void *ras_error_status)
|
379 | 382 | {
|
@@ -442,6 +445,7 @@ static void umc_v12_0_ecc_info_query_ras_error_address(struct amdgpu_device *ade
|
442 | 445 | }
|
443 | 446 | }
|
444 | 447 | }
|
| 448 | +#endif |
445 | 449 |
|
446 | 450 | static bool umc_v12_0_check_ecc_err_status(struct amdgpu_device *adev,
|
447 | 451 | enum amdgpu_mca_error_type type, void *ras_error_status)
|
@@ -633,15 +637,66 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
|
633 | 637 | return 0;
|
634 | 638 | }
|
635 | 639 |
|
| 640 | +static int umc_v12_0_fill_error_record(struct amdgpu_device *adev, |
| 641 | + struct ras_ecc_err *ecc_err, void *ras_error_status) |
| 642 | +{ |
| 643 | + struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status; |
| 644 | + uint32_t i = 0; |
| 645 | + int ret = 0; |
| 646 | + |
| 647 | + if (!err_data || !ecc_err) |
| 648 | + return -EINVAL; |
| 649 | + |
| 650 | + for (i = 0; i < ecc_err->err_pages.count; i++) { |
| 651 | + ret = amdgpu_umc_fill_error_record(err_data, |
| 652 | + ecc_err->addr, |
| 653 | + ecc_err->err_pages.pfn[i] << AMDGPU_GPU_PAGE_SHIFT, |
| 654 | + MCA_IPID_2_UMC_CH(ecc_err->ipid), |
| 655 | + MCA_IPID_2_UMC_INST(ecc_err->ipid)); |
| 656 | + if (ret) |
| 657 | + break; |
| 658 | + } |
| 659 | + |
| 660 | + err_data->de_count++; |
| 661 | + |
| 662 | + return ret; |
| 663 | +} |
| 664 | + |
/*
 * Drain newly detected ECC errors from the deferred-error page tree into
 * the caller-supplied ras_error_status (a struct ras_err_data).
 *
 * Looks up at most MAX_ECC_NUM_PER_RETIREMENT entries tagged
 * UMC_ECC_NEW_DETECTED_TAG, fills an error record for each, and clears
 * the tag only after the record was filled successfully — on failure the
 * loop stops so the remaining tagged entries are retried on a later call.
 * The whole lookup/fill/clear sequence runs under umc_ecc_log.lock to
 * keep it coherent with concurrent insertions into de_page_tree.
 */
static void umc_v12_0_query_ras_ecc_err_addr(struct amdgpu_device *adev,
					void *ras_error_status)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
	struct ras_ecc_err *entries[MAX_ECC_NUM_PER_RETIREMENT];
	struct radix_tree_root *ecc_tree;
	int new_detected, ret, i;

	ecc_tree = &con->umc_ecc_log.de_page_tree;

	mutex_lock(&con->umc_ecc_log.lock);
	new_detected = radix_tree_gang_lookup_tag(ecc_tree, (void **)entries,
			0, ARRAY_SIZE(entries), UMC_ECC_NEW_DETECTED_TAG);
	for (i = 0; i < new_detected; i++) {
		if (!entries[i])
			continue;

		ret = umc_v12_0_fill_error_record(adev, entries[i], ras_error_status);
		if (ret) {
			dev_err(adev->dev, "Fail to fill umc error record, ret:%d\n", ret);
			break;
		}
		/* Record landed; this entry is no longer "new". */
		radix_tree_tag_clear(ecc_tree, entries[i]->hash_index, UMC_ECC_NEW_DETECTED_TAG);
	}
	mutex_unlock(&con->umc_ecc_log.lock);
}
| 691 | + |
/* UMC v12.0 RAS operations table registered with the amdgpu RAS core. */
struct amdgpu_umc_ras umc_v12_0_ras = {
	.ras_block = {
		.hw_ops = &umc_v12_0_ras_hw_ops,
		.ras_late_init = umc_v12_0_ras_late_init,
	},
	.err_cnt_init = umc_v12_0_err_cnt_init,
	.query_ras_poison_mode = umc_v12_0_query_ras_poison_mode,
	/* Address query now drains the deferred-error page tree instead of
	 * re-reading ECC info registers. */
	.ecc_info_query_ras_error_address = umc_v12_0_query_ras_ecc_err_addr,
	.check_ecc_err_status = umc_v12_0_check_ecc_err_status,
	.update_ecc_status = umc_v12_0_update_ecc_status,
};
|
|
0 commit comments