|
32 | 32 | #include "wafl/wafl2_4_0_0_smn.h"
|
33 | 33 | #include "wafl/wafl2_4_0_0_sh_mask.h"
|
34 | 34 |
|
| 35 | +#define smnPCS_XGMI23_PCS_ERROR_STATUS 0x11a01210 |
| 36 | +#define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c |
| 37 | +#define smnPCS_GOPX1_PCS_ERROR_STATUS 0x12200210 |
| 38 | + |
35 | 39 | static DEFINE_MUTEX(xgmi_mutex);
|
36 | 40 |
|
37 | 41 | #define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE 4
|
@@ -63,6 +67,33 @@ static const int wafl_pcs_err_status_reg_arct[] = {
|
63 | 67 | smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
|
64 | 68 | };
|
65 | 69 |
|
| 70 | +static const int xgmi23_pcs_err_status_reg_aldebaran[] = { /* smnPCS_XGMI23_PCS_ERROR_STATUS addresses: base plus 0..7 steps of 0x100000 */ |
| 71 | + smnPCS_XGMI23_PCS_ERROR_STATUS, |
| 72 | + smnPCS_XGMI23_PCS_ERROR_STATUS + 0x100000, |
| 73 | + smnPCS_XGMI23_PCS_ERROR_STATUS + 0x200000, |
| 74 | + smnPCS_XGMI23_PCS_ERROR_STATUS + 0x300000, |
| 75 | + smnPCS_XGMI23_PCS_ERROR_STATUS + 0x400000, |
| 76 | + smnPCS_XGMI23_PCS_ERROR_STATUS + 0x500000, |
| 77 | + smnPCS_XGMI23_PCS_ERROR_STATUS + 0x600000, |
| 78 | + smnPCS_XGMI23_PCS_ERROR_STATUS + 0x700000 |
| 79 | +}; |
| 80 | + |
| 81 | +static const int xgmi3x16_pcs_err_status_reg_aldebaran[] = { /* smnPCS_XGMI3X16_PCS_ERROR_STATUS addresses: base plus 0..7 steps of 0x100000 */ |
| 82 | + smnPCS_XGMI3X16_PCS_ERROR_STATUS, |
| 83 | + smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x100000, |
| 84 | + smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x200000, |
| 85 | + smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x300000, |
| 86 | + smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x400000, |
| 87 | + smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x500000, |
| 88 | + smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x600000, |
| 89 | + smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x700000 |
| 90 | +}; |
| 91 | + |
| 92 | +static const int walf_pcs_err_status_reg_aldebaran[] = { /* smnPCS_GOPX1_PCS_ERROR_STATUS addresses: base and base + 0x100000. NOTE(review): "walf" looks like a typo for "wafl"; a rename must update every user of this array */ |
| 93 | + smnPCS_GOPX1_PCS_ERROR_STATUS, |
| 94 | + smnPCS_GOPX1_PCS_ERROR_STATUS + 0x100000 |
| 95 | +}; |
| 96 | + |
66 | 97 | static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = {
|
67 | 98 | {"XGMI PCS DataLossErr",
|
68 | 99 | SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)},
|
@@ -771,6 +802,17 @@ static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
|
771 | 802 | pcs_clear_status(adev,
|
772 | 803 | xgmi_pcs_err_status_reg_vg20[i]);
|
773 | 804 | break;
|
| 805 | + case CHIP_ALDEBARAN: |
| 806 | + for (i = 0; i < ARRAY_SIZE(xgmi23_pcs_err_status_reg_aldebaran); i++) |
| 807 | + pcs_clear_status(adev, |
| 808 | + xgmi23_pcs_err_status_reg_aldebaran[i]); |
| 809 | + for (i = 0; i < ARRAY_SIZE(xgmi23_pcs_err_status_reg_aldebaran); i++) |
| 810 | + pcs_clear_status(adev, |
| 811 | + xgmi23_pcs_err_status_reg_aldebaran[i]); |
| 812 | + for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++) |
| 813 | + pcs_clear_status(adev, |
| 814 | + walf_pcs_err_status_reg_aldebaran[i]); |
| 815 | + break; |
774 | 816 | default:
|
775 | 817 | break;
|
776 | 818 | }
|
@@ -863,6 +905,29 @@ static int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
|
863 | 905 | data, &ue_cnt, &ce_cnt, false);
|
864 | 906 | }
|
865 | 907 | break;
|
| 908 | + case CHIP_ALDEBARAN: |
| 909 | + /* check xgmi23 pcs error */ |
| 910 | + for (i = 0; i < ARRAY_SIZE(xgmi23_pcs_err_status_reg_aldebaran); i++) { |
| 911 | + data = RREG32_PCIE(xgmi23_pcs_err_status_reg_aldebaran[i]); |
| 912 | + if (data) |
| 913 | + amdgpu_xgmi_query_pcs_error_status(adev, |
| 914 | + data, &ue_cnt, &ce_cnt, true); |
| 915 | + } |
| 916 | + /* check xgmi3x16 pcs error */ |
| 917 | + for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++) { |
| 918 | + data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_aldebaran[i]); |
| 919 | + if (data) |
| 920 | + amdgpu_xgmi_query_pcs_error_status(adev, |
| 921 | + data, &ue_cnt, &ce_cnt, true); |
| 922 | + } |
| 923 | + /* check wafl pcs error */ |
| 924 | + for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++) { |
| 925 | + data = RREG32_PCIE(walf_pcs_err_status_reg_aldebaran[i]); |
| 926 | + if (data) |
| 927 | + amdgpu_xgmi_query_pcs_error_status(adev, |
| 928 | + data, &ue_cnt, &ce_cnt, false); |
| 929 | + } |
| 930 | + break; |
866 | 931 | default:
|
867 | 932 | dev_warn(adev->dev, "XGMI RAS error query not supported");
|
868 | 933 | break;
|
|
0 commit comments