Skip to content

Commit 3c4ff2d

Browse files
John Clements authored and alexdeucher committed
drm/amdgpu: Add support for RAS XGMI err query
Update XGMI RAS to support error query on Aldebaran.

Reviewed-by: Hawking Zhang <[email protected]>
Signed-off-by: John Clements <[email protected]>
Signed-off-by: Alex Deucher <[email protected]>
1 parent 1ec06c2 commit 3c4ff2d

File tree

1 file changed

+65
-0
lines changed

1 file changed

+65
-0
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,10 @@
3232
#include "wafl/wafl2_4_0_0_smn.h"
3333
#include "wafl/wafl2_4_0_0_sh_mask.h"
3434

35+
#define smnPCS_XGMI23_PCS_ERROR_STATUS 0x11a01210
36+
#define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c
37+
#define smnPCS_GOPX1_PCS_ERROR_STATUS 0x12200210
38+
3539
static DEFINE_MUTEX(xgmi_mutex);
3640

3741
#define AMDGPU_MAX_XGMI_DEVICE_PER_HIVE 4
@@ -63,6 +67,33 @@ static const int wafl_pcs_err_status_reg_arct[] = {
6367
smnPCS_GOPX1_0_PCS_GOPX1_PCS_ERROR_STATUS + 0x100000,
6468
};
6569

70+
/*
 * XGMI23 PCS error status register addresses: eight entries, starting at
 * smnPCS_XGMI23_PCS_ERROR_STATUS and stepping by 0x100000 per entry.
 * Iterated by the CHIP_ALDEBARAN cases of the RAS error reset and query code.
 */
static const int xgmi23_pcs_err_status_reg_aldebaran[] = {
71+
smnPCS_XGMI23_PCS_ERROR_STATUS,
72+
smnPCS_XGMI23_PCS_ERROR_STATUS + 0x100000,
73+
smnPCS_XGMI23_PCS_ERROR_STATUS + 0x200000,
74+
smnPCS_XGMI23_PCS_ERROR_STATUS + 0x300000,
75+
smnPCS_XGMI23_PCS_ERROR_STATUS + 0x400000,
76+
smnPCS_XGMI23_PCS_ERROR_STATUS + 0x500000,
77+
smnPCS_XGMI23_PCS_ERROR_STATUS + 0x600000,
78+
smnPCS_XGMI23_PCS_ERROR_STATUS + 0x700000
79+
};
80+
81+
/*
 * XGMI3X16 PCS error status register addresses: eight entries, starting at
 * smnPCS_XGMI3X16_PCS_ERROR_STATUS and stepping by 0x100000 per entry.
 * Read by the CHIP_ALDEBARAN case of the RAS error query code.
 */
static const int xgmi3x16_pcs_err_status_reg_aldebaran[] = {
82+
smnPCS_XGMI3X16_PCS_ERROR_STATUS,
83+
smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x100000,
84+
smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x200000,
85+
smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x300000,
86+
smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x400000,
87+
smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x500000,
88+
smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x600000,
89+
smnPCS_XGMI3X16_PCS_ERROR_STATUS + 0x700000
90+
};
91+
92+
/*
 * GOPX1 PCS error status register addresses: two entries, at
 * smnPCS_GOPX1_PCS_ERROR_STATUS and that base plus 0x100000.
 * Iterated by the CHIP_ALDEBARAN cases of the RAS error reset and query code.
 * NOTE(review): "walf" looks like a typo for "wafl" (compare
 * wafl_pcs_err_status_reg_arct); confirm before renaming, since every user
 * of this array must change together.
 */
static const int walf_pcs_err_status_reg_aldebaran[] = {
93+
smnPCS_GOPX1_PCS_ERROR_STATUS,
94+
smnPCS_GOPX1_PCS_ERROR_STATUS + 0x100000
95+
};
96+
6697
static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = {
6798
{"XGMI PCS DataLossErr",
6899
SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)},
@@ -771,6 +802,17 @@ static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
771802
pcs_clear_status(adev,
772803
xgmi_pcs_err_status_reg_vg20[i]);
773804
break;
805+
case CHIP_ALDEBARAN:
806+
for (i = 0; i < ARRAY_SIZE(xgmi23_pcs_err_status_reg_aldebaran); i++)
807+
pcs_clear_status(adev,
808+
xgmi23_pcs_err_status_reg_aldebaran[i]);
809+
/* Clear the xgmi3x16 PCS error status registers as well: the error query reads them, so leaving them set would re-report stale errors. */
for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++)
810+
pcs_clear_status(adev,
811+
xgmi3x16_pcs_err_status_reg_aldebaran[i]);
812+
for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++)
813+
pcs_clear_status(adev,
814+
walf_pcs_err_status_reg_aldebaran[i]);
815+
break;
774816
default:
775817
break;
776818
}
@@ -863,6 +905,29 @@ static int amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
863905
data, &ue_cnt, &ce_cnt, false);
864906
}
865907
break;
908+
case CHIP_ALDEBARAN:
909+
/* check xgmi23 pcs error */
910+
for (i = 0; i < ARRAY_SIZE(xgmi23_pcs_err_status_reg_aldebaran); i++) {
911+
data = RREG32_PCIE(xgmi23_pcs_err_status_reg_aldebaran[i]);
912+
if (data)
913+
amdgpu_xgmi_query_pcs_error_status(adev,
914+
data, &ue_cnt, &ce_cnt, true);
915+
}
916+
/* check xgmi3x16 pcs error */
917+
for (i = 0; i < ARRAY_SIZE(xgmi3x16_pcs_err_status_reg_aldebaran); i++) {
918+
data = RREG32_PCIE(xgmi3x16_pcs_err_status_reg_aldebaran[i]);
919+
if (data)
920+
amdgpu_xgmi_query_pcs_error_status(adev,
921+
data, &ue_cnt, &ce_cnt, true);
922+
}
923+
/* check wafl pcs error */
924+
for (i = 0; i < ARRAY_SIZE(walf_pcs_err_status_reg_aldebaran); i++) {
925+
data = RREG32_PCIE(walf_pcs_err_status_reg_aldebaran[i]);
926+
if (data)
927+
amdgpu_xgmi_query_pcs_error_status(adev,
928+
data, &ue_cnt, &ce_cnt, false);
929+
}
930+
break;
866931
default:
867932
dev_warn(adev->dev, "XGMI RAS error query not supported");
868933
break;

0 commit comments

Comments
 (0)