Skip to content

Commit 27d80f7

Browse files
Yang Wang authored and alexdeucher committed
drm/amdgpu: add pcs xgmi v6.4.0 ras support
add pcs xgmi v6.4.0 ras support

Signed-off-by: Yang Wang <[email protected]>
Reviewed-by: Hawking Zhang <[email protected]>
Signed-off-by: Alex Deucher <[email protected]>
1 parent 4abf0b0 commit 27d80f7

File tree

2 files changed

+161
-3
lines changed

2 files changed

+161
-3
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c

Lines changed: 155 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -113,6 +113,43 @@ static const int xgmi3x16_pcs_err_noncorrectable_mask_reg_v6_4[] = {
113113
smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000
114114
};
115115

116+
/*
 * Base offsets passed as @mca_base to the RREG64_MCA()/WREG64_MCA()
 * accessors. The XGMI v6.4.0 reset and query helpers walk every entry of
 * this table once per XGMI instance.
 */
static const u64 xgmi_v6_4_0_mca_base_array[] = { 0x11a09200, 0x11b09200 };
120+
121+
/*
 * Names of the XGMI v6.4.0 PCS errors, indexed by the extended error code
 * extracted from the MCA status register.
 *
 * The table has 32 slots but only 0x00..0x1c are named; the remaining slots
 * are implicitly NULL, so every lookup must be NULL-checked after the bounds
 * check. Both the strings and the pointer slots are const so the whole table
 * is read-only.
 */
static const char * const xgmi_v6_4_0_ras_error_code_ext[32] = {
	[0x00] = "XGMI PCS DataLossErr",
	[0x01] = "XGMI PCS TrainingErr",
	[0x02] = "XGMI PCS FlowCtrlAckErr",
	[0x03] = "XGMI PCS RxFifoUnderflowErr",
	[0x04] = "XGMI PCS RxFifoOverflowErr",
	[0x05] = "XGMI PCS CRCErr",
	[0x06] = "XGMI PCS BERExceededErr",
	[0x07] = "XGMI PCS TxMetaDataErr",
	[0x08] = "XGMI PCS ReplayBufParityErr",
	[0x09] = "XGMI PCS DataParityErr",
	[0x0a] = "XGMI PCS ReplayFifoOverflowErr",
	[0x0b] = "XGMI PCS ReplayFifoUnderflowErr",
	[0x0c] = "XGMI PCS ElasticFifoOverflowErr",
	[0x0d] = "XGMI PCS DeskewErr",
	[0x0e] = "XGMI PCS FlowCtrlCRCErr",
	[0x0f] = "XGMI PCS DataStartupLimitErr",
	[0x10] = "XGMI PCS FCInitTimeoutErr",
	[0x11] = "XGMI PCS RecoveryTimeoutErr",
	[0x12] = "XGMI PCS ReadySerialTimeoutErr",
	[0x13] = "XGMI PCS ReadySerialAttemptErr",
	[0x14] = "XGMI PCS RecoveryAttemptErr",
	[0x15] = "XGMI PCS RecoveryRelockAttemptErr",
	[0x16] = "XGMI PCS ReplayAttemptErr",
	[0x17] = "XGMI PCS SyncHdrErr",
	[0x18] = "XGMI PCS TxReplayTimeoutErr",
	[0x19] = "XGMI PCS RxReplayTimeoutErr",
	[0x1a] = "XGMI PCS LinkSubTxTimeoutErr",
	[0x1b] = "XGMI PCS LinkSubRxTimeoutErr",
	[0x1c] = "XGMI PCS RxCMDPktErr",
};
152+
116153
static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = {
117154
{"XGMI PCS DataLossErr",
118155
SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)},
@@ -936,7 +973,7 @@ static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg
936973
WREG32_PCIE(pcs_status_reg, 0);
937974
}
938975

939-
static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
976+
static void amdgpu_xgmi_legacy_reset_ras_error_count(struct amdgpu_device *adev)
940977
{
941978
uint32_t i;
942979

@@ -974,6 +1011,39 @@ static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
9741011
}
9751012
}
9761013

1014+
/*
 * Write zero to the MCA status register located at @mca_base for XGMI
 * instance @xgmi_inst.
 *
 * @adev is not named in the body but is picked up implicitly by the
 * WREG64_MCA() macro.
 */
static void __xgmi_v6_4_0_reset_error_count(struct amdgpu_device *adev,
					    int xgmi_inst, u64 mca_base)
{
	const u64 cleared_status = 0ULL;

	WREG64_MCA(xgmi_inst, mca_base, MCA_REG_IDX_STATUS, cleared_status);
}
1018+
1019+
static void xgmi_v6_4_0_reset_error_count(struct amdgpu_device *adev, int xgmi_inst)
1020+
{
1021+
int i;
1022+
1023+
for (i = 0; i < ARRAY_SIZE(xgmi_v6_4_0_mca_base_array); i++)
1024+
__xgmi_v6_4_0_reset_error_count(adev, xgmi_inst, xgmi_v6_4_0_mca_base_array[i]);
1025+
}
1026+
1027+
static void xgmi_v6_4_0_reset_ras_error_count(struct amdgpu_device *adev)
1028+
{
1029+
int i;
1030+
1031+
for_each_inst(i, adev->aid_mask)
1032+
xgmi_v6_4_0_reset_error_count(adev, i);
1033+
}
1034+
1035+
static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
1036+
{
1037+
switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
1038+
case IP_VERSION(6, 4, 0):
1039+
xgmi_v6_4_0_reset_ras_error_count(adev);
1040+
break;
1041+
default:
1042+
amdgpu_xgmi_legacy_reset_ras_error_count(adev);
1043+
break;
1044+
}
1045+
}
1046+
9771047
static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
9781048
uint32_t value,
9791049
uint32_t mask_value,
@@ -1025,8 +1095,8 @@ static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
10251095
return 0;
10261096
}
10271097

1028-
static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
1029-
void *ras_error_status)
1098+
static void amdgpu_xgmi_legacy_query_ras_error_count(struct amdgpu_device *adev,
1099+
void *ras_error_status)
10301100
{
10311101
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
10321102
int i, supported = 1;
@@ -1121,6 +1191,88 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
11211191
err_data->ce_count += ce_cnt;
11221192
}
11231193

1194+
/*
 * Classify an XGMI v6.4.0 PCS MCA status value.
 *
 * The extended error code is extracted from @status; when the code has a
 * name in xgmi_v6_4_0_ras_error_code_ext, that name is logged via dev_info().
 *
 * Returns AMDGPU_MCA_ERROR_TYPE_UE for code 0x00 (DataLossErr),
 * AMDGPU_MCA_ERROR_TYPE_CE for code 0x06 (BERExceededErr), and -EINVAL for
 * every other code.
 *
 * NOTE(review): -EINVAL is not an enumerator of the declared return type.
 * The caller's switch treats it as "neither UE nor CE" through its default
 * branch; consider a dedicated "invalid" enumerator if one exists.
 */
static enum amdgpu_mca_error_type xgmi_v6_4_0_pcs_mca_get_error_type(struct amdgpu_device *adev,
								       u64 status)
{
	const char *error_str;
	int ext_error_code;

	ext_error_code = MCA_REG__STATUS__ERRORCODEEXT(status);

	/* Bounds check first; unnamed slots inside the table are NULL. */
	error_str = ext_error_code < ARRAY_SIZE(xgmi_v6_4_0_ras_error_code_ext) ?
		xgmi_v6_4_0_ras_error_code_ext[ext_error_code] : NULL;
	if (error_str)
		dev_info(adev->dev, "%s detected\n", error_str);

	/* Every branch returns, so nothing may follow this switch. */
	switch (ext_error_code) {
	case 0:
		return AMDGPU_MCA_ERROR_TYPE_UE;
	case 6:
		return AMDGPU_MCA_ERROR_TYPE_CE;
	default:
		return -EINVAL;
	}
}
1217+
1218+
static void __xgmi_v6_4_0_query_error_count(struct amdgpu_device *adev, struct amdgpu_smuio_mcm_config_info *mcm_info,
1219+
u64 mca_base, struct ras_err_data *err_data)
1220+
{
1221+
int xgmi_inst = mcm_info->die_id;
1222+
u64 status = 0;
1223+
1224+
status = RREG64_MCA(xgmi_inst, mca_base, MCA_REG_IDX_STATUS);
1225+
if (!MCA_REG__STATUS__VAL(status))
1226+
return;
1227+
1228+
switch (xgmi_v6_4_0_pcs_mca_get_error_type(adev, status)) {
1229+
case AMDGPU_MCA_ERROR_TYPE_UE:
1230+
amdgpu_ras_error_statistic_ue_count(err_data, mcm_info, 1ULL);
1231+
break;
1232+
case AMDGPU_MCA_ERROR_TYPE_CE:
1233+
amdgpu_ras_error_statistic_ce_count(err_data, mcm_info, 1ULL);
1234+
break;
1235+
default:
1236+
break;
1237+
}
1238+
1239+
WREG64_MCA(xgmi_inst, mca_base, MCA_REG_IDX_STATUS, 0ULL);
1240+
}
1241+
1242+
static void xgmi_v6_4_0_query_error_count(struct amdgpu_device *adev, int xgmi_inst, struct ras_err_data *err_data)
1243+
{
1244+
struct amdgpu_smuio_mcm_config_info mcm_info = {
1245+
.socket_id = adev->smuio.funcs->get_socket_id(adev),
1246+
.die_id = xgmi_inst,
1247+
};
1248+
int i;
1249+
1250+
for (i = 0; i < ARRAY_SIZE(xgmi_v6_4_0_mca_base_array); i++)
1251+
__xgmi_v6_4_0_query_error_count(adev, &mcm_info, xgmi_v6_4_0_mca_base_array[i], err_data);
1252+
}
1253+
1254+
static void xgmi_v6_4_0_query_ras_error_count(struct amdgpu_device *adev, void *ras_error_status)
1255+
{
1256+
struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
1257+
int i;
1258+
1259+
for_each_inst(i, adev->aid_mask)
1260+
xgmi_v6_4_0_query_error_count(adev, i, err_data);
1261+
}
1262+
1263+
static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
1264+
void *ras_error_status)
1265+
{
1266+
switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
1267+
case IP_VERSION(6, 4, 0):
1268+
xgmi_v6_4_0_query_ras_error_count(adev, ras_error_status);
1269+
break;
1270+
default:
1271+
amdgpu_xgmi_legacy_query_ras_error_count(adev, ras_error_status);
1272+
break;
1273+
}
1274+
}
1275+
11241276
/* Trigger XGMI/WAFL error */
11251277
static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
11261278
void *inject_if, uint32_t instance_mask)

drivers/gpu/drm/amd/amdgpu/soc15_common.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -204,4 +204,10 @@
204204
+ adev->asic_funcs->encode_ext_smn_addressing(ext), \
205205
value) \
206206

207+
/*
 * Read the 64-bit MCA register number @idx (registers are 8 bytes apart)
 * relative to @mca_base, for the instance encoded from @ext.
 *
 * Requires a variable named adev in the calling scope. @mca_base and @idx
 * are parenthesized so that expression arguments (e.g. "i + 1") keep their
 * intended precedence.
 */
#define RREG64_MCA(ext, mca_base, idx) \
	RREG64_PCIE_EXT(adev->asic_funcs->encode_ext_smn_addressing(ext) + (mca_base) + ((idx) * 8))
209+
210+
/*
 * Write @val to the 64-bit MCA register number @idx (registers are 8 bytes
 * apart) relative to @mca_base, for the instance encoded from @ext.
 *
 * Requires a variable named adev in the calling scope. @mca_base and @idx
 * are parenthesized so that expression arguments (e.g. "i + 1") keep their
 * intended precedence.
 */
#define WREG64_MCA(ext, mca_base, idx, val) \
	WREG64_PCIE_EXT(adev->asic_funcs->encode_ext_smn_addressing(ext) + (mca_base) + ((idx) * 8), val)
212+
207213
#endif

0 commit comments

Comments
 (0)