@@ -113,6 +113,43 @@ static const int xgmi3x16_pcs_err_noncorrectable_mask_reg_v6_4[] = {
 	smnPCS_XGMI3X16_PCS_ERROR_NONCORRECTABLE_MASK + 0x100000
 };
 
+static const u64 xgmi_v6_4_0_mca_base_array[] = {
+	0x11a09200,
+	0x11b09200,
+};
+
+static const char *xgmi_v6_4_0_ras_error_code_ext[32] = {
+	[0x00] = "XGMI PCS DataLossErr",
+	[0x01] = "XGMI PCS TrainingErr",
+	[0x02] = "XGMI PCS FlowCtrlAckErr",
+	[0x03] = "XGMI PCS RxFifoUnderflowErr",
+	[0x04] = "XGMI PCS RxFifoOverflowErr",
+	[0x05] = "XGMI PCS CRCErr",
+	[0x06] = "XGMI PCS BERExceededErr",
+	[0x07] = "XGMI PCS TxMetaDataErr",
+	[0x08] = "XGMI PCS ReplayBufParityErr",
+	[0x09] = "XGMI PCS DataParityErr",
+	[0x0a] = "XGMI PCS ReplayFifoOverflowErr",
+	[0x0b] = "XGMI PCS ReplayFifoUnderflowErr",
+	[0x0c] = "XGMI PCS ElasticFifoOverflowErr",
+	[0x0d] = "XGMI PCS DeskewErr",
+	[0x0e] = "XGMI PCS FlowCtrlCRCErr",
+	[0x0f] = "XGMI PCS DataStartupLimitErr",
+	[0x10] = "XGMI PCS FCInitTimeoutErr",
+	[0x11] = "XGMI PCS RecoveryTimeoutErr",
+	[0x12] = "XGMI PCS ReadySerialTimeoutErr",
+	[0x13] = "XGMI PCS ReadySerialAttemptErr",
+	[0x14] = "XGMI PCS RecoveryAttemptErr",
+	[0x15] = "XGMI PCS RecoveryRelockAttemptErr",
+	[0x16] = "XGMI PCS ReplayAttemptErr",
+	[0x17] = "XGMI PCS SyncHdrErr",
+	[0x18] = "XGMI PCS TxReplayTimeoutErr",
+	[0x19] = "XGMI PCS RxReplayTimeoutErr",
+	[0x1a] = "XGMI PCS LinkSubTxTimeoutErr",
+	[0x1b] = "XGMI PCS LinkSubRxTimeoutErr",
+	[0x1c] = "XGMI PCS RxCMDPktErr",
+};
+
 static const struct amdgpu_pcs_ras_field xgmi_pcs_ras_fields[] = {
 	{"XGMI PCS DataLossErr",
 	 SOC15_REG_FIELD(XGMI0_PCS_GOPX16_PCS_ERROR_STATUS, DataLossErr)},
@@ -936,7 +973,7 @@ static void pcs_clear_status(struct amdgpu_device *adev, uint32_t pcs_status_reg
 	WREG32_PCIE(pcs_status_reg, 0);
 }
 
-static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
+static void amdgpu_xgmi_legacy_reset_ras_error_count(struct amdgpu_device *adev)
 {
 	uint32_t i;
 
@@ -974,6 +1011,39 @@ static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
 	}
 }
 
+static void __xgmi_v6_4_0_reset_error_count(struct amdgpu_device *adev, int xgmi_inst, u64 mca_base)
+{
+	WREG64_MCA(xgmi_inst, mca_base, MCA_REG_IDX_STATUS, 0ULL);
+}
+
+static void xgmi_v6_4_0_reset_error_count(struct amdgpu_device *adev, int xgmi_inst)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(xgmi_v6_4_0_mca_base_array); i++)
+		__xgmi_v6_4_0_reset_error_count(adev, xgmi_inst, xgmi_v6_4_0_mca_base_array[i]);
+}
+
+static void xgmi_v6_4_0_reset_ras_error_count(struct amdgpu_device *adev)
+{
+	int i;
+
+	for_each_inst(i, adev->aid_mask)
+		xgmi_v6_4_0_reset_error_count(adev, i);
+}
+
+static void amdgpu_xgmi_reset_ras_error_count(struct amdgpu_device *adev)
+{
+	switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
+	case IP_VERSION(6, 4, 0):
+		xgmi_v6_4_0_reset_ras_error_count(adev);
+		break;
+	default:
+		amdgpu_xgmi_legacy_reset_ras_error_count(adev);
+		break;
+	}
+}
+
 static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
 					      uint32_t value,
 					      uint32_t mask_value,
@@ -1025,8 +1095,8 @@ static int amdgpu_xgmi_query_pcs_error_status(struct amdgpu_device *adev,
 	return 0;
 }
 
-static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
-					      void *ras_error_status)
+static void amdgpu_xgmi_legacy_query_ras_error_count(struct amdgpu_device *adev,
+						     void *ras_error_status)
 {
 	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
 	int i, supported = 1;
@@ -1121,6 +1191,88 @@ static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
 	err_data->ce_count += ce_cnt;
 }
 
+static enum amdgpu_mca_error_type xgmi_v6_4_0_pcs_mca_get_error_type(struct amdgpu_device *adev, u64 status)
+{
+	const char *error_str;
+	int ext_error_code;
+
+	ext_error_code = MCA_REG__STATUS__ERRORCODEEXT(status);
+
+	error_str = ext_error_code < ARRAY_SIZE(xgmi_v6_4_0_ras_error_code_ext) ?
+		xgmi_v6_4_0_ras_error_code_ext[ext_error_code] : NULL;
+	if (error_str)
+		dev_info(adev->dev, "%s detected\n", error_str);
+
+	switch (ext_error_code) {
+	case 0:
+		return AMDGPU_MCA_ERROR_TYPE_UE;
+	case 6:
+		return AMDGPU_MCA_ERROR_TYPE_CE;
+	default:
+		return -EINVAL;
+	}
+
+	return -EINVAL;
+}
+
+static void __xgmi_v6_4_0_query_error_count(struct amdgpu_device *adev, struct amdgpu_smuio_mcm_config_info *mcm_info,
+					    u64 mca_base, struct ras_err_data *err_data)
+{
+	int xgmi_inst = mcm_info->die_id;
+	u64 status = 0;
+
+	status = RREG64_MCA(xgmi_inst, mca_base, MCA_REG_IDX_STATUS);
+	if (!MCA_REG__STATUS__VAL(status))
+		return;
+
+	switch (xgmi_v6_4_0_pcs_mca_get_error_type(adev, status)) {
+	case AMDGPU_MCA_ERROR_TYPE_UE:
+		amdgpu_ras_error_statistic_ue_count(err_data, mcm_info, 1ULL);
+		break;
+	case AMDGPU_MCA_ERROR_TYPE_CE:
+		amdgpu_ras_error_statistic_ce_count(err_data, mcm_info, 1ULL);
+		break;
+	default:
+		break;
+	}
+
+	WREG64_MCA(xgmi_inst, mca_base, MCA_REG_IDX_STATUS, 0ULL);
+}
+
+static void xgmi_v6_4_0_query_error_count(struct amdgpu_device *adev, int xgmi_inst, struct ras_err_data *err_data)
+{
+	struct amdgpu_smuio_mcm_config_info mcm_info = {
+		.socket_id = adev->smuio.funcs->get_socket_id(adev),
+		.die_id = xgmi_inst,
+	};
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(xgmi_v6_4_0_mca_base_array); i++)
+		__xgmi_v6_4_0_query_error_count(adev, &mcm_info, xgmi_v6_4_0_mca_base_array[i], err_data);
+}
+
+static void xgmi_v6_4_0_query_ras_error_count(struct amdgpu_device *adev, void *ras_error_status)
+{
+	struct ras_err_data *err_data = (struct ras_err_data *)ras_error_status;
+	int i;
+
+	for_each_inst(i, adev->aid_mask)
+		xgmi_v6_4_0_query_error_count(adev, i, err_data);
+}
+
+static void amdgpu_xgmi_query_ras_error_count(struct amdgpu_device *adev,
+					      void *ras_error_status)
+{
+	switch (amdgpu_ip_version(adev, XGMI_HWIP, 0)) {
+	case IP_VERSION(6, 4, 0):
+		xgmi_v6_4_0_query_ras_error_count(adev, ras_error_status);
+		break;
+	default:
+		amdgpu_xgmi_legacy_query_ras_error_count(adev, ras_error_status);
+		break;
+	}
+}
+
 /* Trigger XGMI/WAFL error */
 static int amdgpu_ras_error_inject_xgmi(struct amdgpu_device *adev,
 					void *inject_if, uint32_t instance_mask)