Skip to content

Commit 5b1270b

Browse files
Yang Wang, alexdeucher
authored and committed
drm/amdgpu: add ras_err_info to identify RAS error source
Introduce "ras_err_info" to better identify the source of a RAS error. NOTE: For legacy chips, keep the original RAS error print format. v1: RAS errors may come from different dies during a RAS error query; therefore, a new data structure is needed to identify the source of a RAS error. v2: - use the new data structure 'amdgpu_smuio_mcm_config_info' instead of ras_err_id (from the v1 patch) - refine the RAS error dump function name - refine the RAS error dump log format Signed-off-by: Yang Wang <[email protected]> Reviewed-by: Tao Zhou <[email protected]> Reviewed-by: Hawking Zhang <[email protected]> Signed-off-by: Alex Deucher <[email protected]>
1 parent 6a1c31c commit 5b1270b

File tree

6 files changed

+312
-53
lines changed

6 files changed

+312
-53
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

Lines changed: 249 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -152,8 +152,9 @@ static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev)
152152

153153
static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address)
154154
{
155-
struct ras_err_data err_data = {0, 0, 0, NULL};
155+
struct ras_err_data err_data;
156156
struct eeprom_table_record err_rec;
157+
int ret;
157158

158159
if ((address >= adev->gmc.mc_vram_size) ||
159160
(address >= RAS_UMC_INJECT_ADDR_LIMIT)) {
@@ -170,6 +171,10 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
170171
return 0;
171172
}
172173

174+
ret = amdgpu_ras_error_data_init(&err_data);
175+
if (ret)
176+
return ret;
177+
173178
memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
174179
err_data.err_addr = &err_rec;
175180
amdgpu_umc_fill_error_record(&err_data, address, address, 0, 0);
@@ -180,6 +185,8 @@ static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t addre
180185
amdgpu_ras_save_bad_pages(adev, NULL);
181186
}
182187

188+
amdgpu_ras_error_data_fini(&err_data);
189+
183190
dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n");
184191
dev_warn(adev->dev, "Clear EEPROM:\n");
185192
dev_warn(adev->dev, " echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n");
@@ -1015,25 +1022,127 @@ static void amdgpu_ras_get_ecc_info(struct amdgpu_device *adev, struct ras_err_d
10151022
}
10161023
}
10171024

1025+
static void amdgpu_ras_error_print_error_data(struct amdgpu_device *adev,
1026+
struct ras_query_if *query_if,
1027+
struct ras_err_data *err_data,
1028+
bool is_ue)
1029+
{
1030+
struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head);
1031+
const char *blk_name = get_ras_block_str(&query_if->head);
1032+
struct amdgpu_smuio_mcm_config_info *mcm_info;
1033+
struct ras_err_node *err_node;
1034+
struct ras_err_info *err_info;
1035+
1036+
if (is_ue)
1037+
dev_info(adev->dev, "%ld uncorrectable hardware errors detected in %s block\n",
1038+
ras_mgr->err_data.ue_count, blk_name);
1039+
else
1040+
dev_info(adev->dev, "%ld correctable hardware errors detected in %s block\n",
1041+
ras_mgr->err_data.ue_count, blk_name);
1042+
1043+
for_each_ras_error(err_node, err_data) {
1044+
err_info = &err_node->err_info;
1045+
mcm_info = &err_info->mcm_info;
1046+
if (is_ue && err_info->ue_count) {
1047+
dev_info(adev->dev, "socket: %d, die: %d "
1048+
"%lld uncorrectable hardware errors detected in %s block\n",
1049+
mcm_info->socket_id,
1050+
mcm_info->die_id,
1051+
err_info->ue_count,
1052+
blk_name);
1053+
} else if (!is_ue && err_info->ce_count) {
1054+
dev_info(adev->dev, "socket: %d, die: %d "
1055+
"%lld correctable hardware errors detected in %s block\n",
1056+
mcm_info->socket_id,
1057+
mcm_info->die_id,
1058+
err_info->ue_count,
1059+
blk_name);
1060+
}
1061+
}
1062+
}
1063+
1064+
static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
1065+
struct ras_query_if *query_if,
1066+
struct ras_err_data *err_data)
1067+
{
1068+
struct ras_manager *ras_mgr = amdgpu_ras_find_obj(adev, &query_if->head);
1069+
const char *blk_name = get_ras_block_str(&query_if->head);
1070+
1071+
if (err_data->ce_count) {
1072+
if (!list_empty(&err_data->err_node_list)) {
1073+
amdgpu_ras_error_print_error_data(adev, query_if,
1074+
err_data, false);
1075+
} else if (!adev->aid_mask &&
1076+
adev->smuio.funcs &&
1077+
adev->smuio.funcs->get_socket_id &&
1078+
adev->smuio.funcs->get_die_id) {
1079+
dev_info(adev->dev, "socket: %d, die: %d "
1080+
"%ld correctable hardware errors "
1081+
"detected in %s block, no user "
1082+
"action is needed.\n",
1083+
adev->smuio.funcs->get_socket_id(adev),
1084+
adev->smuio.funcs->get_die_id(adev),
1085+
ras_mgr->err_data.ce_count,
1086+
blk_name);
1087+
} else {
1088+
dev_info(adev->dev, "%ld correctable hardware errors "
1089+
"detected in %s block, no user "
1090+
"action is needed.\n",
1091+
ras_mgr->err_data.ce_count,
1092+
blk_name);
1093+
}
1094+
}
1095+
1096+
if (err_data->ue_count) {
1097+
if (!list_empty(&err_data->err_node_list)) {
1098+
amdgpu_ras_error_print_error_data(adev, query_if,
1099+
err_data, true);
1100+
} else if (!adev->aid_mask &&
1101+
adev->smuio.funcs &&
1102+
adev->smuio.funcs->get_socket_id &&
1103+
adev->smuio.funcs->get_die_id) {
1104+
dev_info(adev->dev, "socket: %d, die: %d "
1105+
"%ld uncorrectable hardware errors "
1106+
"detected in %s block\n",
1107+
adev->smuio.funcs->get_socket_id(adev),
1108+
adev->smuio.funcs->get_die_id(adev),
1109+
ras_mgr->err_data.ue_count,
1110+
blk_name);
1111+
} else {
1112+
dev_info(adev->dev, "%ld uncorrectable hardware errors "
1113+
"detected in %s block\n",
1114+
ras_mgr->err_data.ue_count,
1115+
blk_name);
1116+
}
1117+
}
1118+
1119+
}
1120+
10181121
/* query/inject/cure begin */
10191122
int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
10201123
struct ras_query_if *info)
10211124
{
10221125
struct amdgpu_ras_block_object *block_obj = NULL;
10231126
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1024-
struct ras_err_data err_data = {0, 0, 0, NULL};
1127+
struct ras_err_data err_data;
1128+
int ret;
10251129

10261130
if (!obj)
10271131
return -EINVAL;
10281132

1133+
ret = amdgpu_ras_error_data_init(&err_data);
1134+
if (ret)
1135+
return ret;
1136+
10291137
if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
10301138
amdgpu_ras_get_ecc_info(adev, &err_data);
10311139
} else {
10321140
block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
10331141
if (!block_obj || !block_obj->hw_ops) {
10341142
dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
10351143
get_ras_block_str(&info->head));
1036-
return -EINVAL;
1144+
ret = -EINVAL;
1145+
goto out_fini_err_data;
10371146
}
10381147

10391148
if (block_obj->hw_ops->query_ras_error_count)
@@ -1053,48 +1162,12 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
10531162
info->ue_count = obj->err_data.ue_count;
10541163
info->ce_count = obj->err_data.ce_count;
10551164

1056-
if (err_data.ce_count) {
1057-
if (!adev->aid_mask &&
1058-
adev->smuio.funcs &&
1059-
adev->smuio.funcs->get_socket_id &&
1060-
adev->smuio.funcs->get_die_id) {
1061-
dev_info(adev->dev, "socket: %d, die: %d "
1062-
"%ld correctable hardware errors "
1063-
"detected in %s block, no user "
1064-
"action is needed.\n",
1065-
adev->smuio.funcs->get_socket_id(adev),
1066-
adev->smuio.funcs->get_die_id(adev),
1067-
obj->err_data.ce_count,
1068-
get_ras_block_str(&info->head));
1069-
} else {
1070-
dev_info(adev->dev, "%ld correctable hardware errors "
1071-
"detected in %s block, no user "
1072-
"action is needed.\n",
1073-
obj->err_data.ce_count,
1074-
get_ras_block_str(&info->head));
1075-
}
1076-
}
1077-
if (err_data.ue_count) {
1078-
if (!adev->aid_mask &&
1079-
adev->smuio.funcs &&
1080-
adev->smuio.funcs->get_socket_id &&
1081-
adev->smuio.funcs->get_die_id) {
1082-
dev_info(adev->dev, "socket: %d, die: %d "
1083-
"%ld uncorrectable hardware errors "
1084-
"detected in %s block\n",
1085-
adev->smuio.funcs->get_socket_id(adev),
1086-
adev->smuio.funcs->get_die_id(adev),
1087-
obj->err_data.ue_count,
1088-
get_ras_block_str(&info->head));
1089-
} else {
1090-
dev_info(adev->dev, "%ld uncorrectable hardware errors "
1091-
"detected in %s block\n",
1092-
obj->err_data.ue_count,
1093-
get_ras_block_str(&info->head));
1094-
}
1095-
}
1165+
amdgpu_ras_error_generate_report(adev, info, &err_data);
10961166

1097-
return 0;
1167+
out_fini_err_data:
1168+
amdgpu_ras_error_data_fini(&err_data);
1169+
1170+
return ret;
10981171
}
10991172

11001173
int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
@@ -1744,12 +1817,16 @@ static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
17441817
struct amdgpu_iv_entry *entry)
17451818
{
17461819
struct ras_ih_data *data = &obj->ih_data;
1747-
struct ras_err_data err_data = {0, 0, 0, NULL};
1820+
struct ras_err_data err_data;
17481821
int ret;
17491822

17501823
if (!data->cb)
17511824
return;
17521825

1826+
ret = amdgpu_ras_error_data_init(&err_data);
1827+
if (ret)
1828+
return;
1829+
17531830
/* Let IP handle its data, maybe we need get the output
17541831
* from the callback to update the error type/count, etc
17551832
*/
@@ -1766,6 +1843,8 @@ static void amdgpu_ras_interrupt_umc_handler(struct ras_manager *obj,
17661843
obj->err_data.ue_count += err_data.ue_count;
17671844
obj->err_data.ce_count += err_data.ce_count;
17681845
}
1846+
1847+
amdgpu_ras_error_data_fini(&err_data);
17691848
}
17701849

17711850
static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
@@ -3383,3 +3462,128 @@ void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev,
33833462
WREG32(err_status_hi_offset, 0);
33843463
}
33853464
}
3465+
3466+
int amdgpu_ras_error_data_init(struct ras_err_data *err_data)
3467+
{
3468+
memset(err_data, 0, sizeof(*err_data));
3469+
3470+
INIT_LIST_HEAD(&err_data->err_node_list);
3471+
3472+
return 0;
3473+
}
3474+
3475+
static void amdgpu_ras_error_node_release(struct ras_err_node *err_node)
3476+
{
3477+
if (!err_node)
3478+
return;
3479+
3480+
list_del(&err_node->node);
3481+
kvfree(err_node);
3482+
}
3483+
3484+
void amdgpu_ras_error_data_fini(struct ras_err_data *err_data)
3485+
{
3486+
struct ras_err_node *err_node, *tmp;
3487+
3488+
list_for_each_entry_safe(err_node, tmp, &err_data->err_node_list, node) {
3489+
amdgpu_ras_error_node_release(err_node);
3490+
list_del(&err_node->node);
3491+
}
3492+
}
3493+
3494+
static struct ras_err_node *amdgpu_ras_error_find_node_by_id(struct ras_err_data *err_data,
3495+
struct amdgpu_smuio_mcm_config_info *mcm_info)
3496+
{
3497+
struct ras_err_node *err_node;
3498+
struct amdgpu_smuio_mcm_config_info *ref_id;
3499+
3500+
if (!err_data || !mcm_info)
3501+
return NULL;
3502+
3503+
for_each_ras_error(err_node, err_data) {
3504+
ref_id = &err_node->err_info.mcm_info;
3505+
if ((mcm_info->socket_id >= 0 && mcm_info->socket_id != ref_id->socket_id) ||
3506+
(mcm_info->die_id >= 0 && mcm_info->die_id != ref_id->die_id))
3507+
continue;
3508+
3509+
return err_node;
3510+
}
3511+
3512+
return NULL;
3513+
}
3514+
3515+
static struct ras_err_node *amdgpu_ras_error_node_new(void)
3516+
{
3517+
struct ras_err_node *err_node;
3518+
3519+
err_node = kvzalloc(sizeof(*err_node), GFP_KERNEL);
3520+
if (!err_node)
3521+
return NULL;
3522+
3523+
INIT_LIST_HEAD(&err_node->node);
3524+
3525+
return err_node;
3526+
}
3527+
3528+
static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_data,
3529+
struct amdgpu_smuio_mcm_config_info *mcm_info)
3530+
{
3531+
struct ras_err_node *err_node;
3532+
3533+
err_node = amdgpu_ras_error_find_node_by_id(err_data, mcm_info);
3534+
if (err_node)
3535+
return &err_node->err_info;
3536+
3537+
err_node = amdgpu_ras_error_node_new();
3538+
if (!err_node)
3539+
return NULL;
3540+
3541+
memcpy(&err_node->err_info.mcm_info, mcm_info, sizeof(*mcm_info));
3542+
3543+
err_data->err_list_count++;
3544+
list_add_tail(&err_node->node, &err_data->err_node_list);
3545+
3546+
return &err_node->err_info;
3547+
}
3548+
3549+
/*
 * Add @count uncorrectable errors to the record identified by @mcm_info
 * and to the overall ue_count of @err_data.
 *
 * @err_data: error data to update
 * @mcm_info: identifies the error record to account against
 * @count:    number of errors to add; zero is a no-op
 *
 * Returns 0 on success (including @count == 0), -EINVAL when an argument
 * is NULL or the error record cannot be obtained.
 */
int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data,
					struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count)
{
	struct ras_err_info *info;

	if (!err_data || !mcm_info)
		return -EINVAL;

	if (count) {
		info = amdgpu_ras_error_get_info(err_data, mcm_info);
		if (!info)
			return -EINVAL;

		info->ue_count += count;
		err_data->ue_count += count;
	}

	return 0;
}
3569+
3570+
/*
 * Add @count correctable errors to the record identified by @mcm_info
 * and to the overall ce_count of @err_data.
 *
 * @err_data: error data to update
 * @mcm_info: identifies the error record to account against
 * @count:    number of errors to add; zero is a no-op
 *
 * Returns 0 on success (including @count == 0), -EINVAL when an argument
 * is NULL or the error record cannot be obtained.
 */
int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data,
					struct amdgpu_smuio_mcm_config_info *mcm_info, u64 count)
{
	struct ras_err_info *info;

	if (!err_data || !mcm_info)
		return -EINVAL;

	if (count) {
		info = amdgpu_ras_error_get_info(err_data, mcm_info);
		if (!info)
			return -EINVAL;

		info->ce_count += count;
		err_data->ce_count += count;
	}

	return 0;
}

0 commit comments

Comments
 (0)