Skip to content

Commit 8cc0f56

Browse files
Hawking Zhang authored and alexdeucher committed
drm/amdgpu: Support multiple error query modes
Direct error query mode and firmware error query mode are supported for now. Signed-off-by: Hawking Zhang <[email protected]> Reviewed-by: Yang Wang <[email protected]> Signed-off-by: Alex Deucher <[email protected]>
1 parent 07c1db7 commit 8cc0f56

File tree

2 files changed

+78
-23
lines changed

2 files changed

+78
-23
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

Lines changed: 70 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1165,13 +1165,53 @@ static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, s
11651165
}
11661166
}
11671167

1168-
/* query/inject/cure begin */
1169-
int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
1170-
struct ras_query_if *info)
1168+
static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
1169+
struct ras_query_if *info,
1170+
struct ras_err_data *err_data,
1171+
unsigned int error_query_mode)
11711172
{
1173+
enum amdgpu_ras_block blk = info ? info->head.block : AMDGPU_RAS_BLOCK_COUNT;
11721174
struct amdgpu_ras_block_object *block_obj = NULL;
1175+
1176+
if (error_query_mode == AMDGPU_RAS_INVALID_ERROR_QUERY)
1177+
return -EINVAL;
1178+
1179+
if (error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) {
1180+
if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
1181+
amdgpu_ras_get_ecc_info(adev, err_data);
1182+
} else {
1183+
block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
1184+
if (!block_obj || !block_obj->hw_ops) {
1185+
dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
1186+
get_ras_block_str(&info->head));
1187+
return -EINVAL;
1188+
}
1189+
1190+
if (block_obj->hw_ops->query_ras_error_count)
1191+
block_obj->hw_ops->query_ras_error_count(adev, &err_data);
1192+
1193+
if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) ||
1194+
(info->head.block == AMDGPU_RAS_BLOCK__GFX) ||
1195+
(info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) {
1196+
if (block_obj->hw_ops->query_ras_error_status)
1197+
block_obj->hw_ops->query_ras_error_status(adev);
1198+
}
1199+
}
1200+
} else {
1201+
/* FIXME: add code to check return value later */
1202+
amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_UE, err_data);
1203+
amdgpu_mca_smu_log_ras_error(adev, blk, AMDGPU_MCA_ERROR_TYPE_CE, err_data);
1204+
}
1205+
1206+
return 0;
1207+
}
1208+
1209+
/* query/inject/cure begin */
1210+
int amdgpu_ras_query_error_status(struct amdgpu_device *adev, struct ras_query_if *info)
1211+
{
11731212
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
11741213
struct ras_err_data err_data;
1214+
unsigned int error_query_mode;
11751215
int ret;
11761216

11771217
if (!obj)
@@ -1181,27 +1221,14 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
11811221
if (ret)
11821222
return ret;
11831223

1184-
if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
1185-
amdgpu_ras_get_ecc_info(adev, &err_data);
1186-
} else {
1187-
block_obj = amdgpu_ras_get_ras_block(adev, info->head.block, 0);
1188-
if (!block_obj || !block_obj->hw_ops) {
1189-
dev_dbg_once(adev->dev, "%s doesn't config RAS function\n",
1190-
get_ras_block_str(&info->head));
1191-
ret = -EINVAL;
1192-
goto out_fini_err_data;
1193-
}
1194-
1195-
if (block_obj->hw_ops->query_ras_error_count)
1196-
block_obj->hw_ops->query_ras_error_count(adev, &err_data);
1224+
if (!amdgpu_ras_get_error_query_mode(adev, &error_query_mode))
1225+
return -EINVAL;
11971226

1198-
if ((info->head.block == AMDGPU_RAS_BLOCK__SDMA) ||
1199-
(info->head.block == AMDGPU_RAS_BLOCK__GFX) ||
1200-
(info->head.block == AMDGPU_RAS_BLOCK__MMHUB)) {
1201-
if (block_obj->hw_ops->query_ras_error_status)
1202-
block_obj->hw_ops->query_ras_error_status(adev);
1203-
}
1204-
}
1227+
ret = amdgpu_ras_query_error_status_helper(adev, info,
1228+
&err_data,
1229+
error_query_mode);
1230+
if (ret)
1231+
goto out_fini_err_data;
12051232

12061233
amdgpu_rasmgr_error_data_statistic_update(obj, &err_data);
12071234

@@ -3397,6 +3424,26 @@ bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev)
33973424
return true;
33983425
}
33993426

3427+
bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
3428+
unsigned int *error_query_mode)
3429+
{
3430+
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
3431+
const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs;
3432+
3433+
if (!con) {
3434+
*error_query_mode = AMDGPU_RAS_INVALID_ERROR_QUERY;
3435+
return false;
3436+
}
3437+
3438+
if (mca_funcs && mca_funcs->mca_set_debug_mode)
3439+
*error_query_mode =
3440+
(con->is_mca_debug_mode) ? AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY;
3441+
else
3442+
*error_query_mode = AMDGPU_RAS_DIRECT_ERROR_QUERY;
3443+
3444+
return true;
3445+
}
3446+
34003447
/* Register each ip ras block into amdgpu ras */
34013448
int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
34023449
struct amdgpu_ras_block_object *ras_block_obj)

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,12 @@ enum amdgpu_ras_ret {
320320
AMDGPU_RAS_PT,
321321
};
322322

323+
/* How RAS error status is gathered for a block. */
enum amdgpu_ras_error_query_mode {
	/* No usable mode; queries made with it are rejected. */
	AMDGPU_RAS_INVALID_ERROR_QUERY		= 0,
	/* Driver queries error data itself via the block's RAS callbacks. */
	AMDGPU_RAS_DIRECT_ERROR_QUERY		= 1,
	/* Error data is obtained through the MCA SMU logging path. */
	AMDGPU_RAS_FIRMWARE_ERROR_QUERY		= 2,
};
328+
323329
/* ras error status register fields */
324330
#define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG__SHIFT 0x0
325331
#define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG_MASK 0x00000001L
@@ -769,6 +775,8 @@ int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_co
769775

770776
void amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable);
771777
bool amdgpu_ras_get_mca_debug_mode(struct amdgpu_device *adev);
778+
bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
779+
unsigned int *mode);
772780

773781
int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,
774782
struct amdgpu_ras_block_object *ras_block_obj);

0 commit comments

Comments
 (0)