Skip to content

Commit 84a2947

Browse files
vskvortsov and alexdeucher
authored and committed
drm/amdgpu: Implement virt req_ras_err_count
Enable RAS late init if VF RAS Telemetry is supported. When enabled, the VF can use this interface to query total RAS error counts from the host. The VF FB access may abruptly end due to a fatal error, therefore the VF must cache and sanitize the input. The Host allows 15 Telemetry messages every 60 seconds, afterwhich the host will ignore any more in-coming telemetry messages. The VF will rate limit its msg calling to once every 5 seconds (12 times in 60 seconds). While the VF is rate limited, it will continue to report the last good cached data. v2: Flip generate report & update statistics order for VF Signed-off-by: Victor Skvortsov <[email protected]> Acked-by: Tao Zhou <[email protected]> Reviewed-by: Zhigang Luo <[email protected]> Signed-off-by: Alex Deucher <[email protected]>
1 parent 907fec2 commit 84a2947

File tree

7 files changed

+229
-7
lines changed

7 files changed

+229
-7
lines changed

drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4236,7 +4236,10 @@ int amdgpu_device_init(struct amdgpu_device *adev,
42364236
* for throttling interrupt) = 60 seconds.
42374237
*/
42384238
ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
4239+
ratelimit_state_init(&adev->virt.ras_telemetry_rs, 5 * HZ, 1);
4240+
42394241
ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
4242+
ratelimit_set_flags(&adev->virt.ras_telemetry_rs, RATELIMIT_MSG_ON_RELEASE);
42404243

42414244
/* Registers mapping */
42424245
/* TODO: block userspace mapping of io register */
@@ -5186,6 +5189,9 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
51865189
amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4) ||
51875190
amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(11, 0, 3))
51885191
amdgpu_ras_resume(adev);
5192+
5193+
amdgpu_virt_ras_telemetry_post_reset(adev);
5194+
51895195
return 0;
51905196
}
51915197

drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -904,6 +904,9 @@ int amdgpu_gfx_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *r
904904
if (r)
905905
return r;
906906

907+
if (amdgpu_sriov_vf(adev))
908+
return r;
909+
907910
if (adev->gfx.cp_ecc_error_irq.funcs) {
908911
r = amdgpu_irq_get(adev, &adev->gfx.cp_ecc_error_irq, 0);
909912
if (r)

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

Lines changed: 65 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1214,6 +1214,42 @@ static void amdgpu_ras_error_generate_report(struct amdgpu_device *adev,
12141214
}
12151215
}
12161216

1217+
static void amdgpu_ras_virt_error_generate_report(struct amdgpu_device *adev,
1218+
struct ras_query_if *query_if,
1219+
struct ras_err_data *err_data,
1220+
struct ras_query_context *qctx)
1221+
{
1222+
unsigned long new_ue, new_ce, new_de;
1223+
struct ras_manager *obj = amdgpu_ras_find_obj(adev, &query_if->head);
1224+
const char *blk_name = get_ras_block_str(&query_if->head);
1225+
u64 event_id = qctx->evid.event_id;
1226+
1227+
new_ce = err_data->ce_count - obj->err_data.ce_count;
1228+
new_ue = err_data->ue_count - obj->err_data.ue_count;
1229+
new_de = err_data->de_count - obj->err_data.de_count;
1230+
1231+
if (new_ce) {
1232+
RAS_EVENT_LOG(adev, event_id, "%lu correctable hardware errors "
1233+
"detected in %s block\n",
1234+
new_ce,
1235+
blk_name);
1236+
}
1237+
1238+
if (new_ue) {
1239+
RAS_EVENT_LOG(adev, event_id, "%lu uncorrectable hardware errors "
1240+
"detected in %s block\n",
1241+
new_ue,
1242+
blk_name);
1243+
}
1244+
1245+
if (new_de) {
1246+
RAS_EVENT_LOG(adev, event_id, "%lu deferred hardware errors "
1247+
"detected in %s block\n",
1248+
new_de,
1249+
blk_name);
1250+
}
1251+
}
1252+
12171253
static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, struct ras_err_data *err_data)
12181254
{
12191255
struct ras_err_node *err_node;
@@ -1237,6 +1273,15 @@ static void amdgpu_rasmgr_error_data_statistic_update(struct ras_manager *obj, s
12371273
}
12381274
}
12391275

1276+
static void amdgpu_ras_mgr_virt_error_data_statistics_update(struct ras_manager *obj,
1277+
struct ras_err_data *err_data)
1278+
{
1279+
/* Host reports absolute counts */
1280+
obj->err_data.ue_count = err_data->ue_count;
1281+
obj->err_data.ce_count = err_data->ce_count;
1282+
obj->err_data.de_count = err_data->de_count;
1283+
}
1284+
12401285
static struct ras_manager *get_ras_manager(struct amdgpu_device *adev, enum amdgpu_ras_block blk)
12411286
{
12421287
struct ras_common_if head;
@@ -1323,7 +1368,9 @@ static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
13231368
if (error_query_mode == AMDGPU_RAS_INVALID_ERROR_QUERY)
13241369
return -EINVAL;
13251370

1326-
if (error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) {
1371+
if (error_query_mode == AMDGPU_RAS_VIRT_ERROR_COUNT_QUERY) {
1372+
return amdgpu_virt_req_ras_err_count(adev, blk, err_data);
1373+
} else if (error_query_mode == AMDGPU_RAS_DIRECT_ERROR_QUERY) {
13271374
if (info->head.block == AMDGPU_RAS_BLOCK__UMC) {
13281375
amdgpu_ras_get_ecc_info(adev, err_data);
13291376
} else {
@@ -1405,14 +1452,22 @@ static int amdgpu_ras_query_error_status_with_event(struct amdgpu_device *adev,
14051452
if (ret)
14061453
goto out_fini_err_data;
14071454

1408-
amdgpu_rasmgr_error_data_statistic_update(obj, &err_data);
1455+
if (error_query_mode != AMDGPU_RAS_VIRT_ERROR_COUNT_QUERY) {
1456+
amdgpu_rasmgr_error_data_statistic_update(obj, &err_data);
1457+
amdgpu_ras_error_generate_report(adev, info, &err_data, &qctx);
1458+
} else {
1459+
/* Host provides absolute error counts. First generate the report
1460+
* using the previous VF internal count against new host count.
1461+
* Then Update VF internal count.
1462+
*/
1463+
amdgpu_ras_virt_error_generate_report(adev, info, &err_data, &qctx);
1464+
amdgpu_ras_mgr_virt_error_data_statistics_update(obj, &err_data);
1465+
}
14091466

14101467
info->ue_count = obj->err_data.ue_count;
14111468
info->ce_count = obj->err_data.ce_count;
14121469
info->de_count = obj->err_data.de_count;
14131470

1414-
amdgpu_ras_error_generate_report(adev, info, &err_data, &qctx);
1415-
14161471
out_fini_err_data:
14171472
amdgpu_ras_error_data_fini(&err_data);
14181473

@@ -3930,7 +3985,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
39303985
}
39313986

39323987
/* Guest side doesn't need init ras feature */
3933-
if (amdgpu_sriov_vf(adev))
3988+
if (amdgpu_sriov_vf(adev) && !amdgpu_sriov_ras_telemetry_en(adev))
39343989
return 0;
39353990

39363991
list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
@@ -4397,11 +4452,14 @@ bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev,
43974452
return false;
43984453
}
43994454

4400-
if ((smu_funcs && smu_funcs->set_debug_mode) || (mca_funcs && mca_funcs->mca_set_debug_mode))
4455+
if (amdgpu_sriov_vf(adev)) {
4456+
*error_query_mode = AMDGPU_RAS_VIRT_ERROR_COUNT_QUERY;
4457+
} else if ((smu_funcs && smu_funcs->set_debug_mode) || (mca_funcs && mca_funcs->mca_set_debug_mode)) {
44014458
*error_query_mode =
44024459
(con->is_aca_debug_mode) ? AMDGPU_RAS_DIRECT_ERROR_QUERY : AMDGPU_RAS_FIRMWARE_ERROR_QUERY;
4403-
else
4460+
} else {
44044461
*error_query_mode = AMDGPU_RAS_DIRECT_ERROR_QUERY;
4462+
}
44054463

44064464
return true;
44074465
}

drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -365,6 +365,7 @@ enum amdgpu_ras_error_query_mode {
365365
AMDGPU_RAS_INVALID_ERROR_QUERY = 0,
366366
AMDGPU_RAS_DIRECT_ERROR_QUERY = 1,
367367
AMDGPU_RAS_FIRMWARE_ERROR_QUERY = 2,
368+
AMDGPU_RAS_VIRT_ERROR_COUNT_QUERY = 3,
368369
};
369370

370371
/* ras error status reisger fields */

drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -318,6 +318,9 @@ int amdgpu_umc_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *r
318318
if (r)
319319
return r;
320320

321+
if (amdgpu_sriov_vf(adev))
322+
return r;
323+
321324
if (amdgpu_ras_is_supported(adev, ras_block->block)) {
322325
r = amdgpu_irq_get(adev, &adev->gmc.ecc_irq, 0);
323326
if (r)

drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -524,6 +524,8 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev)
524524
adev->unique_id =
525525
((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->uuid;
526526
adev->virt.ras_en_caps.all = ((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->ras_en_caps.all;
527+
adev->virt.ras_telemetry_en_caps.all =
528+
((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->ras_telemetry_en_caps.all;
527529
break;
528530
default:
529531
dev_err(adev->dev, "invalid pf2vf version: 0x%x\n", pf2vf_info->version);
@@ -704,13 +706,17 @@ void amdgpu_virt_exchange_data(struct amdgpu_device *adev)
704706
adev->virt.fw_reserve.p_vf2pf =
705707
(struct amd_sriov_msg_vf2pf_info_header *)
706708
(adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_VF2PF_OFFSET_KB << 10));
709+
adev->virt.fw_reserve.ras_telemetry =
710+
(adev->mman.fw_vram_usage_va + (AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB << 10));
707711
} else if (adev->mman.drv_vram_usage_va) {
708712
adev->virt.fw_reserve.p_pf2vf =
709713
(struct amd_sriov_msg_pf2vf_info_header *)
710714
(adev->mman.drv_vram_usage_va + (AMD_SRIOV_MSG_PF2VF_OFFSET_KB << 10));
711715
adev->virt.fw_reserve.p_vf2pf =
712716
(struct amd_sriov_msg_vf2pf_info_header *)
713717
(adev->mman.drv_vram_usage_va + (AMD_SRIOV_MSG_VF2PF_OFFSET_KB << 10));
718+
adev->virt.fw_reserve.ras_telemetry =
719+
(adev->mman.drv_vram_usage_va + (AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB << 10));
714720
}
715721

716722
amdgpu_virt_read_pf2vf_data(adev);
@@ -1197,3 +1203,133 @@ bool amdgpu_virt_get_ras_capability(struct amdgpu_device *adev)
11971203

11981204
return true;
11991205
}
1206+
1207+
static inline enum amd_sriov_ras_telemetry_gpu_block
1208+
amdgpu_ras_block_to_sriov(struct amdgpu_device *adev, enum amdgpu_ras_block block) {
1209+
switch (block) {
1210+
case AMDGPU_RAS_BLOCK__UMC:
1211+
return RAS_TELEMETRY_GPU_BLOCK_UMC;
1212+
case AMDGPU_RAS_BLOCK__SDMA:
1213+
return RAS_TELEMETRY_GPU_BLOCK_SDMA;
1214+
case AMDGPU_RAS_BLOCK__GFX:
1215+
return RAS_TELEMETRY_GPU_BLOCK_GFX;
1216+
case AMDGPU_RAS_BLOCK__MMHUB:
1217+
return RAS_TELEMETRY_GPU_BLOCK_MMHUB;
1218+
case AMDGPU_RAS_BLOCK__ATHUB:
1219+
return RAS_TELEMETRY_GPU_BLOCK_ATHUB;
1220+
case AMDGPU_RAS_BLOCK__PCIE_BIF:
1221+
return RAS_TELEMETRY_GPU_BLOCK_PCIE_BIF;
1222+
case AMDGPU_RAS_BLOCK__HDP:
1223+
return RAS_TELEMETRY_GPU_BLOCK_HDP;
1224+
case AMDGPU_RAS_BLOCK__XGMI_WAFL:
1225+
return RAS_TELEMETRY_GPU_BLOCK_XGMI_WAFL;
1226+
case AMDGPU_RAS_BLOCK__DF:
1227+
return RAS_TELEMETRY_GPU_BLOCK_DF;
1228+
case AMDGPU_RAS_BLOCK__SMN:
1229+
return RAS_TELEMETRY_GPU_BLOCK_SMN;
1230+
case AMDGPU_RAS_BLOCK__SEM:
1231+
return RAS_TELEMETRY_GPU_BLOCK_SEM;
1232+
case AMDGPU_RAS_BLOCK__MP0:
1233+
return RAS_TELEMETRY_GPU_BLOCK_MP0;
1234+
case AMDGPU_RAS_BLOCK__MP1:
1235+
return RAS_TELEMETRY_GPU_BLOCK_MP1;
1236+
case AMDGPU_RAS_BLOCK__FUSE:
1237+
return RAS_TELEMETRY_GPU_BLOCK_FUSE;
1238+
case AMDGPU_RAS_BLOCK__MCA:
1239+
return RAS_TELEMETRY_GPU_BLOCK_MCA;
1240+
case AMDGPU_RAS_BLOCK__VCN:
1241+
return RAS_TELEMETRY_GPU_BLOCK_VCN;
1242+
case AMDGPU_RAS_BLOCK__JPEG:
1243+
return RAS_TELEMETRY_GPU_BLOCK_JPEG;
1244+
case AMDGPU_RAS_BLOCK__IH:
1245+
return RAS_TELEMETRY_GPU_BLOCK_IH;
1246+
case AMDGPU_RAS_BLOCK__MPIO:
1247+
return RAS_TELEMETRY_GPU_BLOCK_MPIO;
1248+
default:
1249+
dev_err(adev->dev, "Unsupported SRIOV RAS telemetry block 0x%x\n", block);
1250+
return RAS_TELEMETRY_GPU_BLOCK_COUNT;
1251+
}
1252+
}
1253+
1254+
static int amdgpu_virt_cache_host_error_counts(struct amdgpu_device *adev,
1255+
struct amdsriov_ras_telemetry *host_telemetry)
1256+
{
1257+
struct amd_sriov_ras_telemetry_error_count *tmp = NULL;
1258+
uint32_t checksum, used_size;
1259+
1260+
checksum = host_telemetry->header.checksum;
1261+
used_size = host_telemetry->header.used_size;
1262+
1263+
if (used_size > (AMD_SRIOV_RAS_TELEMETRY_SIZE_KB << 10))
1264+
return 0;
1265+
1266+
tmp = kmalloc(used_size, GFP_KERNEL);
1267+
if (!tmp)
1268+
return -ENOMEM;
1269+
1270+
memcpy(tmp, &host_telemetry->body.error_count, used_size);
1271+
1272+
if (checksum != amd_sriov_msg_checksum(tmp, used_size, 0, 0))
1273+
goto out;
1274+
1275+
memcpy(&adev->virt.count_cache, tmp,
1276+
min(used_size, sizeof(adev->virt.count_cache)));
1277+
out:
1278+
kfree(tmp);
1279+
1280+
return 0;
1281+
}
1282+
1283+
static int amdgpu_virt_req_ras_err_count_internal(struct amdgpu_device *adev, bool force_update)
1284+
{
1285+
struct amdgpu_virt *virt = &adev->virt;
1286+
1287+
/* Host allows 15 ras telemetry requests per 60 seconds. Afterwhich, the Host
1288+
* will ignore incoming guest messages. Ratelimit the guest messages to
1289+
* prevent guest self DOS.
1290+
*/
1291+
if (__ratelimit(&adev->virt.ras_telemetry_rs) || force_update) {
1292+
if (!virt->ops->req_ras_err_count(adev))
1293+
amdgpu_virt_cache_host_error_counts(adev,
1294+
adev->virt.fw_reserve.ras_telemetry);
1295+
}
1296+
1297+
return 0;
1298+
}
1299+
1300+
/* Bypass ACA interface and query ECC counts directly from host */
1301+
int amdgpu_virt_req_ras_err_count(struct amdgpu_device *adev, enum amdgpu_ras_block block,
1302+
struct ras_err_data *err_data)
1303+
{
1304+
enum amd_sriov_ras_telemetry_gpu_block sriov_block;
1305+
1306+
sriov_block = amdgpu_ras_block_to_sriov(adev, block);
1307+
1308+
if (sriov_block >= RAS_TELEMETRY_GPU_BLOCK_COUNT ||
1309+
!amdgpu_sriov_ras_telemetry_block_en(adev, sriov_block))
1310+
return -EOPNOTSUPP;
1311+
1312+
/* Host Access may be lost during reset, just return last cached data. */
1313+
if (down_read_trylock(&adev->reset_domain->sem)) {
1314+
amdgpu_virt_req_ras_err_count_internal(adev, false);
1315+
up_read(&adev->reset_domain->sem);
1316+
}
1317+
1318+
err_data->ue_count = adev->virt.count_cache.block[sriov_block].ue_count;
1319+
err_data->ce_count = adev->virt.count_cache.block[sriov_block].ce_count;
1320+
err_data->de_count = adev->virt.count_cache.block[sriov_block].de_count;
1321+
1322+
return 0;
1323+
}
1324+
1325+
int amdgpu_virt_ras_telemetry_post_reset(struct amdgpu_device *adev)
1326+
{
1327+
unsigned long ue_count, ce_count;
1328+
1329+
if (amdgpu_sriov_ras_telemetry_en(adev)) {
1330+
amdgpu_virt_req_ras_err_count_internal(adev, true);
1331+
amdgpu_ras_query_error_count(adev, &ce_count, &ue_count, NULL);
1332+
}
1333+
1334+
return 0;
1335+
}

drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ struct amdgpu_virt_ops {
104104
struct amdgpu_virt_fw_reserve {
105105
struct amd_sriov_msg_pf2vf_info_header *p_pf2vf;
106106
struct amd_sriov_msg_vf2pf_info_header *p_vf2pf;
107+
void *ras_telemetry;
107108
unsigned int checksum_key;
108109
};
109110

@@ -138,6 +139,7 @@ enum AMDGIM_FEATURE_FLAG {
138139
/* MES info */
139140
AMDGIM_FEATURE_MES_INFO_ENABLE = (1 << 8),
140141
AMDGIM_FEATURE_RAS_CAPS = (1 << 9),
142+
AMDGIM_FEATURE_RAS_TELEMETRY = (1 << 10),
141143
};
142144

143145
enum AMDGIM_REG_ACCESS_FLAG {
@@ -280,6 +282,10 @@ struct amdgpu_virt {
280282
struct mutex rlcg_reg_lock;
281283

282284
union amd_sriov_ras_caps ras_en_caps;
285+
union amd_sriov_ras_caps ras_telemetry_en_caps;
286+
287+
struct ratelimit_state ras_telemetry_rs;
288+
struct amd_sriov_ras_telemetry_error_count count_cache;
283289
};
284290

285291
struct amdgpu_video_codec_info;
@@ -327,6 +333,12 @@ struct amdgpu_video_codec_info;
327333
#define amdgpu_sriov_ras_caps_en(adev) \
328334
((adev)->virt.gim_feature & AMDGIM_FEATURE_RAS_CAPS)
329335

336+
#define amdgpu_sriov_ras_telemetry_en(adev) \
337+
(((adev)->virt.gim_feature & AMDGIM_FEATURE_RAS_TELEMETRY) && (adev)->virt.fw_reserve.ras_telemetry)
338+
339+
#define amdgpu_sriov_ras_telemetry_block_en(adev, sriov_blk) \
340+
(amdgpu_sriov_ras_telemetry_en((adev)) && (adev)->virt.ras_telemetry_en_caps.all & BIT(sriov_blk))
341+
330342
static inline bool is_virtual_machine(void)
331343
{
332344
#if defined(CONFIG_X86)
@@ -391,4 +403,7 @@ bool amdgpu_virt_get_rlcg_reg_access_flag(struct amdgpu_device *adev,
391403
bool write, u32 *rlcg_flag);
392404
u32 amdgpu_virt_rlcg_reg_rw(struct amdgpu_device *adev, u32 offset, u32 v, u32 flag, u32 xcc_id);
393405
bool amdgpu_virt_get_ras_capability(struct amdgpu_device *adev);
406+
int amdgpu_virt_req_ras_err_count(struct amdgpu_device *adev, enum amdgpu_ras_block block,
407+
struct ras_err_data *err_data);
408+
int amdgpu_virt_ras_telemetry_post_reset(struct amdgpu_device *adev);
394409
#endif

0 commit comments

Comments
 (0)