Skip to content

Commit 60c58d7

Browse files
vskvorts authored and alexdeucher committed
drm/amdgpu: Update SRIOV Exchange Headers for RAS Telemetry Support
The SRIOV PF/VF data exchange region is extended by 64KB for VF RAS Telemetry data. Add host RAS Telemetry enable capability bitfields. Add a new VF message, REQ_RAS_ERROR_COUNT; the host response data will be populated in the RAS Telemetry region.

Signed-off-by: Victor Skvortsov <[email protected]>
Reviewed-by: Zhigang Luo <[email protected]>
Signed-off-by: Alex Deucher <[email protected]>
1 parent acbbbd2 commit 60c58d7

File tree

2 files changed

+115
-19
lines changed

2 files changed

+115
-19
lines changed

drivers/gpu/drm/amd/amdgpu/amdgv_sriovmsg.h

Lines changed: 112 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -28,17 +28,21 @@
2828
#define AMD_SRIOV_MSG_VBIOS_SIZE_KB 64
2929
#define AMD_SRIOV_MSG_DATAEXCHANGE_OFFSET_KB AMD_SRIOV_MSG_VBIOS_SIZE_KB
3030
#define AMD_SRIOV_MSG_DATAEXCHANGE_SIZE_KB 4
31-
31+
#define AMD_SRIOV_MSG_TMR_OFFSET_KB 2048
32+
#define AMD_SRIOV_MSG_BAD_PAGE_SIZE_KB 2
33+
#define AMD_SRIOV_RAS_TELEMETRY_SIZE_KB 64
3234
/*
3335
* layout
34-
* 0 64KB 65KB 66KB
35-
* | VBIOS | PF2VF | VF2PF | Bad Page | ...
36-
* | 64KB | 1KB | 1KB |
36+
* 0 64KB 65KB 66KB 68KB 132KB
37+
* | VBIOS | PF2VF | VF2PF | Bad Page | RAS Telemetry Region | ...
38+
* | 64KB | 1KB | 1KB | 2KB | 64KB | ...
3739
*/
40+
3841
#define AMD_SRIOV_MSG_SIZE_KB 1
3942
#define AMD_SRIOV_MSG_PF2VF_OFFSET_KB AMD_SRIOV_MSG_DATAEXCHANGE_OFFSET_KB
4043
#define AMD_SRIOV_MSG_VF2PF_OFFSET_KB (AMD_SRIOV_MSG_PF2VF_OFFSET_KB + AMD_SRIOV_MSG_SIZE_KB)
4144
#define AMD_SRIOV_MSG_BAD_PAGE_OFFSET_KB (AMD_SRIOV_MSG_VF2PF_OFFSET_KB + AMD_SRIOV_MSG_SIZE_KB)
45+
#define AMD_SRIOV_MSG_RAS_TELEMETRY_OFFSET_KB (AMD_SRIOV_MSG_BAD_PAGE_OFFSET_KB + AMD_SRIOV_MSG_BAD_PAGE_SIZE_KB)
4246

4347
/*
4448
* PF2VF history log:
@@ -86,30 +90,59 @@ enum amd_sriov_ucode_engine_id {
8690

8791
union amd_sriov_msg_feature_flags {
8892
struct {
89-
uint32_t error_log_collect : 1;
90-
uint32_t host_load_ucodes : 1;
91-
uint32_t host_flr_vramlost : 1;
92-
uint32_t mm_bw_management : 1;
93-
uint32_t pp_one_vf_mode : 1;
94-
uint32_t reg_indirect_acc : 1;
95-
uint32_t av1_support : 1;
96-
uint32_t vcn_rb_decouple : 1;
97-
uint32_t mes_info_enable : 1;
98-
uint32_t reserved : 23;
93+
uint32_t error_log_collect : 1;
94+
uint32_t host_load_ucodes : 1;
95+
uint32_t host_flr_vramlost : 1;
96+
uint32_t mm_bw_management : 1;
97+
uint32_t pp_one_vf_mode : 1;
98+
uint32_t reg_indirect_acc : 1;
99+
uint32_t av1_support : 1;
100+
uint32_t vcn_rb_decouple : 1;
101+
uint32_t mes_info_dump_enable : 1;
102+
uint32_t ras_caps : 1;
103+
uint32_t ras_telemetry : 1;
104+
uint32_t reserved : 21;
99105
} flags;
100106
uint32_t all;
101107
};
102108

103109
union amd_sriov_reg_access_flags {
104110
struct {
105-
uint32_t vf_reg_access_ih : 1;
106-
uint32_t vf_reg_access_mmhub : 1;
107-
uint32_t vf_reg_access_gc : 1;
108-
uint32_t reserved : 29;
111+
uint32_t vf_reg_access_ih : 1;
112+
uint32_t vf_reg_access_mmhub : 1;
113+
uint32_t vf_reg_access_gc : 1;
114+
uint32_t reserved : 29;
109115
} flags;
110116
uint32_t all;
111117
};
112118

119+
/*
 * RAS capability mask carried in the PF2VF message (used there for the
 * ras_en_caps and ras_telemetry_en_caps fields).
 *
 * `bits` exposes 20 named single-bit flags followed by 44 reserved bits,
 * for 64 bits in total; `all` is the same storage viewed as one raw
 * 64-bit word.
 *
 * The 19 block_* flags are declared in the same order as the values of
 * enum amd_sriov_ras_telemetry_gpu_block (UMC first, MPIO last).
 *
 * NOTE(review): "poison_propogation_mode" misspells "propagation"; the
 * identifier is left untouched here because renaming it would change the
 * interface seen by other code.
 */
union amd_sriov_ras_caps {
120+
struct {
121+
uint64_t block_umc : 1;
122+
uint64_t block_sdma : 1;
123+
uint64_t block_gfx : 1;
124+
uint64_t block_mmhub : 1;
125+
uint64_t block_athub : 1;
126+
uint64_t block_pcie_bif : 1;
127+
uint64_t block_hdp : 1;
128+
uint64_t block_xgmi_wafl : 1;
129+
uint64_t block_df : 1;
130+
uint64_t block_smn : 1;
131+
uint64_t block_sem : 1;
132+
uint64_t block_mp0 : 1;
133+
uint64_t block_mp1 : 1;
134+
uint64_t block_fuse : 1;
135+
uint64_t block_mca : 1;
136+
uint64_t block_vcn : 1;
137+
uint64_t block_jpeg : 1;
138+
uint64_t block_ih : 1;
139+
uint64_t block_mpio : 1;
140+
uint64_t poison_propogation_mode : 1;
141+
uint64_t reserved : 44;
142+
} bits;
143+
uint64_t all;
144+
};
145+
113146
union amd_sriov_msg_os_info {
114147
struct {
115148
uint32_t windows : 1;
@@ -158,7 +191,7 @@ struct amd_sriov_msg_pf2vf_info_header {
158191
uint32_t reserved[2];
159192
};
160193

161-
#define AMD_SRIOV_MSG_PF2VF_INFO_FILLED_SIZE (49)
194+
#define AMD_SRIOV_MSG_PF2VF_INFO_FILLED_SIZE (55)
162195
struct amd_sriov_msg_pf2vf_info {
163196
/* header contains size and version */
164197
struct amd_sriov_msg_pf2vf_info_header header;
@@ -211,6 +244,12 @@ struct amd_sriov_msg_pf2vf_info {
211244
uint32_t pcie_atomic_ops_support_flags;
212245
/* Portion of GPU memory occupied by VF. MAX value is 65535, but set to uint32_t to maintain alignment with reserved size */
213246
uint32_t gpu_capacity;
247+
/* vf bdf on host pci tree for debug only */
248+
uint32_t bdf_on_host;
249+
uint32_t more_bp; //Reserved for future use.
250+
union amd_sriov_ras_caps ras_en_caps;
251+
union amd_sriov_ras_caps ras_telemetry_en_caps;
252+
214253
/* reserved */
215254
uint32_t reserved[256 - AMD_SRIOV_MSG_PF2VF_INFO_FILLED_SIZE];
216255
} __packed;
@@ -283,8 +322,12 @@ enum amd_sriov_mailbox_request_message {
283322
MB_REQ_MSG_REL_GPU_FINI_ACCESS,
284323
MB_REQ_MSG_REQ_GPU_RESET_ACCESS,
285324
MB_REQ_MSG_REQ_GPU_INIT_DATA,
325+
MB_REQ_MSG_PSP_VF_CMD_RELAY,
286326

287327
MB_REQ_MSG_LOG_VF_ERROR = 200,
328+
MB_REQ_MSG_READY_TO_RESET = 201,
329+
MB_REQ_MSG_RAS_POISON = 202,
330+
MB_REQ_RAS_ERROR_COUNT = 203,
288331
};
289332

290333
/* mailbox message send from host to guest */
@@ -297,10 +340,60 @@ enum amd_sriov_mailbox_response_message {
297340
MB_RES_MSG_FAIL,
298341
MB_RES_MSG_QUERY_ALIVE,
299342
MB_RES_MSG_GPU_INIT_DATA_READY,
343+
MB_RES_MSG_RAS_ERROR_COUNT_READY = 11,
300344

301345
MB_RES_MSG_TEXT_MESSAGE = 255
302346
};
303347

348+
/*
 * Index of each GPU block that has an entry in the RAS telemetry data.
 *
 * Values are assigned explicitly and run contiguously from 0 to 18.
 * RAS_TELEMETRY_GPU_BLOCK_COUNT (19) is not a block itself: it is the
 * number of entries and is used as the length of the block[] array in
 * struct amd_sriov_ras_telemetry_error_count.
 */
enum amd_sriov_ras_telemetry_gpu_block {
349+
RAS_TELEMETRY_GPU_BLOCK_UMC = 0,
350+
RAS_TELEMETRY_GPU_BLOCK_SDMA = 1,
351+
RAS_TELEMETRY_GPU_BLOCK_GFX = 2,
352+
RAS_TELEMETRY_GPU_BLOCK_MMHUB = 3,
353+
RAS_TELEMETRY_GPU_BLOCK_ATHUB = 4,
354+
RAS_TELEMETRY_GPU_BLOCK_PCIE_BIF = 5,
355+
RAS_TELEMETRY_GPU_BLOCK_HDP = 6,
356+
RAS_TELEMETRY_GPU_BLOCK_XGMI_WAFL = 7,
357+
RAS_TELEMETRY_GPU_BLOCK_DF = 8,
358+
RAS_TELEMETRY_GPU_BLOCK_SMN = 9,
359+
RAS_TELEMETRY_GPU_BLOCK_SEM = 10,
360+
RAS_TELEMETRY_GPU_BLOCK_MP0 = 11,
361+
RAS_TELEMETRY_GPU_BLOCK_MP1 = 12,
362+
RAS_TELEMETRY_GPU_BLOCK_FUSE = 13,
363+
RAS_TELEMETRY_GPU_BLOCK_MCA = 14,
364+
RAS_TELEMETRY_GPU_BLOCK_VCN = 15,
365+
RAS_TELEMETRY_GPU_BLOCK_JPEG = 16,
366+
RAS_TELEMETRY_GPU_BLOCK_IH = 17,
367+
RAS_TELEMETRY_GPU_BLOCK_MPIO = 18,
368+
RAS_TELEMETRY_GPU_BLOCK_COUNT = 19,
369+
};
370+
371+
/*
 * Header that precedes the body in struct amdsriov_ras_telemetry.
 * Four uint32_t words in total: checksum, used_size and two reserved words.
 *
 * NOTE(review): presumably `checksum` covers the telemetry payload and
 * `used_size` is the number of valid bytes written by the host - confirm
 * against the host-side implementation; neither is visible here.
 */
struct amd_sriov_ras_telemetry_header {
372+
uint32_t checksum;
373+
uint32_t used_size;
374+
uint32_t reserved[2];
375+
};
376+
377+
/*
 * Per-block error counters for the RAS telemetry body.
 *
 * block[] has RAS_TELEMETRY_GPU_BLOCK_COUNT entries, so it can be indexed
 * by enum amd_sriov_ras_telemetry_gpu_block. Each entry holds twelve
 * uint32_t words: three counters, three matching overflow counters and
 * six reserved words.
 *
 * NOTE(review): ce/ue/de presumably stand for correctable, uncorrectable
 * and deferred errors - confirm against the RAS documentation.
 */
struct amd_sriov_ras_telemetry_error_count {
378+
struct {
379+
uint32_t ce_count;
380+
uint32_t ue_count;
381+
uint32_t de_count;
382+
uint32_t ce_overflow_count;
383+
uint32_t ue_overflow_count;
384+
uint32_t de_overflow_count;
385+
uint32_t reserved[6];
386+
} block[RAS_TELEMETRY_GPU_BLOCK_COUNT];
387+
};
388+
389+
/*
 * Top-level layout of the RAS telemetry data: a fixed header followed by
 * a body union. The union currently has a single member, error_count.
 *
 * NOTE(review): the tag is spelled "amdsriov_" while the sibling types use
 * "amd_sriov_"; it is left as is because renaming would change the
 * interface seen by other code.
 */
struct amdsriov_ras_telemetry {
390+
struct amd_sriov_ras_telemetry_header header;
391+
392+
union {
393+
struct amd_sriov_ras_telemetry_error_count error_count;
394+
} body;
395+
};
396+
304397
/* version data stored in MAILBOX_MSGBUF_RCV_DW1 for future expansion */
305398
enum amd_sriov_gpu_init_data_version {
306399
GPU_INIT_DATA_READY_V1 = 1,

drivers/gpu/drm/amd/amdgpu/mxgpu_nv.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ enum idh_request {
4040
IDH_LOG_VF_ERROR = 200,
4141
IDH_READY_TO_RESET = 201,
4242
IDH_RAS_POISON = 202,
43+
IDH_REQ_RAS_ERROR_COUNT = 203,
4344
};
4445

4546
enum idh_event {
@@ -54,6 +55,8 @@ enum idh_event {
5455
IDH_RAS_POISON_READY,
5556
IDH_PF_SOFT_FLR_NOTIFICATION,
5657
IDH_RAS_ERROR_DETECTED,
58+
IDH_RAS_ERROR_COUNT_READY = 11,
59+
5760
IDH_TEXT_MESSAGE = 255,
5861
};
5962

0 commit comments

Comments
 (0)