Skip to content

Commit ecb6336

Browse files
zhanjun authored and
mattrope committed
drm/xe/guc: Plumb GuC-capture into dev coredump
When we decide to kill a job (from guc_exec_queue_timedout_job), we could end up with 4 possible scenarios at the starting point of this decision: 1. the guc-captured register-dump is already there. 2. the driver is wedged.mode > 1, so GuC-engine-reset / GuC-err-capture will not happen. 3. the user has started the driver in execlist-submission mode. 4. the guc-captured register-dump is not ready yet so we force GuC to kill that context now, but: A. we don't know yet if GuC will be successful on the engine-reset and get the guc-err-capture, else kmd will do a manual reset later OR B. guc will be successful and we will get a guc-err-capture shortly. So to accommodate the scenarios of 2 and 4A, we will need to do a manual KMD capture first (which is not reliable in guc-submission mode) and decide later if we need to use that for the cases of 2 or 4A. So this flow is part of the implementation for this patch. Provide xe_guc_capture_get_reg_desc_list to get the register descriptor list. Add manual capture by reading from the hw engine if GuC capture is not ready. If it becomes ready at a later time, GuC sourced data will be used. Although there may only be a small delay between (1) the check for whether guc-err-capture is available at the start of guc_exec_queue_timedout_job and (2) the decision on using a valid guc-err-capture or manual-capture, let's not take any chances and lock the matching node down so it doesn't get re-claimed if the GuC-Err-Capture subsystem is running out of pre-cached nodes. Signed-off-by: Zhanjun Dong <[email protected]> Reviewed-by: Alan Previn <[email protected]> Signed-off-by: Matt Roper <[email protected]> Link: https://patchwork.freedesktop.org/patch/msgid/[email protected]
1 parent 8bfc496 commit ecb6336

12 files changed

+508
-103
lines changed

drivers/gpu/drm/xe/xe_devcoredump.c

Lines changed: 7 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "xe_force_wake.h"
1818
#include "xe_gt.h"
1919
#include "xe_gt_printk.h"
20+
#include "xe_guc_capture.h"
2021
#include "xe_guc_ct.h"
2122
#include "xe_guc_log.h"
2223
#include "xe_guc_submit.h"
@@ -134,6 +135,9 @@ static void xe_devcoredump_snapshot_free(struct xe_devcoredump_snapshot *ss)
134135
xe_guc_ct_snapshot_free(ss->guc.ct);
135136
ss->guc.ct = NULL;
136137

138+
xe_guc_capture_put_matched_nodes(&ss->gt->uc.guc);
139+
ss->matched_node = NULL;
140+
137141
xe_guc_exec_queue_snapshot_free(ss->ge);
138142
ss->ge = NULL;
139143

@@ -217,6 +221,7 @@ static void xe_devcoredump_free(void *data)
217221
/* To prevent stale data on next snapshot, clear everything */
218222
memset(&coredump->snapshot, 0, sizeof(coredump->snapshot));
219223
coredump->captured = false;
224+
coredump->job = NULL;
220225
drm_info(&coredump_to_xe(coredump)->drm,
221226
"Xe device coredump has been deleted.\n");
222227
}
@@ -227,8 +232,6 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
227232
struct xe_devcoredump_snapshot *ss = &coredump->snapshot;
228233
struct xe_exec_queue *q = job->q;
229234
struct xe_guc *guc = exec_queue_to_guc(q);
230-
struct xe_hw_engine *hwe;
231-
enum xe_hw_engine_id id;
232235
u32 adj_logical_mask = q->logical_mask;
233236
u32 width_mask = (0x1 << q->width) - 1;
234237
const char *process_name = "no process";
@@ -244,6 +247,7 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
244247
strscpy(ss->process_name, process_name);
245248

246249
ss->gt = q->gt;
250+
coredump->job = job;
247251
INIT_WORK(&ss->work, xe_devcoredump_deferred_snap_work);
248252

249253
cookie = dma_fence_begin_signalling();
@@ -266,14 +270,7 @@ static void devcoredump_snapshot(struct xe_devcoredump *coredump,
266270
ss->job = xe_sched_job_snapshot_capture(job);
267271
ss->vm = xe_vm_snapshot_capture(q->vm);
268272

269-
for_each_hw_engine(hwe, q->gt, id) {
270-
if (hwe->class != q->hwe->class ||
271-
!(BIT(hwe->logical_instance) & adj_logical_mask)) {
272-
ss->hwe[id] = NULL;
273-
continue;
274-
}
275-
ss->hwe[id] = xe_hw_engine_snapshot_capture(hwe);
276-
}
273+
xe_engine_snapshot_capture_for_job(job);
277274

278275
queue_work(system_unbound_wq, &ss->work);
279276

drivers/gpu/drm/xe/xe_devcoredump_types.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,12 @@ struct xe_devcoredump_snapshot {
4949
struct xe_hw_engine_snapshot *hwe[XE_NUM_HW_ENGINES];
5050
/** @job: Snapshot of job state */
5151
struct xe_sched_job_snapshot *job;
52+
/**
53+
* @matched_node: The matched capture node for timedout job
54+
* this single-node tracker works because devcoredump will always only
55+
* produce one hw-engine capture per devcoredump event
56+
*/
57+
struct __guc_capture_parsed_output *matched_node;
5258
/** @vm: Snapshot of VM state */
5359
struct xe_vm_snapshot *vm;
5460

@@ -74,6 +80,8 @@ struct xe_devcoredump {
7480
bool captured;
7581
/** @snapshot: Snapshot is captured at time of the first crash */
7682
struct xe_devcoredump_snapshot snapshot;
83+
/** @job: Point to the faulting job */
84+
struct xe_sched_job *job;
7785
};
7886

7987
#endif

drivers/gpu/drm/xe/xe_gt_mcr.c

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -365,6 +365,19 @@ void xe_gt_mcr_get_dss_steering(struct xe_gt *gt, unsigned int dss, u16 *group,
365365
*instance = dss % gt->steering_dss_per_grp;
366366
}
367367

368+
/**
369+
* xe_gt_mcr_steering_info_to_dss_id - Get DSS ID from group/instance steering
370+
* @gt: GT structure
371+
* @group: steering group ID
372+
* @instance: steering instance ID
373+
*
* The id is computed as group * dss_per_group(gt) + instance.
*
374+
* Return: the converted DSS id.
375+
*/
376+
u32 xe_gt_mcr_steering_info_to_dss_id(struct xe_gt *gt, u16 group, u16 instance)
377+
{
378+
return group * dss_per_group(gt) + instance;
379+
}
380+
368381
static void init_steering_dss(struct xe_gt *gt)
369382
{
370383
gt->steering_dss_per_grp = dss_per_group(gt);

drivers/gpu/drm/xe/xe_gt_mcr.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ void xe_gt_mcr_multicast_write(struct xe_gt *gt, struct xe_reg_mcr mcr_reg,
2828

2929
void xe_gt_mcr_steering_dump(struct xe_gt *gt, struct drm_printer *p);
3030
void xe_gt_mcr_get_dss_steering(struct xe_gt *gt, unsigned int dss, u16 *group, u16 *instance);
31+
u32 xe_gt_mcr_steering_info_to_dss_id(struct xe_gt *gt, u16 group, u16 instance);
3132

3233
/*
3334
* Loop over each DSS and determine the group and instance IDs that

0 commit comments

Comments
 (0)