Skip to content

Commit a39d082

Browse files
dceraolo authored and Thomas Hellström committed
drm/xe: Fix early wedge on GuC load failure
When the GuC fails to load we declare the device wedged. However, the very first GuC load attempt on GT0 (from xe_gt_init_hwconfig) is done before the GT1 GuC objects are initialized, so things go bad when the wedge code attempts to cleanup GT1. To fix this, check the initialization status in the functions called during wedge.

Fixes: 7dbe8af ("drm/xe: Wedge the entire device")
Signed-off-by: Daniele Ceraolo Spurio <[email protected]>
Cc: Rodrigo Vivi <[email protected]>
Cc: Matthew Brost <[email protected]>
Cc: Jonathan Cavitt <[email protected]>
Cc: Lucas De Marchi <[email protected]>
Cc: Zhanjun Dong <[email protected]>
Cc: [email protected] # v6.12+: 1e1981b: drm/xe: Fix taking invalid lock on wedge
Cc: [email protected] # v6.12+
Reviewed-by: Jonathan Cavitt <[email protected]>
Reviewed-by: Lucas De Marchi <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Lucas De Marchi <[email protected]>
(cherry picked from commit 0b93b7d)
Signed-off-by: Thomas Hellström <[email protected]>
1 parent 87a15c8 commit a39d082

File tree

4 files changed

+21
-2
lines changed

4 files changed

+21
-2
lines changed

drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,14 @@ void xe_gt_tlb_invalidation_reset(struct xe_gt *gt)
137137
struct xe_gt_tlb_invalidation_fence *fence, *next;
138138
int pending_seqno;
139139

140+
/*
141+
* we can get here before the CTs are even initialized if we're wedging
142+
* very early, in which case there are not going to be any pending
143+
* fences so we can bail immediately.
144+
*/
145+
if (!xe_guc_ct_initialized(&gt->uc.guc.ct))
146+
return;
147+
140148
/*
141149
* CT channel is already disabled at this point. No new TLB requests can
142150
* appear.

drivers/gpu/drm/xe/xe_guc_ct.c

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -514,6 +514,9 @@ void xe_guc_ct_disable(struct xe_guc_ct *ct)
514514
*/
515515
void xe_guc_ct_stop(struct xe_guc_ct *ct)
516516
{
517+
if (!xe_guc_ct_initialized(ct))
518+
return;
519+
517520
xe_guc_ct_set_state(ct, XE_GUC_CT_STATE_STOPPED);
518521
stop_g2h_handler(ct);
519522
}
@@ -760,7 +763,7 @@ static int __guc_ct_send_locked(struct xe_guc_ct *ct, const u32 *action,
760763
u16 seqno;
761764
int ret;
762765

763-
xe_gt_assert(gt, ct->state != XE_GUC_CT_STATE_NOT_INITIALIZED);
766+
xe_gt_assert(gt, xe_guc_ct_initialized(ct));
764767
xe_gt_assert(gt, !g2h_len || !g2h_fence);
765768
xe_gt_assert(gt, !num_g2h || !g2h_fence);
766769
xe_gt_assert(gt, !g2h_len || num_g2h);
@@ -1344,7 +1347,7 @@ static int g2h_read(struct xe_guc_ct *ct, u32 *msg, bool fast_path)
13441347
u32 action;
13451348
u32 *hxg;
13461349

1347-
xe_gt_assert(gt, ct->state != XE_GUC_CT_STATE_NOT_INITIALIZED);
1350+
xe_gt_assert(gt, xe_guc_ct_initialized(ct));
13481351
lockdep_assert_held(&ct->fast_lock);
13491352

13501353
if (ct->state == XE_GUC_CT_STATE_DISABLED)

drivers/gpu/drm/xe/xe_guc_ct.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,11 @@ void xe_guc_ct_snapshot_print(struct xe_guc_ct_snapshot *snapshot, struct drm_pr
2222
void xe_guc_ct_snapshot_free(struct xe_guc_ct_snapshot *snapshot);
2323
void xe_guc_ct_print(struct xe_guc_ct *ct, struct drm_printer *p, bool want_ctb);
2424

25+
static inline bool xe_guc_ct_initialized(struct xe_guc_ct *ct)
26+
{
27+
return ct->state != XE_GUC_CT_STATE_NOT_INITIALIZED;
28+
}
29+
2530
static inline bool xe_guc_ct_enabled(struct xe_guc_ct *ct)
2631
{
2732
return ct->state == XE_GUC_CT_STATE_ENABLED;

drivers/gpu/drm/xe/xe_guc_submit.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1762,6 +1762,9 @@ int xe_guc_submit_reset_prepare(struct xe_guc *guc)
17621762
{
17631763
int ret;
17641764

1765+
if (!guc->submission_state.initialized)
1766+
return 0;
1767+
17651768
/*
17661769
* Using an atomic here rather than submission_state.lock as this
17671770
* function can be called while holding the CT lock (engine reset

0 commit comments

Comments
 (0)