Skip to content

Commit 78e6e46

Browse files
committed
Merge tag 'drm-xe-next-fixes-2024-07-18' of https://gitlab.freedesktop.org/drm/xe/kernel into drm-next
- Xe_exec ioctl minor fix on sync entry cleanup upon error (Ashutosh)
- SRIOV: limit VF LMEM provisioning (Michal)
- Wedge mode fixes (Brost)

Signed-off-by: Dave Airlie <[email protected]>
From: Rodrigo Vivi <[email protected]>
Link: https://patchwork.freedesktop.org/patch/msgid/[email protected]
2 parents 7d4ecf3 + 90936a0 commit 78e6e46

File tree

11 files changed

+102
-20
lines changed

11 files changed

+102
-20
lines changed

drivers/gpu/drm/xe/xe_device.c

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -854,6 +854,13 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address)
854854
return address & GENMASK_ULL(xe->info.va_bits - 1, 0);
855855
}
856856

857+
/*
 * DRM-managed cleanup action: xe_device_declare_wedged() registers this with
 * drmm_add_action_or_reset(), passing the xe device as @arg. @drm is unused.
 * Drops one runtime PM reference via xe_pm_runtime_put().
 */
static void xe_device_wedged_fini(struct drm_device *drm, void *arg)
858+
{
859+
struct xe_device *xe = arg;
860+
861+
/* NOTE(review): presumably pairs with the xe_pm_runtime_get_noresume() taken in xe_device_declare_wedged() - confirm. */
xe_pm_runtime_put(xe);
862+
}
863+
857864
/**
858865
* xe_device_declare_wedged - Declare device wedged
859866
* @xe: xe device instance
@@ -870,11 +877,21 @@ u64 xe_device_uncanonicalize_addr(struct xe_device *xe, u64 address)
870877
*/
871878
void xe_device_declare_wedged(struct xe_device *xe)
872879
{
880+
struct xe_gt *gt;
881+
u8 id;
882+
873883
/* wedged.mode == 0 turns wedging off entirely: log at debug level and do nothing. */
if (xe->wedged.mode == 0) {
874884
drm_dbg(&xe->drm, "Wedged mode is forcibly disabled\n");
875885
return;
876886
}
877887

888+
/*
 * NOTE(review): the cleanup action is registered before the runtime PM
 * reference below is taken. The _or_reset variant is documented to invoke
 * the action immediately when registration fails, which here would run
 * xe_pm_runtime_put() with no matching get - confirm, and consider taking
 * the reference first.
 * NOTE(review): this registration and the get below sit outside the
 * atomic_xchg() check, so they run on every call, not only on the first
 * wedge - confirm this is intended.
 */
if (drmm_add_action_or_reset(&xe->drm, xe_device_wedged_fini, xe)) {
889+
drm_err(&xe->drm, "Failed to register xe_device_wedged_fini clean-up. Although device is wedged.\n");
890+
return;
891+
}
892+
893+
/* Runtime PM reference released later by xe_device_wedged_fini(). */
xe_pm_runtime_get_noresume(xe);
894+
878895
/* Only the caller that flips wedged.flag from 0 to 1 requests FLR on fini and logs the error. */
if (!atomic_xchg(&xe->wedged.flag, 1)) {
879896
xe->needs_flr_on_fini = true;
880897
drm_err(&xe->drm,
@@ -883,4 +900,7 @@ void xe_device_declare_wedged(struct xe_device *xe)
883900
"Please file a _new_ bug report at https://gitlab.freedesktop.org/drm/xe/kernel/issues/new\n",
884901
dev_name(xe->drm.dev));
885902
}
903+
904+
/* Propagate the wedged state to every GT of the device. */
for_each_gt(gt, xe, id)
905+
xe_gt_declare_wedged(gt);
886906
}

drivers/gpu/drm/xe/xe_exec.c

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
118118
u64 addresses[XE_HW_ENGINE_MAX_INSTANCE];
119119
struct drm_gpuvm_exec vm_exec = {.extra.fn = xe_exec_fn};
120120
struct drm_exec *exec = &vm_exec.exec;
121-
u32 i, num_syncs = 0, num_ufence = 0;
121+
u32 i, num_syncs, num_ufence = 0;
122122
struct xe_sched_job *job;
123123
struct xe_vm *vm;
124124
bool write_locked, skip_retry = false;
@@ -156,15 +156,15 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
156156

157157
vm = q->vm;
158158

159-
for (i = 0; i < args->num_syncs; i++) {
160-
err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs++],
161-
&syncs_user[i], SYNC_PARSE_FLAG_EXEC |
159+
for (num_syncs = 0; num_syncs < args->num_syncs; num_syncs++) {
160+
err = xe_sync_entry_parse(xe, xef, &syncs[num_syncs],
161+
&syncs_user[num_syncs], SYNC_PARSE_FLAG_EXEC |
162162
(xe_vm_in_lr_mode(vm) ?
163163
SYNC_PARSE_FLAG_LR_MODE : 0));
164164
if (err)
165165
goto err_syncs;
166166

167-
if (xe_sync_is_ufence(&syncs[i]))
167+
if (xe_sync_is_ufence(&syncs[num_syncs]))
168168
num_ufence++;
169169
}
170170

@@ -325,8 +325,8 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
325325
if (err == -EAGAIN && !skip_retry)
326326
goto retry;
327327
err_syncs:
328-
for (i = 0; i < num_syncs; i++)
329-
xe_sync_entry_cleanup(&syncs[i]);
328+
while (num_syncs--)
329+
xe_sync_entry_cleanup(&syncs[num_syncs]);
330330
kfree(syncs);
331331
err_exec_queue:
332332
xe_exec_queue_put(q);

drivers/gpu/drm/xe/xe_gt.c

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -904,3 +904,18 @@ struct xe_hw_engine *xe_gt_any_hw_engine(struct xe_gt *gt)
904904

905905
return NULL;
906906
}
907+
908+
/**
909+
* xe_gt_declare_wedged() - Declare GT wedged
910+
* @gt: the GT object
911+
*
912+
* Wedge the GT which stops all submission, saves desired debug state, and
913+
* cleans up anything which could timeout.
914+
*/
915+
void xe_gt_declare_wedged(struct xe_gt *gt)
916+
{
917+
/* Sanity check: this path is only expected when wedged.mode is non-zero. */
xe_gt_assert(gt, gt_to_xe(gt)->wedged.mode);
918+
919+
/* Wedge the GT's uC first, then call the GT TLB invalidation reset helper. */
xe_uc_declare_wedged(&gt->uc);
920+
xe_gt_tlb_invalidation_reset(gt);
921+
}

drivers/gpu/drm/xe/xe_gt.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ struct xe_gt *xe_gt_alloc(struct xe_tile *tile);
3737
int xe_gt_init_hwconfig(struct xe_gt *gt);
3838
int xe_gt_init_early(struct xe_gt *gt);
3939
int xe_gt_init(struct xe_gt *gt);
40+
void xe_gt_declare_wedged(struct xe_gt *gt);
4041
int xe_gt_record_default_lrcs(struct xe_gt *gt);
4142

4243
/**

drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1543,6 +1543,7 @@ static u64 pf_estimate_fair_lmem(struct xe_gt *gt, unsigned int num_vfs)
15431543
u64 fair;
15441544

15451545
fair = div_u64(available, num_vfs);
1546+
fair = rounddown_pow_of_two(fair); /* XXX: ttm_vram_mgr & drm_buddy limitation */
15461547
fair = ALIGN_DOWN(fair, alignment);
15471548
#ifdef MAX_FAIR_LMEM
15481549
fair = min_t(u64, MAX_FAIR_LMEM, fair);

drivers/gpu/drm/xe/xe_guc.c

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1178,3 +1178,19 @@ void xe_guc_print_info(struct xe_guc *guc, struct drm_printer *p)
11781178
xe_guc_ct_print(&guc->ct, p, false);
11791179
xe_guc_submit_print(guc, p);
11801180
}
1181+
1182+
/**
1183+
* xe_guc_declare_wedged() - Declare GuC wedged
1184+
* @guc: the GuC object
1185+
*
1186+
* Wedge the GuC which stops all submission, saves desired debug state, and
1187+
* cleans up anything which could timeout.
1188+
*/
1189+
void xe_guc_declare_wedged(struct xe_guc *guc)
1190+
{
1191+
/* Sanity check: this path is only expected when wedged.mode is non-zero. */
xe_gt_assert(guc_to_gt(guc), guc_to_xe(guc)->wedged.mode);
1192+
1193+
/*
 * Call order: GuC reset-prepare, stop the CT channel, then wedge GuC
 * submission (xe_guc_submit_wedge() takes references on the exec queues).
 */
xe_guc_reset_prepare(guc);
1194+
xe_guc_ct_stop(&guc->ct);
1195+
xe_guc_submit_wedge(guc);
1196+
}

drivers/gpu/drm/xe/xe_guc.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ void xe_guc_reset_wait(struct xe_guc *guc);
3737
void xe_guc_stop_prepare(struct xe_guc *guc);
3838
void xe_guc_stop(struct xe_guc *guc);
3939
int xe_guc_start(struct xe_guc *guc);
40+
void xe_guc_declare_wedged(struct xe_guc *guc);
4041

4142
static inline u16 xe_engine_class_to_guc_class(enum xe_engine_class class)
4243
{

drivers/gpu/drm/xe/xe_guc_submit.c

Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -861,36 +861,47 @@ static void xe_guc_exec_queue_trigger_cleanup(struct xe_exec_queue *q)
861861
xe_sched_tdr_queue_imm(&q->guc->sched);
862862
}
863863

864-
static bool guc_submit_hint_wedged(struct xe_guc *guc)
864+
/**
865+
* xe_guc_submit_wedge() - Wedge GuC submission
866+
* @guc: the GuC object
867+
*
868+
* Save the state of the exec queues registered with the GuC by taking a ref to each queue.
869+
* Register a DRMM handler to drop refs upon driver unload.
870+
*/
871+
void xe_guc_submit_wedge(struct xe_guc *guc)
865872
{
866873
struct xe_device *xe = guc_to_xe(guc);
867874
struct xe_exec_queue *q;
868875
unsigned long index;
869876
int err;
870877

871-
if (xe->wedged.mode != 2)
872-
return false;
873-
874-
if (xe_device_wedged(xe))
875-
return true;
876-
877-
xe_device_declare_wedged(xe);
878-
879-
xe_guc_submit_reset_prepare(guc);
880-
xe_guc_ct_stop(&guc->ct);
878+
/* Sanity check: this path is only expected when wedged.mode is non-zero. */
xe_gt_assert(guc_to_gt(guc), guc_to_xe(guc)->wedged.mode);
881879

882880
/* Register guc_submit_wedged_fini as a DRM-managed action before taking any queue references. */
err = drmm_add_action_or_reset(&guc_to_xe(guc)->drm,
883881
guc_submit_wedged_fini, guc);
884882
if (err) {
885883
drm_err(&xe->drm, "Failed to register xe_guc_submit clean-up on wedged.mode=2. Although device is wedged.\n");
886-
return true; /* Device is wedged anyway */
884+
return;
887885
}
888886

889887
/*
 * Under submission_state.lock, walk exec_queue_lookup; for each queue on
 * which xe_exec_queue_get_unless_zero() succeeds, keep that reference and
 * mark the queue wedged.
 */
mutex_lock(&guc->submission_state.lock);
890888
xa_for_each(&guc->submission_state.exec_queue_lookup, index, q)
891889
if (xe_exec_queue_get_unless_zero(q))
892890
set_exec_queue_wedged(q);
893891
mutex_unlock(&guc->submission_state.lock);
892+
}
893+
894+
/*
 * Decide whether the device should be treated as wedged.
 *
 * Returns false when wedged.mode is not 2. Otherwise returns true, calling
 * xe_device_declare_wedged() first if the device is not already wedged.
 */
static bool guc_submit_hint_wedged(struct xe_guc *guc)
895+
{
896+
struct xe_device *xe = guc_to_xe(guc);
897+
898+
if (xe->wedged.mode != 2)
899+
return false;
900+
901+
/* Already wedged: nothing more to declare. */
if (xe_device_wedged(xe))
902+
return true;
903+
904+
xe_device_declare_wedged(xe);
894905

895906
return true;
896907
}
@@ -1677,7 +1688,8 @@ int xe_guc_submit_reset_prepare(struct xe_guc *guc)
16771688

16781689
void xe_guc_submit_reset_wait(struct xe_guc *guc)
16791690
{
1680-
wait_event(guc->ct.wq, !guc_read_stopped(guc));
1691+
/*
 * Sleep on guc->ct.wq until guc_read_stopped() is false. The added (+)
 * condition also ends the wait once xe_device_wedged() reports true.
 */
wait_event(guc->ct.wq, xe_device_wedged(guc_to_xe(guc)) ||
1692+
!guc_read_stopped(guc));
16811693
}
16821694

16831695
void xe_guc_submit_stop(struct xe_guc *guc)

drivers/gpu/drm/xe/xe_guc_submit.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ int xe_guc_submit_reset_prepare(struct xe_guc *guc);
1818
void xe_guc_submit_reset_wait(struct xe_guc *guc);
1919
void xe_guc_submit_stop(struct xe_guc *guc);
2020
int xe_guc_submit_start(struct xe_guc *guc);
21+
void xe_guc_submit_wedge(struct xe_guc *guc);
2122

2223
int xe_guc_sched_done_handler(struct xe_guc *guc, u32 *msg, u32 len);
2324
int xe_guc_deregister_done_handler(struct xe_guc *guc, u32 *msg, u32 len);

drivers/gpu/drm/xe/xe_uc.c

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -300,3 +300,17 @@ void xe_uc_remove(struct xe_uc *uc)
300300
{
301301
xe_gsc_remove(&uc->gsc);
302302
}
303+
304+
/**
305+
* xe_uc_declare_wedged() - Declare UC wedged
306+
* @uc: the UC object
307+
*
308+
* Wedge the UC which stops all submission, saves desired debug state, and
309+
* cleans up anything which could timeout.
310+
*/
311+
void xe_uc_declare_wedged(struct xe_uc *uc)
312+
{
313+
/* Sanity check: this path is only expected when wedged.mode is non-zero. */
xe_gt_assert(uc_to_gt(uc), uc_to_xe(uc)->wedged.mode);
314+
315+
/* Only the GuC is wedged from here; no other uC component is touched in this function. */
xe_guc_declare_wedged(&uc->guc);
316+
}

0 commit comments

Comments
 (0)