Commit 1a6bbc4

Merge tag 'drm-xe-fixes-2024-11-08' of https://gitlab.freedesktop.org/drm/xe/kernel into drm-fixes
Driver Changes:
- Fix ccs_mode setting for Xe2 and later (Balasubramani)
- Synchronize ccs_mode setting with client creation (Balasubramani)
- Apply scheduling WA for LNL in additional places as needed (Nirmoy)
- Fix leak and lock handling in error paths of xe_exec ioctl (Matthew Brost)
- Fix GGTT allocation leak leading to eventual crash in SR-IOV (Michal Wajdeczko)
- Move run_ticks update out of job handling to avoid synchronization with reader (Lucas)

Signed-off-by: Dave Airlie <[email protected]>

From: Lucas De Marchi <[email protected]>
Link: https://patchwork.freedesktop.org/patch/msgid/4ffcebtluaaaohquxfyf5babpihmtscxwad3jjmt5nggwh2xpm@ztw67ucywttg
2 parents 9b984a7 + 514447a commit 1a6bbc4

12 files changed: +54 / -41 lines changed

drivers/gpu/drm/xe/regs/xe_gt_regs.h

Lines changed: 1 addition & 1 deletion
@@ -517,7 +517,7 @@
  * [4-6] RSVD
  * [7] Disabled
  */
-#define CCS_MODE				XE_REG(0x14804)
+#define CCS_MODE				XE_REG(0x14804, XE_REG_OPTION_MASKED)
 #define   CCS_MODE_CSLICE_0_3_MASK		REG_GENMASK(11, 0) /* 3 bits per cslice */
 #define   CCS_MODE_CSLICE_MASK			0x7 /* CCS0-3 + rsvd */
 #define   CCS_MODE_CSLICE_WIDTH		ilog2(CCS_MODE_CSLICE_MASK + 1)
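A note on this change: with XE_REG_OPTION_MASKED the register is treated as a masked register, where the upper 16 bits of a write select which of the lower 16 bits the hardware actually updates; unselected bits are left alone. That is why the xe_gt_ccs_mode.c hunk further down adds mode |= CCS_MODE_CSLICE_0_3_MASK << 16 before the write. The snippet below is a minimal, self-contained sketch of that encoding only (plain C, illustrative names and values, not xe code):

#include <stdint.h>
#include <stdio.h>

/* Build a value for a masked register: the top 16 bits enable updates of
 * the corresponding low 16 bits; everything else is left untouched. */
static uint32_t masked_field_set(uint32_t mask, uint32_t value)
{
        return (mask << 16) | (value & mask);
}

int main(void)
{
        uint32_t cslice_mask = 0xfff;   /* stand-in for CCS_MODE_CSLICE_0_3_MASK */
        uint32_t mode = 0x7;            /* example cslice configuration */

        printf("CCS_MODE write = 0x%08x\n", masked_field_set(cslice_mask, mode));
        return 0;
}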

drivers/gpu/drm/xe/xe_device.c

Lines changed: 0 additions & 10 deletions
@@ -87,10 +87,6 @@ static int xe_file_open(struct drm_device *dev, struct drm_file *file)
 	mutex_init(&xef->exec_queue.lock);
 	xa_init_flags(&xef->exec_queue.xa, XA_FLAGS_ALLOC1);
 
-	spin_lock(&xe->clients.lock);
-	xe->clients.count++;
-	spin_unlock(&xe->clients.lock);
-
 	file->driver_priv = xef;
 	kref_init(&xef->refcount);
 
@@ -107,17 +103,12 @@ static int xe_file_open(struct drm_device *dev, struct drm_file *file)
 static void xe_file_destroy(struct kref *ref)
 {
 	struct xe_file *xef = container_of(ref, struct xe_file, refcount);
-	struct xe_device *xe = xef->xe;
 
 	xa_destroy(&xef->exec_queue.xa);
 	mutex_destroy(&xef->exec_queue.lock);
 	xa_destroy(&xef->vm.xa);
 	mutex_destroy(&xef->vm.lock);
 
-	spin_lock(&xe->clients.lock);
-	xe->clients.count--;
-	spin_unlock(&xe->clients.lock);
-
 	xe_drm_client_put(xef->client);
 	kfree(xef->process_name);
 	kfree(xef);
@@ -333,7 +324,6 @@ struct xe_device *xe_device_create(struct pci_dev *pdev,
 	xe->info.force_execlist = xe_modparam.force_execlist;
 
 	spin_lock_init(&xe->irq.lock);
-	spin_lock_init(&xe->clients.lock);
 
 	init_waitqueue_head(&xe->ufence_wq);
 

drivers/gpu/drm/xe/xe_device.h

Lines changed: 14 additions & 0 deletions
@@ -178,4 +178,18 @@ void xe_device_declare_wedged(struct xe_device *xe);
 struct xe_file *xe_file_get(struct xe_file *xef);
 void xe_file_put(struct xe_file *xef);
 
+/*
+ * Occasionally it is seen that the G2H worker starts running after a delay of more than
+ * a second even after being queued and activated by the Linux workqueue subsystem. This
+ * leads to G2H timeout error. The root cause of issue lies with scheduling latency of
+ * Lunarlake Hybrid CPU. Issue disappears if we disable Lunarlake atom cores from BIOS
+ * and this is beyond xe kmd.
+ *
+ * TODO: Drop this change once workqueue scheduling delay issue is fixed on LNL Hybrid CPU.
+ */
+#define LNL_FLUSH_WORKQUEUE(wq__)	\
+	flush_workqueue(wq__)
+#define LNL_FLUSH_WORK(wrk__)	\
+	flush_work(wrk__)
+
 #endif
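Both macros expand to the ordinary flush_workqueue()/flush_work() calls; wrapping them only makes the LNL workaround call sites easy to find and drop later, and changes nothing on other platforms. For reference, the intended call-site shape, modeled on the xe_guc_ct.c hunk at the end of this commit (a sketch of that existing pattern, not a new API):

        /* If the wait timed out, flush the G2H worker once before treating it
         * as a real timeout; on LNL the worker may simply not have run yet. */
        ret = wait_event_timeout(ct->g2h_fence_wq, g2h_fence.done, HZ);
        if (!ret) {
                LNL_FLUSH_WORK(&ct->g2h_worker);
                if (g2h_fence.done)
                        xe_gt_warn(gt, "G2H fence %u, action %04x, done\n",
                                   g2h_fence.seqno, action[0]);
        }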

drivers/gpu/drm/xe/xe_device_types.h

Lines changed: 0 additions & 9 deletions
@@ -353,15 +353,6 @@ struct xe_device {
 		struct workqueue_struct *wq;
 	} sriov;
 
-	/** @clients: drm clients info */
-	struct {
-		/** @clients.lock: Protects drm clients info */
-		spinlock_t lock;
-
-		/** @clients.count: number of drm clients */
-		u64 count;
-	} clients;
-
 	/** @usm: unified memory state */
 	struct {
 		/** @usm.asid: convert a ASID to VM */

drivers/gpu/drm/xe/xe_exec.c

Lines changed: 9 additions & 4 deletions
@@ -132,12 +132,16 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 	if (XE_IOCTL_DBG(xe, !q))
 		return -ENOENT;
 
-	if (XE_IOCTL_DBG(xe, q->flags & EXEC_QUEUE_FLAG_VM))
-		return -EINVAL;
+	if (XE_IOCTL_DBG(xe, q->flags & EXEC_QUEUE_FLAG_VM)) {
+		err = -EINVAL;
+		goto err_exec_queue;
+	}
 
 	if (XE_IOCTL_DBG(xe, args->num_batch_buffer &&
-			 q->width != args->num_batch_buffer))
-		return -EINVAL;
+			 q->width != args->num_batch_buffer)) {
+		err = -EINVAL;
+		goto err_exec_queue;
+	}
 
 	if (XE_IOCTL_DBG(xe, q->ops->reset_status(q))) {
 		err = -ECANCELED;
@@ -220,6 +224,7 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
 	fence = xe_sync_in_fence_get(syncs, num_syncs, q, vm);
 	if (IS_ERR(fence)) {
 		err = PTR_ERR(fence);
+		xe_vm_unlock(vm);
 		goto err_unlock_list;
 	}
 	for (i = 0; i < num_syncs; i++)
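The two -EINVAL checks above run after the exec queue lookup has taken a reference on the queue, so returning directly leaked that reference; routing them through err_exec_queue drops it. The added xe_vm_unlock() covers the fence-creation failure, where jumping to err_unlock_list alone would have left the VM locked. Below is a small, self-contained sketch of the unwind idiom the fix restores (plain C, illustrative names, not xe code):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* Once a resource is acquired, every later failure path must jump to the
 * label that releases it rather than returning directly. */
static int submit_sketch(int bad_flags)
{
        int err = 0;
        void *queue_ref = malloc(1);    /* stands in for the looked-up exec queue */

        if (!queue_ref)
                return -ENOMEM;         /* nothing acquired yet, a plain return is fine */

        if (bad_flags) {
                err = -EINVAL;          /* a direct return here would leak queue_ref */
                goto err_exec_queue;
        }

        printf("submitted\n");

err_exec_queue:
        free(queue_ref);                /* the one place the reference is dropped */
        return err;
}

int main(void)
{
        printf("submit_sketch(1) = %d\n", submit_sketch(1));   /* -22, and nothing leaks */
        return 0;
}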

drivers/gpu/drm/xe/xe_exec_queue.c

Lines changed: 6 additions & 0 deletions
@@ -260,8 +260,14 @@ void xe_exec_queue_fini(struct xe_exec_queue *q)
 {
 	int i;
 
+	/*
+	 * Before releasing our ref to lrc and xef, accumulate our run ticks
+	 */
+	xe_exec_queue_update_run_ticks(q);
+
 	for (i = 0; i < q->width; ++i)
 		xe_lrc_put(q->lrc[i]);
+
 	__xe_exec_queue_free(q);
 }
 

drivers/gpu/drm/xe/xe_gt_ccs_mode.c

Lines changed: 11 additions & 4 deletions
@@ -68,6 +68,12 @@ static void __xe_gt_apply_ccs_mode(struct xe_gt *gt, u32 num_engines)
 		}
 	}
 
+	/*
+	 * Mask bits need to be set for the register. Though only Xe2+
+	 * platforms require setting of mask bits, it won't harm for older
+	 * platforms as these bits are unused there.
+	 */
+	mode |= CCS_MODE_CSLICE_0_3_MASK << 16;
 	xe_mmio_write32(gt, CCS_MODE, mode);
 
 	xe_gt_dbg(gt, "CCS_MODE=%x config:%08x, num_engines:%d, num_slices:%d\n",
@@ -133,9 +139,10 @@ ccs_mode_store(struct device *kdev, struct device_attribute *attr,
 	}
 
 	/* CCS mode can only be updated when there are no drm clients */
-	spin_lock(&xe->clients.lock);
-	if (xe->clients.count) {
-		spin_unlock(&xe->clients.lock);
+	mutex_lock(&xe->drm.filelist_mutex);
+	if (!list_empty(&xe->drm.filelist)) {
+		mutex_unlock(&xe->drm.filelist_mutex);
+		xe_gt_dbg(gt, "Rejecting compute mode change as there are active drm clients\n");
 		return -EBUSY;
 	}
 
@@ -146,7 +153,7 @@ ccs_mode_store(struct device *kdev, struct device_attribute *attr,
 		xe_gt_reset_async(gt);
 	}
 
-	spin_unlock(&xe->clients.lock);
+	mutex_unlock(&xe->drm.filelist_mutex);
 
 	return count;
 }
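Rather than a driver-private client counter, the check now leans on bookkeeping the DRM core already does: every open file is linked on drm_device.filelist under filelist_mutex, so an empty list means no active clients. Note the code above intentionally keeps filelist_mutex held until the new mode is committed and the GT reset is queued, so no client can open the device between the check and the update. The helper below is hypothetical and illustrates the empty-list check only; it is not an xe or DRM core function:

/* Hypothetical sketch: true when no DRM clients have the device open.
 * Unlike ccs_mode_store() above, this drops the mutex right away, so it
 * would not by itself close the race against a client opening the device. */
static bool no_drm_clients_sketch(struct drm_device *dev)
{
        bool idle;

        mutex_lock(&dev->filelist_mutex);
        idle = list_empty(&dev->filelist);
        mutex_unlock(&dev->filelist_mutex);

        return idle;
}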

drivers/gpu/drm/xe/xe_gt_sriov_pf_config.c

Lines changed: 3 additions & 1 deletion
@@ -387,6 +387,8 @@ static void pf_release_ggtt(struct xe_tile *tile, struct xe_ggtt_node *node)
 		 * the xe_ggtt_clear() called by below xe_ggtt_remove_node().
 		 */
 		xe_ggtt_node_remove(node, false);
+	} else {
+		xe_ggtt_node_fini(node);
 	}
 }
 
@@ -442,7 +444,7 @@ static int pf_provision_vf_ggtt(struct xe_gt *gt, unsigned int vfid, u64 size)
 	config->ggtt_region = node;
 	return 0;
 err:
-	xe_ggtt_node_fini(node);
+	pf_release_ggtt(tile, node);
 	return err;
 }
 

drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c

Lines changed: 2 additions & 0 deletions
@@ -72,6 +72,8 @@ static void xe_gt_tlb_fence_timeout(struct work_struct *work)
 	struct xe_device *xe = gt_to_xe(gt);
 	struct xe_gt_tlb_invalidation_fence *fence, *next;
 
+	LNL_FLUSH_WORK(&gt->uc.guc.ct.g2h_worker);
+
 	spin_lock_irq(&gt->tlb_invalidation.pending_lock);
 	list_for_each_entry_safe(fence, next,
 				 &gt->tlb_invalidation.pending_fences, link) {

drivers/gpu/drm/xe/xe_guc_ct.c

Lines changed: 1 addition & 10 deletions
@@ -897,17 +897,8 @@ static int guc_ct_send_recv(struct xe_guc_ct *ct, const u32 *action, u32 len,
 
 	ret = wait_event_timeout(ct->g2h_fence_wq, g2h_fence.done, HZ);
 
-	/*
-	 * Occasionally it is seen that the G2H worker starts running after a delay of more than
-	 * a second even after being queued and activated by the Linux workqueue subsystem. This
-	 * leads to G2H timeout error. The root cause of issue lies with scheduling latency of
-	 * Lunarlake Hybrid CPU. Issue dissappears if we disable Lunarlake atom cores from BIOS
-	 * and this is beyond xe kmd.
-	 *
-	 * TODO: Drop this change once workqueue scheduling delay issue is fixed on LNL Hybrid CPU.
-	 */
 	if (!ret) {
-		flush_work(&ct->g2h_worker);
+		LNL_FLUSH_WORK(&ct->g2h_worker);
 		if (g2h_fence.done) {
 			xe_gt_warn(gt, "G2H fence %u, action %04x, done\n",
 				   g2h_fence.seqno, action[0]);
