Skip to content

Commit a5b5ab3

Browse files
committed
Merge tag 'drm-xe-fixes-2024-04-04' of https://gitlab.freedesktop.org/drm/xe/kernel into drm-fixes
- Stop using system_unbound_wq for preempt fences, as this can cause starvation when reaching more than max_active defined by workqueue - Fix saving unordered rebinding fences by attaching them as kernel fences to the vm's resv - Fix TLB invalidation fences completing out of order - Move rebind TLB invalidation to the ring ops to reduce the latency Signed-off-by: Dave Airlie <[email protected]> From: Lucas De Marchi <[email protected]> Link: https://patchwork.freedesktop.org/patch/msgid/tizan6wdpxu4ayudeikjglxdgzmnhdzj3li3z2pgkierjtozzw@lbfddeg43a7h
2 parents 4cf09f1 + 77a0110 commit a5b5ab3

15 files changed

+140
-145
lines changed

drivers/gpu/drm/xe/xe_device.c

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,9 @@ static void xe_device_destroy(struct drm_device *dev, void *dummy)
193193
{
194194
struct xe_device *xe = to_xe_device(dev);
195195

196+
if (xe->preempt_fence_wq)
197+
destroy_workqueue(xe->preempt_fence_wq);
198+
196199
if (xe->ordered_wq)
197200
destroy_workqueue(xe->ordered_wq);
198201

@@ -258,9 +261,15 @@ struct xe_device *xe_device_create(struct pci_dev *pdev,
258261
INIT_LIST_HEAD(&xe->pinned.external_vram);
259262
INIT_LIST_HEAD(&xe->pinned.evicted);
260263

264+
xe->preempt_fence_wq = alloc_ordered_workqueue("xe-preempt-fence-wq", 0);
261265
xe->ordered_wq = alloc_ordered_workqueue("xe-ordered-wq", 0);
262266
xe->unordered_wq = alloc_workqueue("xe-unordered-wq", 0, 0);
263-
if (!xe->ordered_wq || !xe->unordered_wq) {
267+
if (!xe->ordered_wq || !xe->unordered_wq ||
268+
!xe->preempt_fence_wq) {
269+
/*
270+
* Cleanup done in xe_device_destroy via
271+
* drmm_add_action_or_reset register above
272+
*/
264273
drm_err(&xe->drm, "Failed to allocate xe workqueues\n");
265274
err = -ENOMEM;
266275
goto err;

drivers/gpu/drm/xe/xe_device_types.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,9 @@ struct xe_device {
363363
/** @ufence_wq: user fence wait queue */
364364
wait_queue_head_t ufence_wq;
365365

366+
/** @preempt_fence_wq: used to serialize preempt fences */
367+
struct workqueue_struct *preempt_fence_wq;
368+
366369
/** @ordered_wq: used to serialize compute mode resume */
367370
struct workqueue_struct *ordered_wq;
368371

drivers/gpu/drm/xe/xe_exec.c

Lines changed: 7 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -94,48 +94,16 @@
9494
* Unlock all
9595
*/
9696

97+
/*
98+
* Add validation and rebinding to the drm_exec locking loop, since both can
99+
* trigger eviction which may require sleeping dma_resv locks.
100+
*/
97101
static int xe_exec_fn(struct drm_gpuvm_exec *vm_exec)
98102
{
99103
struct xe_vm *vm = container_of(vm_exec->vm, struct xe_vm, gpuvm);
100-
struct drm_gem_object *obj;
101-
unsigned long index;
102-
int num_fences;
103-
int ret;
104-
105-
ret = drm_gpuvm_validate(vm_exec->vm, &vm_exec->exec);
106-
if (ret)
107-
return ret;
108-
109-
/*
110-
* 1 fence slot for the final submit, and 1 more for every per-tile for
111-
* GPU bind and 1 extra for CPU bind. Note that there are potentially
112-
* many vma per object/dma-resv, however the fence slot will just be
113-
* re-used, since they are largely the same timeline and the seqno
114-
* should be in order. In the case of CPU bind there is dummy fence used
115-
* for all CPU binds, so no need to have a per-tile slot for that.
116-
*/
117-
num_fences = 1 + 1 + vm->xe->info.tile_count;
118104

119-
/*
120-
* We don't know upfront exactly how many fence slots we will need at
121-
* the start of the exec, since the TTM bo_validate above can consume
122-
* numerous fence slots. Also due to how the dma_resv_reserve_fences()
123-
* works it only ensures that at least that many fence slots are
124-
* available i.e if there are already 10 slots available and we reserve
125-
* two more, it can just noop without reserving anything. With this it
126-
* is quite possible that TTM steals some of the fence slots and then
127-
* when it comes time to do the vma binding and final exec stage we are
128-
* lacking enough fence slots, leading to some nasty BUG_ON() when
129-
* adding the fences. Hence just add our own fences here, after the
130-
* validate stage.
131-
*/
132-
drm_exec_for_each_locked_object(&vm_exec->exec, index, obj) {
133-
ret = dma_resv_reserve_fences(obj->resv, num_fences);
134-
if (ret)
135-
return ret;
136-
}
137-
138-
return 0;
105+
/* The fence slot added here is intended for the exec sched job. */
106+
return xe_vm_validate_rebind(vm, &vm_exec->exec, 1);
139107
}
140108

141109
int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
@@ -152,7 +120,6 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
152120
struct drm_exec *exec = &vm_exec.exec;
153121
u32 i, num_syncs = 0, num_ufence = 0;
154122
struct xe_sched_job *job;
155-
struct dma_fence *rebind_fence;
156123
struct xe_vm *vm;
157124
bool write_locked, skip_retry = false;
158125
ktime_t end = 0;
@@ -290,39 +257,7 @@ int xe_exec_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
290257
goto err_exec;
291258
}
292259

293-
/*
294-
* Rebind any invalidated userptr or evicted BOs in the VM, non-compute
295-
* VM mode only.
296-
*/
297-
rebind_fence = xe_vm_rebind(vm, false);
298-
if (IS_ERR(rebind_fence)) {
299-
err = PTR_ERR(rebind_fence);
300-
goto err_put_job;
301-
}
302-
303-
/*
304-
* We store the rebind_fence in the VM so subsequent execs don't get
305-
* scheduled before the rebinds of userptrs / evicted BOs is complete.
306-
*/
307-
if (rebind_fence) {
308-
dma_fence_put(vm->rebind_fence);
309-
vm->rebind_fence = rebind_fence;
310-
}
311-
if (vm->rebind_fence) {
312-
if (test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
313-
&vm->rebind_fence->flags)) {
314-
dma_fence_put(vm->rebind_fence);
315-
vm->rebind_fence = NULL;
316-
} else {
317-
dma_fence_get(vm->rebind_fence);
318-
err = drm_sched_job_add_dependency(&job->drm,
319-
vm->rebind_fence);
320-
if (err)
321-
goto err_put_job;
322-
}
323-
}
324-
325-
/* Wait behind munmap style rebinds */
260+
/* Wait behind rebinds */
326261
if (!xe_vm_in_lr_mode(vm)) {
327262
err = drm_sched_job_add_resv_dependencies(&job->drm,
328263
xe_vm_resv(vm),

drivers/gpu/drm/xe/xe_exec_queue_types.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,11 @@ struct xe_exec_queue {
148148
const struct xe_ring_ops *ring_ops;
149149
/** @entity: DRM sched entity for this exec queue (1 to 1 relationship) */
150150
struct drm_sched_entity *entity;
151+
/**
152+
* @tlb_flush_seqno: The seqno of the last rebind tlb flush performed
153+
* Protected by @vm's resv. Unused if @vm == NULL.
154+
*/
155+
u64 tlb_flush_seqno;
151156
/** @lrc: logical ring context for this exec queue */
152157
struct xe_lrc lrc[];
153158
};

drivers/gpu/drm/xe/xe_gt_pagefault.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -100,10 +100,9 @@ static int xe_pf_begin(struct drm_exec *exec, struct xe_vma *vma,
100100
{
101101
struct xe_bo *bo = xe_vma_bo(vma);
102102
struct xe_vm *vm = xe_vma_vm(vma);
103-
unsigned int num_shared = 2; /* slots for bind + move */
104103
int err;
105104

106-
err = xe_vm_prepare_vma(exec, vma, num_shared);
105+
err = xe_vm_lock_vma(exec, vma);
107106
if (err)
108107
return err;
109108

drivers/gpu/drm/xe/xe_gt_tlb_invalidation.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,6 @@ int xe_gt_tlb_invalidation_init(struct xe_gt *gt)
6161
INIT_LIST_HEAD(&gt->tlb_invalidation.pending_fences);
6262
spin_lock_init(&gt->tlb_invalidation.pending_lock);
6363
spin_lock_init(&gt->tlb_invalidation.lock);
64-
gt->tlb_invalidation.fence_context = dma_fence_context_alloc(1);
6564
INIT_DELAYED_WORK(&gt->tlb_invalidation.fence_tdr,
6665
xe_gt_tlb_fence_timeout);
6766

drivers/gpu/drm/xe/xe_gt_types.h

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -177,13 +177,6 @@ struct xe_gt {
177177
* xe_gt_tlb_fence_timeout after the timeut interval is over.
178178
*/
179179
struct delayed_work fence_tdr;
180-
/** @tlb_invalidation.fence_context: context for TLB invalidation fences */
181-
u64 fence_context;
182-
/**
183-
* @tlb_invalidation.fence_seqno: seqno to TLB invalidation fences, protected by
184-
* tlb_invalidation.lock
185-
*/
186-
u32 fence_seqno;
187180
/** @tlb_invalidation.lock: protects TLB invalidation fences */
188181
spinlock_t lock;
189182
} tlb_invalidation;

drivers/gpu/drm/xe/xe_preempt_fence.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ static bool preempt_fence_enable_signaling(struct dma_fence *fence)
4949
struct xe_exec_queue *q = pfence->q;
5050

5151
pfence->error = q->ops->suspend(q);
52-
queue_work(system_unbound_wq, &pfence->preempt_work);
52+
queue_work(q->vm->xe->preempt_fence_wq, &pfence->preempt_work);
5353
return true;
5454
}
5555

drivers/gpu/drm/xe/xe_pt.c

Lines changed: 20 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1135,8 +1135,7 @@ static int invalidation_fence_init(struct xe_gt *gt,
11351135
spin_lock_irq(&gt->tlb_invalidation.lock);
11361136
dma_fence_init(&ifence->base.base, &invalidation_fence_ops,
11371137
&gt->tlb_invalidation.lock,
1138-
gt->tlb_invalidation.fence_context,
1139-
++gt->tlb_invalidation.fence_seqno);
1138+
dma_fence_context_alloc(1), 1);
11401139
spin_unlock_irq(&gt->tlb_invalidation.lock);
11411140

11421141
INIT_LIST_HEAD(&ifence->base.link);
@@ -1236,6 +1235,13 @@ __xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue
12361235
err = xe_pt_prepare_bind(tile, vma, entries, &num_entries);
12371236
if (err)
12381237
goto err;
1238+
1239+
err = dma_resv_reserve_fences(xe_vm_resv(vm), 1);
1240+
if (!err && !xe_vma_has_no_bo(vma) && !xe_vma_bo(vma)->vm)
1241+
err = dma_resv_reserve_fences(xe_vma_bo(vma)->ttm.base.resv, 1);
1242+
if (err)
1243+
goto err;
1244+
12391245
xe_tile_assert(tile, num_entries <= ARRAY_SIZE(entries));
12401246

12411247
xe_vm_dbg_print_entries(tile_to_xe(tile), entries, num_entries);
@@ -1254,11 +1260,13 @@ __xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue
12541260
* non-faulting LR, in particular on user-space batch buffer chaining,
12551261
* it needs to be done here.
12561262
*/
1257-
if ((rebind && !xe_vm_in_lr_mode(vm) && !vm->batch_invalidate_tlb) ||
1258-
(!rebind && xe_vm_has_scratch(vm) && xe_vm_in_preempt_fence_mode(vm))) {
1263+
if ((!rebind && xe_vm_has_scratch(vm) && xe_vm_in_preempt_fence_mode(vm))) {
12591264
ifence = kzalloc(sizeof(*ifence), GFP_KERNEL);
12601265
if (!ifence)
12611266
return ERR_PTR(-ENOMEM);
1267+
} else if (rebind && !xe_vm_in_lr_mode(vm)) {
1268+
/* We bump also if batch_invalidate_tlb is true */
1269+
vm->tlb_flush_seqno++;
12621270
}
12631271

12641272
rfence = kzalloc(sizeof(*rfence), GFP_KERNEL);
@@ -1297,7 +1305,7 @@ __xe_pt_bind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queue
12971305
}
12981306

12991307
/* add shared fence now for pagetable delayed destroy */
1300-
dma_resv_add_fence(xe_vm_resv(vm), fence, !rebind &&
1308+
dma_resv_add_fence(xe_vm_resv(vm), fence, rebind ||
13011309
last_munmap_rebind ?
13021310
DMA_RESV_USAGE_KERNEL :
13031311
DMA_RESV_USAGE_BOOKKEEP);
@@ -1576,6 +1584,7 @@ __xe_pt_unbind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queu
15761584
struct dma_fence *fence = NULL;
15771585
struct invalidation_fence *ifence;
15781586
struct xe_range_fence *rfence;
1587+
int err;
15791588

15801589
LLIST_HEAD(deferred);
15811590

@@ -1593,6 +1602,12 @@ __xe_pt_unbind_vma(struct xe_tile *tile, struct xe_vma *vma, struct xe_exec_queu
15931602
xe_pt_calc_rfence_interval(vma, &unbind_pt_update, entries,
15941603
num_entries);
15951604

1605+
err = dma_resv_reserve_fences(xe_vm_resv(vm), 1);
1606+
if (!err && !xe_vma_has_no_bo(vma) && !xe_vma_bo(vma)->vm)
1607+
err = dma_resv_reserve_fences(xe_vma_bo(vma)->ttm.base.resv, 1);
1608+
if (err)
1609+
return ERR_PTR(err);
1610+
15961611
ifence = kzalloc(sizeof(*ifence), GFP_KERNEL);
15971612
if (!ifence)
15981613
return ERR_PTR(-ENOMEM);

drivers/gpu/drm/xe/xe_ring_ops.c

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -219,10 +219,9 @@ static void __emit_job_gen12_simple(struct xe_sched_job *job, struct xe_lrc *lrc
219219
{
220220
u32 dw[MAX_JOB_SIZE_DW], i = 0;
221221
u32 ppgtt_flag = get_ppgtt_flag(job);
222-
struct xe_vm *vm = job->q->vm;
223222
struct xe_gt *gt = job->q->gt;
224223

225-
if (vm && vm->batch_invalidate_tlb) {
224+
if (job->ring_ops_flush_tlb) {
226225
dw[i++] = preparser_disable(true);
227226
i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
228227
seqno, true, dw, i);
@@ -270,7 +269,6 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc,
270269
struct xe_gt *gt = job->q->gt;
271270
struct xe_device *xe = gt_to_xe(gt);
272271
bool decode = job->q->class == XE_ENGINE_CLASS_VIDEO_DECODE;
273-
struct xe_vm *vm = job->q->vm;
274272

275273
dw[i++] = preparser_disable(true);
276274

@@ -282,13 +280,13 @@ static void __emit_job_gen12_video(struct xe_sched_job *job, struct xe_lrc *lrc,
282280
i = emit_aux_table_inv(gt, VE0_AUX_INV, dw, i);
283281
}
284282

285-
if (vm && vm->batch_invalidate_tlb)
283+
if (job->ring_ops_flush_tlb)
286284
i = emit_flush_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
287285
seqno, true, dw, i);
288286

289287
dw[i++] = preparser_disable(false);
290288

291-
if (!vm || !vm->batch_invalidate_tlb)
289+
if (!job->ring_ops_flush_tlb)
292290
i = emit_store_imm_ggtt(xe_lrc_start_seqno_ggtt_addr(lrc),
293291
seqno, dw, i);
294292

@@ -317,7 +315,6 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job,
317315
struct xe_gt *gt = job->q->gt;
318316
struct xe_device *xe = gt_to_xe(gt);
319317
bool lacks_render = !(gt->info.engine_mask & XE_HW_ENGINE_RCS_MASK);
320-
struct xe_vm *vm = job->q->vm;
321318
u32 mask_flags = 0;
322319

323320
dw[i++] = preparser_disable(true);
@@ -327,7 +324,7 @@ static void __emit_job_gen12_render_compute(struct xe_sched_job *job,
327324
mask_flags = PIPE_CONTROL_3D_ENGINE_FLAGS;
328325

329326
/* See __xe_pt_bind_vma() for a discussion on TLB invalidations. */
330-
i = emit_pipe_invalidate(mask_flags, vm && vm->batch_invalidate_tlb, dw, i);
327+
i = emit_pipe_invalidate(mask_flags, job->ring_ops_flush_tlb, dw, i);
331328

332329
/* hsdes: 1809175790 */
333330
if (has_aux_ccs(xe))

0 commit comments

Comments
 (0)