
Commit c3f1ed9

ickle authored and jnikula committed
drm/i915/gt: Allow temporary suspension of inflight requests
In order to support out-of-line error capture, we need to remove the
active request from HW and put it to one side while a worker compresses
and stores all the details associated with that request. (As that
compression may take an arbitrary user-controlled amount of time, we
want to let the engine continue running on other workloads while the
hanging request is dumped.) Not only do we need to remove the active
request, but we also have to remove its context and all requests that
were dependent on it (both in flight, queued and future submission).

Finally once the capture is complete, we need to be able to resubmit
the request and its dependents and allow them to execute.

v2: Replace stack recursion with a simple list.
v3: Check all the parents, not just the first, when searching for a
stuck ancestor!

References: https://gitlab.freedesktop.org/drm/intel/issues/738
Signed-off-by: Chris Wilson <[email protected]>
Cc: Tvrtko Ursulin <[email protected]>
Reviewed-by: Tvrtko Ursulin <[email protected]>
Link: https://patchwork.freedesktop.org/patch/msgid/[email protected]
(cherry picked from commit 32ff621)
Signed-off-by: Jani Nikula <[email protected]>
1 parent 9e2750f commit c3f1ed9
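End to end, the flow the commit message describes looks roughly like the sketch below. This is not code from the commit: the capture worker itself lands in a follow-up patch, which is why execlists_hold()/execlists_unhold() in the diff are still marked __maybe_unused. The function capture_guilty_request() and the capture_request() step are invented here for illustration.

/*
 * Hypothetical caller, for illustration only: suspend the guilty
 * request, capture it out of line, then let it and its dependents
 * run again. execlists_hold()/execlists_unhold() are added by this
 * commit; capture_request() stands in for the future capture worker.
 */
static void capture_guilty_request(struct intel_engine_cs *engine,
				   struct i915_request *rq)
{
	/* Pull rq, its context and every dependent request off the HW */
	execlists_hold(engine, rq);

	/*
	 * Compress and store the request's state. This may take an
	 * arbitrary, user-controlled amount of time, but the engine
	 * keeps running independent workloads meanwhile.
	 */
	capture_request(rq);

	/* Resubmit the request and its dependents to the priority queue */
	execlists_unhold(engine, rq);
}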

File tree: 5 files changed, 321 insertions(+), 6 deletions(-)

drivers/gpu/drm/i915/gt/intel_engine_cs.c

Lines changed: 13 additions & 0 deletions
@@ -671,6 +671,7 @@ void
 intel_engine_init_active(struct intel_engine_cs *engine, unsigned int subclass)
 {
 	INIT_LIST_HEAD(&engine->active.requests);
+	INIT_LIST_HEAD(&engine->active.hold);
 
 	spin_lock_init(&engine->active.lock);
 	lockdep_set_subclass(&engine->active.lock, subclass);
@@ -1422,6 +1423,17 @@ static void print_request_ring(struct drm_printer *m, struct i915_request *rq)
 	}
 }
 
+static unsigned long list_count(struct list_head *list)
+{
+	struct list_head *pos;
+	unsigned long count = 0;
+
+	list_for_each(pos, list)
+		count++;
+
+	return count;
+}
+
 void intel_engine_dump(struct intel_engine_cs *engine,
 		       struct drm_printer *m,
 		       const char *header, ...)
@@ -1491,6 +1503,7 @@ void intel_engine_dump(struct intel_engine_cs *engine,
 			hexdump(m, rq->context->lrc_reg_state, PAGE_SIZE);
 		}
 	}
+	drm_printf(m, "\tOn hold?: %lu\n", list_count(&engine->active.hold));
 	spin_unlock_irqrestore(&engine->active.lock, flags);
 
 	drm_printf(m, "\tMMIO base: 0x%08x\n", engine->mmio_base);
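A usage note: the new "On hold?" line shows up wherever intel_engine_dump() output is surfaced (e.g. the i915_engine_info debugfs node), giving a quick count of ready requests parked on the hold list. The plain list walk in list_count() is safe here because its one caller holds engine->active.lock across the dump.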

drivers/gpu/drm/i915/gt/intel_engine_types.h

Lines changed: 1 addition & 0 deletions
@@ -295,6 +295,7 @@ struct intel_engine_cs {
 	struct {
 		spinlock_t lock;
 		struct list_head requests;
+		struct list_head hold; /* ready requests, but on hold */
 	} active;
 
 	struct llist_head barrier_tasks;
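Only four of the five changed files are reproduced on this page. The hold machinery also depends on a per-request hold flag and its accessors (i915_request_set_hold() and friends), which are used throughout the diffs below but whose hunk is not shown. A minimal sketch of what that missing hunk plausibly adds, assuming a new I915_FENCE_FLAG_HOLD bit alongside the existing fence flags:

/* Sketch only; the actual hunk is not shown on this page. Assumes a
 * new I915_FENCE_FLAG_HOLD bit in the rq->fence.flags bitfield. */
static inline bool i915_request_on_hold(const struct i915_request *rq)
{
	return unlikely(test_bit(I915_FENCE_FLAG_HOLD, &rq->fence.flags));
}

static inline void i915_request_set_hold(struct i915_request *rq)
{
	set_bit(I915_FENCE_FLAG_HOLD, &rq->fence.flags);
}

static inline void i915_request_clear_hold(struct i915_request *rq)
{
	clear_bit(I915_FENCE_FLAG_HOLD, &rq->fence.flags);
}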

drivers/gpu/drm/i915/gt/intel_lrc.c

Lines changed: 161 additions & 6 deletions
@@ -1635,8 +1635,8 @@ static void defer_request(struct i915_request *rq, struct list_head * const pl)
 			   !i915_request_completed(rq));
 
 		GEM_BUG_ON(i915_request_is_active(w));
-		if (list_empty(&w->sched.link))
-			continue; /* Not yet submitted; unready */
+		if (!i915_request_is_ready(w))
+			continue;
 
 		if (rq_prio(w) < rq_prio(rq))
 			continue;
@@ -2354,6 +2354,145 @@ static void __execlists_submission_tasklet(struct intel_engine_cs *const engine)
 	}
 }
 
+static void __execlists_hold(struct i915_request *rq)
+{
+	LIST_HEAD(list);
+
+	do {
+		struct i915_dependency *p;
+
+		if (i915_request_is_active(rq))
+			__i915_request_unsubmit(rq);
+
+		RQ_TRACE(rq, "on hold\n");
+		clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
+		list_move_tail(&rq->sched.link, &rq->engine->active.hold);
+		i915_request_set_hold(rq);
+
+		list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
+			struct i915_request *w =
+				container_of(p->waiter, typeof(*w), sched);
+
+			/* Leave semaphores spinning on the other engines */
+			if (w->engine != rq->engine)
+				continue;
+
+			if (!i915_request_is_ready(w))
+				continue;
+
+			if (i915_request_completed(w))
+				continue;
+
+			if (i915_request_on_hold(rq))
+				continue;
+
+			list_move_tail(&w->sched.link, &list);
+		}
+
+		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
+	} while (rq);
+}
+
+__maybe_unused
+static void execlists_hold(struct intel_engine_cs *engine,
+			   struct i915_request *rq)
+{
+	spin_lock_irq(&engine->active.lock);
+
+	/*
+	 * Transfer this request onto the hold queue to prevent it
+	 * being resubmitted to HW (and potentially completed) before we have
+	 * released it. Since we may have already submitted following
+	 * requests, we need to remove those as well.
+	 */
+	GEM_BUG_ON(i915_request_on_hold(rq));
+	GEM_BUG_ON(rq->engine != engine);
+	__execlists_hold(rq);
+
+	spin_unlock_irq(&engine->active.lock);
+}
+
+static bool hold_request(const struct i915_request *rq)
+{
+	struct i915_dependency *p;
+
+	/*
+	 * If one of our ancestors is on hold, we must also be on hold,
+	 * otherwise we will bypass it and execute before it.
+	 */
+	list_for_each_entry(p, &rq->sched.signalers_list, signal_link) {
+		const struct i915_request *s =
+			container_of(p->signaler, typeof(*s), sched);
+
+		if (s->engine != rq->engine)
+			continue;
+
+		if (i915_request_on_hold(s))
+			return true;
+	}
+
+	return false;
+}
+
+static void __execlists_unhold(struct i915_request *rq)
+{
+	LIST_HEAD(list);
+
+	do {
+		struct i915_dependency *p;
+
+		GEM_BUG_ON(!i915_request_on_hold(rq));
+		GEM_BUG_ON(!i915_sw_fence_signaled(&rq->submit));
+
+		i915_request_clear_hold(rq);
+		list_move_tail(&rq->sched.link,
+			       i915_sched_lookup_priolist(rq->engine,
+							  rq_prio(rq)));
+		set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
+		RQ_TRACE(rq, "hold release\n");
+
+		/* Also release any children on this engine that are ready */
+		list_for_each_entry(p, &rq->sched.waiters_list, wait_link) {
+			struct i915_request *w =
+				container_of(p->waiter, typeof(*w), sched);
+
+			if (w->engine != rq->engine)
+				continue;
+
+			if (!i915_request_on_hold(rq))
+				continue;
+
+			/* Check that no other parents are also on hold */
+			if (hold_request(rq))
+				continue;
+
+			list_move_tail(&w->sched.link, &list);
+		}
+
+		rq = list_first_entry_or_null(&list, typeof(*rq), sched.link);
+	} while (rq);
+}
+
+__maybe_unused
+static void execlists_unhold(struct intel_engine_cs *engine,
+			     struct i915_request *rq)
+{
+	spin_lock_irq(&engine->active.lock);
+
+	/*
+	 * Move this request back to the priority queue, and all of its
+	 * children and grandchildren that were suspended along with it.
+	 */
+	__execlists_unhold(rq);
+
+	if (rq_prio(rq) > engine->execlists.queue_priority_hint) {
+		engine->execlists.queue_priority_hint = rq_prio(rq);
+		tasklet_hi_schedule(&engine->execlists.tasklet);
+	}
+
+	spin_unlock_irq(&engine->active.lock);
+}
+
 static noinline void preempt_reset(struct intel_engine_cs *engine)
 {
 	const unsigned int bit = I915_RESET_ENGINE + engine->id;
@@ -2466,6 +2605,13 @@ static void submit_queue(struct intel_engine_cs *engine,
 	__submit_queue_imm(engine);
 }
 
+static bool ancestor_on_hold(const struct intel_engine_cs *engine,
+			     const struct i915_request *rq)
+{
+	GEM_BUG_ON(i915_request_on_hold(rq));
+	return !list_empty(&engine->active.hold) && hold_request(rq);
+}
+
 static void execlists_submit_request(struct i915_request *request)
 {
 	struct intel_engine_cs *engine = request->engine;
@@ -2474,12 +2620,17 @@ static void execlists_submit_request(struct i915_request *request)
 	/* Will be called from irq-context when using foreign fences. */
 	spin_lock_irqsave(&engine->active.lock, flags);
 
-	queue_request(engine, request);
+	if (unlikely(ancestor_on_hold(engine, request))) {
+		list_add_tail(&request->sched.link, &engine->active.hold);
+		i915_request_set_hold(request);
+	} else {
+		queue_request(engine, request);
 
-	GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
-	GEM_BUG_ON(list_empty(&request->sched.link));
+		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));
+		GEM_BUG_ON(list_empty(&request->sched.link));
 
-	submit_queue(engine, request);
+		submit_queue(engine, request);
+	}
 
 	spin_unlock_irqrestore(&engine->active.lock, flags);
 }
@@ -3328,6 +3479,10 @@ static void execlists_reset_cancel(struct intel_engine_cs *engine)
 		i915_priolist_free(p);
 	}
 
+	/* On-hold requests will be flushed to timeline upon their release */
+	list_for_each_entry(rq, &engine->active.hold, sched.link)
+		mark_eio(rq);
+
 	/* Cancel all attached virtual engines */
 	while ((rb = rb_first_cached(&execlists->virtual))) {
 		struct virtual_engine *ve =
drivers/gpu/drm/i915/gt/selftest_lrc.c

Lines changed: 103 additions & 0 deletions
@@ -285,6 +285,108 @@ static int live_unlite_preempt(void *arg)
 	return live_unlite_restore(arg, I915_USER_PRIORITY(I915_PRIORITY_MAX));
 }
 
+static int live_hold_reset(void *arg)
+{
+	struct intel_gt *gt = arg;
+	struct intel_engine_cs *engine;
+	enum intel_engine_id id;
+	struct igt_spinner spin;
+	int err = 0;
+
+	/*
+	 * In order to support offline error capture for fast preempt reset,
+	 * we need to decouple the guilty request and ensure that it and its
+	 * descendants are not executed while the capture is in progress.
+	 */
+
+	if (!intel_has_reset_engine(gt))
+		return 0;
+
+	if (igt_spinner_init(&spin, gt))
+		return -ENOMEM;
+
+	for_each_engine(engine, gt, id) {
+		struct intel_context *ce;
+		unsigned long heartbeat;
+		struct i915_request *rq;
+
+		ce = intel_context_create(engine);
+		if (IS_ERR(ce)) {
+			err = PTR_ERR(ce);
+			break;
+		}
+
+		engine_heartbeat_disable(engine, &heartbeat);
+
+		rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
+		if (IS_ERR(rq)) {
+			err = PTR_ERR(rq);
+			goto out;
+		}
+		i915_request_add(rq);
+
+		if (!igt_wait_for_spinner(&spin, rq)) {
+			intel_gt_set_wedged(gt);
+			err = -ETIME;
+			goto out;
+		}
+
+		/* We have our request executing, now remove it and reset */
+
+		if (test_and_set_bit(I915_RESET_ENGINE + id,
+				     &gt->reset.flags)) {
+			spin_unlock_irq(&engine->active.lock);
+			intel_gt_set_wedged(gt);
+			err = -EBUSY;
+			goto out;
+		}
+		tasklet_disable(&engine->execlists.tasklet);
+
+		engine->execlists.tasklet.func(engine->execlists.tasklet.data);
+		GEM_BUG_ON(execlists_active(&engine->execlists) != rq);
+
+		execlists_hold(engine, rq);
+		GEM_BUG_ON(!i915_request_on_hold(rq));
+
+		intel_engine_reset(engine, NULL);
+		GEM_BUG_ON(rq->fence.error != -EIO);
+
+		tasklet_enable(&engine->execlists.tasklet);
+		clear_and_wake_up_bit(I915_RESET_ENGINE + id,
+				      &gt->reset.flags);
+
+		/* Check that we do not resubmit the held request */
+		i915_request_get(rq);
+		if (!i915_request_wait(rq, 0, HZ / 5)) {
+			pr_err("%s: on hold request completed!\n",
+			       engine->name);
+			i915_request_put(rq);
+			err = -EIO;
+			goto out;
+		}
+		GEM_BUG_ON(!i915_request_on_hold(rq));
+
+		/* But is resubmitted on release */
+		execlists_unhold(engine, rq);
+		if (i915_request_wait(rq, 0, HZ / 5) < 0) {
+			pr_err("%s: held request did not complete!\n",
+			       engine->name);
+			intel_gt_set_wedged(gt);
+			err = -ETIME;
+		}
+		i915_request_put(rq);
+
+out:
+		engine_heartbeat_enable(engine, heartbeat);
+		intel_context_put(ce);
+		if (err)
+			break;
+	}
+
+	igt_spinner_fini(&spin);
+	return err;
+}
+
 static int
 emit_semaphore_chain(struct i915_request *rq, struct i915_vma *vma, int idx)
 {
@@ -3315,6 +3417,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
 		SUBTEST(live_sanitycheck),
 		SUBTEST(live_unlite_switch),
 		SUBTEST(live_unlite_preempt),
+		SUBTEST(live_hold_reset),
 		SUBTEST(live_timeslice_preempt),
 		SUBTEST(live_timeslice_queue),
 		SUBTEST(live_busywait_preempt),