Skip to content

Commit 317e039

Browse files
icklejnikula
authored andcommitted
drm/i915/execlists: Take a reference while capturing the guilty request
Thanks to preempt-to-busy, we leave the request on the HW as we submit the preemption request. This means that the request may complete at any moment as we process HW events, and in particular the request may be retired as we are planning to capture it for a preemption timeout. Be more careful while obtaining the request to capture after a preemption timeout, and check to see if it completed before we were able to put it on the on-hold list. If we do see it did complete just before we capture the request, proclaim the preemption-timeout a false positive and pardon the reset as we should hit an arbitration point momentarily and so be able to process the preemption. Note that even after we move the request to be on hold it may be retired (as the reset to stop the HW comes after), so we do require to hold our own reference as we work on the request for capture (and all of the peeking at state within the request needs to be carefully protected). Fixes: c3f1ed9 ("drm/i915/gt: Allow temporary suspension of inflight requests") Closes: https://gitlab.freedesktop.org/drm/intel/issues/997 Signed-off-by: Chris Wilson <[email protected]> Cc: Tvrtko Ursulin <[email protected]> Reviewed-by: Tvrtko Ursulin <[email protected]> Link: https://patchwork.freedesktop.org/patch/msgid/[email protected] (cherry picked from commit 4ba5c08) Signed-off-by: Jani Nikula <[email protected]>
1 parent ad18ba7 commit 317e039

File tree

2 files changed

+32
-10
lines changed

2 files changed

+32
-10
lines changed

drivers/gpu/drm/i915/gt/intel_lrc.c

Lines changed: 31 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2393,11 +2393,16 @@ static void __execlists_hold(struct i915_request *rq)
23932393
} while (rq);
23942394
}
23952395

2396-
static void execlists_hold(struct intel_engine_cs *engine,
2396+
static bool execlists_hold(struct intel_engine_cs *engine,
23972397
struct i915_request *rq)
23982398
{
23992399
spin_lock_irq(&engine->active.lock);
24002400

2401+
if (i915_request_completed(rq)) { /* too late! */
2402+
rq = NULL;
2403+
goto unlock;
2404+
}
2405+
24012406
/*
24022407
* Transfer this request onto the hold queue to prevent it
24032408
* being resumbitted to HW (and potentially completed) before we have
@@ -2408,7 +2413,9 @@ static void execlists_hold(struct intel_engine_cs *engine,
24082413
GEM_BUG_ON(rq->engine != engine);
24092414
__execlists_hold(rq);
24102415

2416+
unlock:
24112417
spin_unlock_irq(&engine->active.lock);
2418+
return rq;
24122419
}
24132420

24142421
static bool hold_request(const struct i915_request *rq)
@@ -2524,6 +2531,7 @@ static void execlists_capture_work(struct work_struct *work)
25242531

25252532
/* Return this request and all that depend upon it for signaling */
25262533
execlists_unhold(engine, cap->rq);
2534+
i915_request_put(cap->rq);
25272535

25282536
kfree(cap);
25292537
}
@@ -2560,12 +2568,12 @@ static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
25602568
return NULL;
25612569
}
25622570

2563-
static void execlists_capture(struct intel_engine_cs *engine)
2571+
static bool execlists_capture(struct intel_engine_cs *engine)
25642572
{
25652573
struct execlists_capture *cap;
25662574

25672575
if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
2568-
return;
2576+
return true;
25692577

25702578
/*
25712579
* We need to _quickly_ capture the engine state before we reset.
@@ -2574,13 +2582,17 @@ static void execlists_capture(struct intel_engine_cs *engine)
25742582
*/
25752583
cap = capture_regs(engine);
25762584
if (!cap)
2577-
return;
2585+
return true;
25782586

25792587
cap->rq = execlists_active(&engine->execlists);
25802588
GEM_BUG_ON(!cap->rq);
25812589

2590+
rcu_read_lock();
25822591
cap->rq = active_request(cap->rq->context->timeline, cap->rq);
2583-
GEM_BUG_ON(!cap->rq);
2592+
cap->rq = i915_request_get_rcu(cap->rq);
2593+
rcu_read_unlock();
2594+
if (!cap->rq)
2595+
goto err_free;
25842596

25852597
/*
25862598
* Remove the request from the execlists queue, and take ownership
@@ -2602,10 +2614,19 @@ static void execlists_capture(struct intel_engine_cs *engine)
26022614
* simply hold that request accountable for being non-preemptible
26032615
* long enough to force the reset.
26042616
*/
2605-
execlists_hold(engine, cap->rq);
2617+
if (!execlists_hold(engine, cap->rq))
2618+
goto err_rq;
26062619

26072620
INIT_WORK(&cap->work, execlists_capture_work);
26082621
schedule_work(&cap->work);
2622+
return true;
2623+
2624+
err_rq:
2625+
i915_request_put(cap->rq);
2626+
err_free:
2627+
i915_gpu_coredump_put(cap->error);
2628+
kfree(cap);
2629+
return false;
26092630
}
26102631

26112632
static noinline void preempt_reset(struct intel_engine_cs *engine)
@@ -2627,8 +2648,10 @@ static noinline void preempt_reset(struct intel_engine_cs *engine)
26272648
jiffies_to_msecs(jiffies - engine->execlists.preempt.expires));
26282649

26292650
ring_set_paused(engine, 1); /* Freeze the current request in place */
2630-
execlists_capture(engine);
2631-
intel_engine_reset(engine, "preemption time out");
2651+
if (execlists_capture(engine))
2652+
intel_engine_reset(engine, "preemption time out");
2653+
else
2654+
ring_set_paused(engine, 0);
26322655

26332656
tasklet_enable(&engine->execlists.tasklet);
26342657
clear_and_wake_up_bit(bit, lock);

drivers/gpu/drm/i915/gt/selftest_lrc.c

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -335,7 +335,6 @@ static int live_hold_reset(void *arg)
335335

336336
if (test_and_set_bit(I915_RESET_ENGINE + id,
337337
&gt->reset.flags)) {
338-
spin_unlock_irq(&engine->active.lock);
339338
intel_gt_set_wedged(gt);
340339
err = -EBUSY;
341340
goto out;
@@ -345,6 +344,7 @@ static int live_hold_reset(void *arg)
345344
engine->execlists.tasklet.func(engine->execlists.tasklet.data);
346345
GEM_BUG_ON(execlists_active(&engine->execlists) != rq);
347346

347+
i915_request_get(rq);
348348
execlists_hold(engine, rq);
349349
GEM_BUG_ON(!i915_request_on_hold(rq));
350350

@@ -356,7 +356,6 @@ static int live_hold_reset(void *arg)
356356
&gt->reset.flags);
357357

358358
/* Check that we do not resubmit the held request */
359-
i915_request_get(rq);
360359
if (!i915_request_wait(rq, 0, HZ / 5)) {
361360
pr_err("%s: on hold request completed!\n",
362361
engine->name);

0 commit comments

Comments
 (0)