
Commit a2f90f4

ickle authored and jnikula committed
drm/i915/execlists: Reclaim the hanging virtual request
If we encounter a hang on a virtual engine, as we process the hang the
request may already have been moved back to the virtual engine (we are
processing the hang on the physical engine). We need to reclaim the
request from the virtual engine so that the locking is consistent and
local to the real engine on which we will hold the request for error
state capturing.

v2: Pull the reclamation into execlists_hold() and assert that it cannot
be called from outside of the reset (i.e. with the tasklet disabled).
v3: Added selftest
v4: Drop the reference owned by the virtual engine

Fixes: ad18ba7 ("drm/i915/execlists: Offline error capture")
Testcase: igt/gem_exec_balancer/hang
Signed-off-by: Chris Wilson <[email protected]>
Cc: Mika Kuoppala <[email protected]>
Cc: Tvrtko Ursulin <[email protected]>
Reviewed-by: Tvrtko Ursulin <[email protected]>
Link: https://patchwork.freedesktop.org/patch/msgid/[email protected]
(cherry picked from commit 989df3a)
Signed-off-by: Jani Nikula <[email protected]>
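To make the race easier to see, here is a minimal userspace C model of the reclaim step described above: a hanging request that has already been unwound to the virtual engine is taken back, under the virtual engine's lock, by the physical engine handling the hang. All struct layouts, lock types, and helper names below are simplified stand-ins for illustration, not the driver's real definitions.

/*
 * Standalone model (userspace, pthreads). Compile: cc -pthread model.c
 * Types and names are illustrative stand-ins, not i915's.
 */
#include <assert.h>
#include <pthread.h>
#include <stdio.h>

struct request;

struct engine {
	pthread_mutex_t active_lock;	/* stands in for engine->active.lock */
	struct request *request;	/* stands in for ve->request */
	const char *name;
};

struct request {
	struct engine *engine;	/* engine the request currently sits on */
	int refcount;
};

static void request_put(struct request *rq)
{
	rq->refcount--;	/* stands in for i915_request_put() */
}

/*
 * Models the new branch in execlists_hold(): if the hanging request was
 * already unwound to the virtual engine, reclaim it under the virtual
 * engine's lock so all further hang processing (error capture, hold
 * queue) stays local to the physical engine.
 */
static void reclaim_virtual_request(struct engine *physical,
				    struct request *rq)
{
	if (rq->engine != physical) { /* preempted virtual engine */
		struct engine *ve = rq->engine;

		pthread_mutex_lock(&ve->active_lock);
		assert(ve->request == rq);
		ve->request = NULL;		/* virtual engine forgets it */
		pthread_mutex_unlock(&ve->active_lock);
		request_put(rq);		/* drop the VE's reference */

		rq->engine = physical;		/* now owned by this engine */
	}
}

int main(void)
{
	struct engine phys = { PTHREAD_MUTEX_INITIALIZER, NULL, "rcs0" };
	struct engine ve = { PTHREAD_MUTEX_INITIALIZER, NULL, "virtual" };
	struct request rq = { &ve, 2 };

	ve.request = &rq;
	reclaim_virtual_request(&phys, &rq);
	printf("rq on %s, refcount %d\n", rq.engine->name, rq.refcount);
	return 0;
}

The ordering mirrors the patch: the virtual engine's reference is dropped only after ve->request is cleared under its lock, so nothing else can still find the request there.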
1 parent 317e039 commit a2f90f4

2 files changed, 185 insertions(+), 0 deletions(-)

drivers/gpu/drm/i915/gt/intel_lrc.c

Lines changed: 29 additions & 0 deletions
@@ -2403,6 +2403,35 @@ static bool execlists_hold(struct intel_engine_cs *engine,
 		goto unlock;
 	}
 
+	if (rq->engine != engine) { /* preempted virtual engine */
+		struct virtual_engine *ve = to_virtual_engine(rq->engine);
+
+		/*
+		 * intel_context_inflight() is only protected by virtue
+		 * of process_csb() being called only by the tasklet (or
+		 * directly from inside reset while the tasklet is suspended).
+		 * Assert that neither of those are allowed to run while we
+		 * poke at the request queues.
+		 */
+		GEM_BUG_ON(!reset_in_progress(&engine->execlists));
+
+		/*
+		 * An unsubmitted request along a virtual engine will
+		 * remain on the active (this) engine until we are able
+		 * to process the context switch away (and so mark the
+		 * context as no longer in flight). That cannot have happened
+		 * yet, otherwise we would not be hanging!
+		 */
+		spin_lock(&ve->base.active.lock);
+		GEM_BUG_ON(intel_context_inflight(rq->context) != engine);
+		GEM_BUG_ON(ve->request != rq);
+		ve->request = NULL;
+		spin_unlock(&ve->base.active.lock);
+		i915_request_put(rq);
+
+		rq->engine = engine;
+	}
+
 	/*
 	 * Transfer this request onto the hold queue to prevent it
 	 * being resubmitted to HW (and potentially completed) before we have

drivers/gpu/drm/i915/gt/selftest_lrc.c

Lines changed: 156 additions & 0 deletions
@@ -3410,6 +3410,161 @@ static int live_virtual_bond(void *arg)
 	return 0;
 }
 
+static int reset_virtual_engine(struct intel_gt *gt,
+				struct intel_engine_cs **siblings,
+				unsigned int nsibling)
+{
+	struct intel_engine_cs *engine;
+	struct intel_context *ve;
+	unsigned long *heartbeat;
+	struct igt_spinner spin;
+	struct i915_request *rq;
+	unsigned int n;
+	int err = 0;
+
+	/*
+	 * In order to support offline error capture for fast preempt reset,
+	 * we need to decouple the guilty request and ensure that it and its
+	 * descendants are not executed while the capture is in progress.
+	 */
+
+	heartbeat = kmalloc_array(nsibling, sizeof(*heartbeat), GFP_KERNEL);
+	if (!heartbeat)
+		return -ENOMEM;
+
+	if (igt_spinner_init(&spin, gt)) {
+		err = -ENOMEM;
+		goto out_free;
+	}
+
+	ve = intel_execlists_create_virtual(siblings, nsibling);
+	if (IS_ERR(ve)) {
+		err = PTR_ERR(ve);
+		goto out_spin;
+	}
+
+	for (n = 0; n < nsibling; n++)
+		engine_heartbeat_disable(siblings[n], &heartbeat[n]);
+
+	rq = igt_spinner_create_request(&spin, ve, MI_ARB_CHECK);
+	if (IS_ERR(rq)) {
+		err = PTR_ERR(rq);
+		goto out_heartbeat;
+	}
+	i915_request_add(rq);
+
+	if (!igt_wait_for_spinner(&spin, rq)) {
+		intel_gt_set_wedged(gt);
+		err = -ETIME;
+		goto out_heartbeat;
+	}
+
+	engine = rq->engine;
+	GEM_BUG_ON(engine == ve->engine);
+
+	/* Take ownership of the reset and tasklet */
+	if (test_and_set_bit(I915_RESET_ENGINE + engine->id,
+			     &gt->reset.flags)) {
+		intel_gt_set_wedged(gt);
+		err = -EBUSY;
+		goto out_heartbeat;
+	}
+	tasklet_disable(&engine->execlists.tasklet);
+
+	engine->execlists.tasklet.func(engine->execlists.tasklet.data);
+	GEM_BUG_ON(execlists_active(&engine->execlists) != rq);
+
+	/* Fake a preemption event; failed of course */
+	spin_lock_irq(&engine->active.lock);
+	__unwind_incomplete_requests(engine);
+	spin_unlock_irq(&engine->active.lock);
+	GEM_BUG_ON(rq->engine != ve->engine);
+
+	/* Reset the engine while keeping our active request on hold */
+	execlists_hold(engine, rq);
+	GEM_BUG_ON(!i915_request_on_hold(rq));
+
+	intel_engine_reset(engine, NULL);
+	GEM_BUG_ON(rq->fence.error != -EIO);
+
+	/* Release our grasp on the engine, letting CS flow again */
+	tasklet_enable(&engine->execlists.tasklet);
+	clear_and_wake_up_bit(I915_RESET_ENGINE + engine->id, &gt->reset.flags);
+
+	/* Check that we do not resubmit the held request */
+	i915_request_get(rq);
+	if (!i915_request_wait(rq, 0, HZ / 5)) {
+		pr_err("%s: on hold request completed!\n",
+		       engine->name);
+		intel_gt_set_wedged(gt);
+		err = -EIO;
+		goto out_rq;
+	}
+	GEM_BUG_ON(!i915_request_on_hold(rq));
+
+	/* But is resubmitted on release */
+	execlists_unhold(engine, rq);
+	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
+		pr_err("%s: held request did not complete!\n",
+		       engine->name);
+		intel_gt_set_wedged(gt);
+		err = -ETIME;
+	}
+
+out_rq:
+	i915_request_put(rq);
+out_heartbeat:
+	for (n = 0; n < nsibling; n++)
+		engine_heartbeat_enable(siblings[n], heartbeat[n]);
+
+	intel_context_put(ve);
+out_spin:
+	igt_spinner_fini(&spin);
+out_free:
+	kfree(heartbeat);
+	return err;
+}
+
+static int live_virtual_reset(void *arg)
+{
+	struct intel_gt *gt = arg;
+	struct intel_engine_cs *siblings[MAX_ENGINE_INSTANCE + 1];
+	unsigned int class, inst;
+
+	/*
+	 * Check that we handle a reset event within a virtual engine.
+	 * Only the physical engine is reset, but we have to check the flow
+	 * of the virtual requests around the reset, and make sure it is not
+	 * forgotten.
+	 */
+
+	if (USES_GUC_SUBMISSION(gt->i915))
+		return 0;
+
+	if (!intel_has_reset_engine(gt))
+		return 0;
+
+	for (class = 0; class <= MAX_ENGINE_CLASS; class++) {
+		int nsibling, err;
+
+		nsibling = 0;
+		for (inst = 0; inst <= MAX_ENGINE_INSTANCE; inst++) {
+			if (!gt->engine_class[class][inst])
+				continue;
+
+			siblings[nsibling++] = gt->engine_class[class][inst];
+		}
+		if (nsibling < 2)
+			continue;
+
+		err = reset_virtual_engine(gt, siblings, nsibling);
+		if (err)
+			return err;
+	}
+
+	return 0;
+}
+
 int intel_execlists_live_selftests(struct drm_i915_private *i915)
 {
	static const struct i915_subtest tests[] = {
@@ -3435,6 +3590,7 @@ int intel_execlists_live_selftests(struct drm_i915_private *i915)
 		SUBTEST(live_virtual_mask),
 		SUBTEST(live_virtual_preserved),
 		SUBTEST(live_virtual_bond),
+		SUBTEST(live_virtual_reset),
 	};
 
 	if (!HAS_EXECLISTS(i915))
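The selftest above exercises the hold/unhold contract that the intel_lrc.c change preserves: a request placed on hold must not reach the hardware until it is released. A tiny standalone C model of that contract, with illustrative names rather than the driver's API:

#include <stdbool.h>
#include <stdio.h>

struct request {
	bool on_hold;	/* models i915_request_on_hold() */
	bool completed;
};

static void hold(struct request *rq)   { rq->on_hold = true; }
static void unhold(struct request *rq) { rq->on_hold = false; }

/* The submission path must skip requests that are on hold. */
static void try_submit(struct request *rq)
{
	if (rq->on_hold)
		return;		/* held for error capture: keep off the HW */
	rq->completed = true;	/* pretend the HW ran it to completion */
}

int main(void)
{
	struct request rq = { .on_hold = false, .completed = false };

	hold(&rq);		/* reset path: pin the guilty request */
	try_submit(&rq);	/* resubmission is a no-op while held */
	printf("while held: completed=%d\n", rq.completed);

	unhold(&rq);		/* capture finished: release it */
	try_submit(&rq);
	printf("after release: completed=%d\n", rq.completed);
	return 0;
}

In the selftest this corresponds to the pair of i915_request_wait() calls: the first must not see the request complete while it is held, and the second must complete after execlists_unhold().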
