
Commit ad18ba7

ickle authored and jnikula committed
drm/i915/execlists: Offline error capture
Currently, we skip error capture upon forced preemption. We apply forced
preemption when there is a higher priority request that should be running
but is being blocked, and we skip inline error capture so that the
preemption request is not further delayed by a user controlled capture --
extending the denial of service.

However, preemption reset is also used for heartbeats and regular GPU
hangs. By skipping the error capture, we remove the ability to debug GPU
hangs.

In order to capture the error without delaying the preemption request
further, we can do an out-of-line capture by removing the guilty request
from the execution queue and scheduling a worker to dump that request.
When removing a request, we need to remove the entire context and all
descendants from the execution queue, so that they do not jump past.

Closes: https://gitlab.freedesktop.org/drm/intel/issues/738
Fixes: 3a7a92a ("drm/i915/execlists: Force preemption")
Signed-off-by: Chris Wilson <[email protected]>
Cc: Mika Kuoppala <[email protected]>
Cc: Tvrtko Ursulin <[email protected]>
Reviewed-by: Tvrtko Ursulin <[email protected]>
Link: https://patchwork.freedesktop.org/patch/msgid/[email protected]
(cherry picked from commit 7483173)
Signed-off-by: Jani Nikula <[email protected]>
1 parent c3f1ed9 commit ad18ba7
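
The patch defers the slow capture out of the softirq using the standard
workqueue idiom: embed a work_struct in a small context object allocated
atomically, queue it, and let the worker recover the object with
container_of() once it runs in process context, where sleeping is allowed.
A minimal, self-contained sketch of that idiom follows; the names here are
illustrative, not taken from i915:

/*
 * Sketch of deferring slow work from atomic (softirq) context to a
 * workqueue worker. Assumed/illustrative names: capture_ctx,
 * capture_worker, defer_capture.
 */
#include <linux/slab.h>
#include <linux/workqueue.h>

struct capture_ctx {
	struct work_struct work;
	void *payload;			/* e.g. the request to dump */
};

static void capture_worker(struct work_struct *work)
{
	/* Recover the enclosing object from the embedded work_struct. */
	struct capture_ctx *ctx = container_of(work, typeof(*ctx), work);

	/* Process context: slow, sleeping work (compression, I/O) is safe. */
	kfree(ctx);
}

static bool defer_capture(void *payload)
{
	/* Atomic context: must not sleep, so allocate with GFP_ATOMIC. */
	struct capture_ctx *ctx = kmalloc(sizeof(*ctx), GFP_ATOMIC);

	if (!ctx)
		return false;

	ctx->payload = payload;
	INIT_WORK(&ctx->work, capture_worker);
	schedule_work(&ctx->work);
	return true;
}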


drivers/gpu/drm/i915/gt/intel_lrc.c

Lines changed: 120 additions & 2 deletions
@@ -2393,7 +2393,6 @@ static void __execlists_hold(struct i915_request *rq)
 	} while (rq);
 }
 
-__maybe_unused
 static void execlists_hold(struct intel_engine_cs *engine,
 			   struct i915_request *rq)
 {
@@ -2473,7 +2472,6 @@ static void __execlists_unhold(struct i915_request *rq)
 	} while (rq);
 }
 
-__maybe_unused
 static void execlists_unhold(struct intel_engine_cs *engine,
 			     struct i915_request *rq)
 {
@@ -2493,6 +2491,123 @@ static void execlists_unhold(struct intel_engine_cs *engine,
 	spin_unlock_irq(&engine->active.lock);
 }
 
+struct execlists_capture {
+	struct work_struct work;
+	struct i915_request *rq;
+	struct i915_gpu_coredump *error;
+};
+
+static void execlists_capture_work(struct work_struct *work)
+{
+	struct execlists_capture *cap = container_of(work, typeof(*cap), work);
+	const gfp_t gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;
+	struct intel_engine_cs *engine = cap->rq->engine;
+	struct intel_gt_coredump *gt = cap->error->gt;
+	struct intel_engine_capture_vma *vma;
+
+	/* Compress all the objects attached to the request, slow! */
+	vma = intel_engine_coredump_add_request(gt->engine, cap->rq, gfp);
+	if (vma) {
+		struct i915_vma_compress *compress =
+			i915_vma_capture_prepare(gt);
+
+		intel_engine_coredump_add_vma(gt->engine, vma, compress);
+		i915_vma_capture_finish(gt, compress);
+	}
+
+	gt->simulated = gt->engine->simulated;
+	cap->error->simulated = gt->simulated;
+
+	/* Publish the error state, and announce it to the world */
+	i915_error_state_store(cap->error);
+	i915_gpu_coredump_put(cap->error);
+
+	/* Return this request and all that depend upon it for signaling */
+	execlists_unhold(engine, cap->rq);
+
+	kfree(cap);
+}
+
+static struct execlists_capture *capture_regs(struct intel_engine_cs *engine)
+{
+	const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN;
+	struct execlists_capture *cap;
+
+	cap = kmalloc(sizeof(*cap), gfp);
+	if (!cap)
+		return NULL;
+
+	cap->error = i915_gpu_coredump_alloc(engine->i915, gfp);
+	if (!cap->error)
+		goto err_cap;
+
+	cap->error->gt = intel_gt_coredump_alloc(engine->gt, gfp);
+	if (!cap->error->gt)
+		goto err_gpu;
+
+	cap->error->gt->engine = intel_engine_coredump_alloc(engine, gfp);
+	if (!cap->error->gt->engine)
+		goto err_gt;
+
+	return cap;
+
+err_gt:
+	kfree(cap->error->gt);
+err_gpu:
+	kfree(cap->error);
+err_cap:
+	kfree(cap);
+	return NULL;
+}
+
+static void execlists_capture(struct intel_engine_cs *engine)
+{
+	struct execlists_capture *cap;
+
+	if (!IS_ENABLED(CONFIG_DRM_I915_CAPTURE_ERROR))
+		return;
+
+	/*
+	 * We need to _quickly_ capture the engine state before we reset.
+	 * We are inside an atomic section (softirq) here and we are delaying
+	 * the forced preemption event.
+	 */
+	cap = capture_regs(engine);
+	if (!cap)
+		return;
+
+	cap->rq = execlists_active(&engine->execlists);
+	GEM_BUG_ON(!cap->rq);
+
+	cap->rq = active_request(cap->rq->context->timeline, cap->rq);
+	GEM_BUG_ON(!cap->rq);
+
+	/*
+	 * Remove the request from the execlists queue, and take ownership
+	 * of the request. We pass it to our worker who will _slowly_ compress
+	 * all the pages the _user_ requested for debugging their batch, after
+	 * which we return it to the queue for signaling.
+	 *
+	 * By removing them from the execlists queue, we also remove the
+	 * requests from being processed by __unwind_incomplete_requests()
+	 * during the intel_engine_reset(), and so they will *not* be replayed
+	 * afterwards.
+	 *
+	 * Note that because we have not yet reset the engine at this point,
+	 * it is possible that the request we have identified as guilty did
+	 * in fact complete, and we will then hit an arbitration point
+	 * allowing the outstanding preemption to succeed. The likelihood
+	 * of that is very low (as capturing of the engine registers should be
+	 * fast enough to run inside an irq-off atomic section!), so we will
+	 * simply hold that request accountable for being non-preemptible
+	 * long enough to force the reset.
+	 */
+	execlists_hold(engine, cap->rq);
+
+	INIT_WORK(&cap->work, execlists_capture_work);
+	schedule_work(&cap->work);
+}
+
 static noinline void preempt_reset(struct intel_engine_cs *engine)
 {
 	const unsigned int bit = I915_RESET_ENGINE + engine->id;
@@ -2510,6 +2625,9 @@ static noinline void preempt_reset(struct intel_engine_cs *engine)
 	ENGINE_TRACE(engine, "preempt timeout %lu+%ums\n",
 		     READ_ONCE(engine->props.preempt_timeout_ms),
 		     jiffies_to_msecs(jiffies - engine->execlists.preempt.expires));
+
+	ring_set_paused(engine, 1); /* Freeze the current request in place */
+	execlists_capture(engine);
 	intel_engine_reset(engine, "preemption time out");
 
 	tasklet_enable(&engine->execlists.tasklet);
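
Note the two allocation policies in the patch: capture_regs() runs inside
the softirq tasklet, while execlists_capture_work() runs later in process
context. A condensed view of that trade-off is below; the rationale
comments are editorial glosses, not taken from the patch itself:

/* capture_regs() runs in the softirq tasklet: it must not sleep, and it
 * should fail fast and silently if memory is tight. */
const gfp_t atomic_gfp = GFP_ATOMIC | __GFP_NOWARN;

/* execlists_capture_work() runs in process context: it may sleep, but
 * __GFP_RETRY_MAYFAIL bounds the reclaim effort so that a large capture
 * fails under memory pressure rather than invoking the OOM killer on
 * behalf of debug output. */
const gfp_t worker_gfp = GFP_KERNEL | __GFP_RETRY_MAYFAIL | __GFP_NOWARN;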
