Skip to content

Commit b2295e2

Browse files
committed
drm/i915/gt: Be defensive in the face of false CS events
If the HW throws a curve ball and reports either en event before it is possible, or just a completely impossible event, we have to grin and bear it. The first few events, we will likely not notice as we would be expecting some event, but as soon as we stop expecting an event and yet they still keep coming, then we enter into undefined state territory. In which case, bail out, stop processing the events, and reset the engine and our set of queued requests to recover. The sporadic hangs and warnings will continue to plague CI, but at least system stability should not be compromised. v2: Commentary and force the reset-on-error. v3: Customised user facing message for forced resets from internal errors. Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2045 Signed-off-by: Chris Wilson <[email protected]> Cc: Tvrtko Ursulin <[email protected]> Reviewed-by: Tvrtko Ursulin <[email protected]> Link: https://patchwork.freedesktop.org/patch/msgid/[email protected]
1 parent ed2690a commit b2295e2

File tree

3 files changed

+45
-7
lines changed

3 files changed

+45
-7
lines changed

drivers/gpu/drm/i915/gt/intel_engine_types.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,8 +177,12 @@ struct intel_engine_execlists {
177177
* the first error interrupt, record the EIR and schedule the tasklet.
178178
* In the tasklet, we process the pending CS events to ensure we have
179179
* the guilty request, and then reset the engine.
180+
*
181+
* Low 16b are used by HW, with the upper 16b used as the enabling mask.
182+
* Reserve the upper 16b for tracking internal errors.
180183
*/
181184
u32 error_interrupt;
185+
#define ERROR_CSB BIT(31)
182186

183187
/**
184188
* @reset_ccid: Active CCID [EXECLISTS_STATUS_HI] at the time of reset

drivers/gpu/drm/i915/gt/intel_gt_irq.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@ cs_irq_handler(struct intel_engine_cs *engine, u32 iir)
2727
if (unlikely(iir & GT_CS_MASTER_ERROR_INTERRUPT)) {
2828
u32 eir;
2929

30-
eir = ENGINE_READ(engine, RING_EIR);
30+
/* Upper 16b are the enabling mask, rsvd for internal errors */
31+
eir = ENGINE_READ(engine, RING_EIR) & GENMASK(15, 0);
3132
ENGINE_TRACE(engine, "CS error: %x\n", eir);
3233

3334
/* Disable the error interrupt until after the reset */

drivers/gpu/drm/i915/gt/intel_lrc.c

Lines changed: 39 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2568,6 +2568,25 @@ static void process_csb(struct intel_engine_cs *engine)
25682568
if (unlikely(head == tail))
25692569
return;
25702570

2571+
/*
2572+
* We will consume all events from HW, or at least pretend to.
2573+
*
2574+
* The sequence of events from the HW is deterministic, and derived
2575+
* from our writes to the ELSP, with a smidgen of variability for
2576+
* the arrival of the asynchronous requests wrt to the inflight
2577+
* execution. If the HW sends an event that does not correspond with
2578+
* the one we are expecting, we have to abandon all hope as we lose
2579+
* all tracking of what the engine is actually executing. We will
2580+
* only detect we are out of sequence with the HW when we get an
2581+
* 'impossible' event because we have already drained our own
2582+
* preemption/promotion queue. If this occurs, we know that we likely
2583+
* lost track of execution earlier and must unwind and restart, the
2584+
* simplest way is by stop processing the event queue and force the
2585+
* engine to reset.
2586+
*/
2587+
execlists->csb_head = tail;
2588+
ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
2589+
25712590
/*
25722591
* Hopefully paired with a wmb() in HW!
25732592
*
@@ -2577,8 +2596,6 @@ static void process_csb(struct intel_engine_cs *engine)
25772596
* we perform the READ_ONCE(*csb_write).
25782597
*/
25792598
rmb();
2580-
2581-
ENGINE_TRACE(engine, "cs-irq head=%d, tail=%d\n", head, tail);
25822599
do {
25832600
bool promote;
25842601

@@ -2613,6 +2630,11 @@ static void process_csb(struct intel_engine_cs *engine)
26132630
if (promote) {
26142631
struct i915_request * const *old = execlists->active;
26152632

2633+
if (GEM_WARN_ON(!*execlists->pending)) {
2634+
execlists->error_interrupt |= ERROR_CSB;
2635+
break;
2636+
}
2637+
26162638
ring_set_paused(engine, 0);
26172639

26182640
/* Point active to the new ELSP; prevent overwriting */
@@ -2635,7 +2657,10 @@ static void process_csb(struct intel_engine_cs *engine)
26352657

26362658
WRITE_ONCE(execlists->pending[0], NULL);
26372659
} else {
2638-
GEM_BUG_ON(!*execlists->active);
2660+
if (GEM_WARN_ON(!*execlists->active)) {
2661+
execlists->error_interrupt |= ERROR_CSB;
2662+
break;
2663+
}
26392664

26402665
/* port0 completed, advanced to port1 */
26412666
trace_ports(execlists, "completed", execlists->active);
@@ -2686,7 +2711,6 @@ static void process_csb(struct intel_engine_cs *engine)
26862711
}
26872712
} while (head != tail);
26882713

2689-
execlists->csb_head = head;
26902714
set_timeslice(engine);
26912715

26922716
/*
@@ -3117,9 +3141,18 @@ static void execlists_submission_tasklet(unsigned long data)
31173141
process_csb(engine);
31183142

31193143
if (unlikely(READ_ONCE(engine->execlists.error_interrupt))) {
3144+
const char *msg;
3145+
3146+
/* Generate the error message in priority wrt to the user! */
3147+
if (engine->execlists.error_interrupt & GENMASK(15, 0))
3148+
msg = "CS error"; /* thrown by a user payload */
3149+
else if (engine->execlists.error_interrupt & ERROR_CSB)
3150+
msg = "invalid CSB event";
3151+
else
3152+
msg = "internal error";
3153+
31203154
engine->execlists.error_interrupt = 0;
3121-
if (ENGINE_READ(engine, RING_ESR)) /* confirm the error */
3122-
execlists_reset(engine, "CS error");
3155+
execlists_reset(engine, msg);
31233156
}
31243157

31253158
if (!READ_ONCE(engine->execlists.pending[0]) || timeout) {

0 commit comments

Comments
 (0)