
Commit 98ae6fb

ickle authored and jlahtine-intel committed
drm/i915/execlists: Move reset_active() from schedule-out to schedule-in
The gem_ctx_persistence/smoketest was detecting an odd coherency issue inside the LRC context image: the address of the ring buffer did not match our associated struct intel_ring. As we set the address into the context image when we pin the ring buffer into place, before the context is active, that leaves the question of where it got overwritten. Either the HW context save occurred after our pin, which would imply that our idle barriers are broken, or we overwrote the context image ourselves.

reset_active() is the only place where we dabble inside the context image outside of a serialised path, being called from schedule-out; but we could equally perform the operation inside schedule-in, which is fully serialised with the context pin -- and remains serialised by the engine pulse with kill_context(). (The only downside, aside from doing more work inside the engine->active.lock, is that the plan to merge all the reset paths into doing their context scrubbing on schedule-out now needs more thought.)

Fixes: d12acee ("drm/i915/execlists: Cancel banned contexts on schedule-out")
Testcase: igt/gem_ctx_persistence/smoketest
Signed-off-by: Chris Wilson <[email protected]>
Cc: Tvrtko Ursulin <[email protected]>
Reviewed-by: Tvrtko Ursulin <[email protected]>
Link: https://patchwork.freedesktop.org/patch/msgid/[email protected]
(cherry picked from commit 31b61f0)
Signed-off-by: Joonas Lahtinen <[email protected]>
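
[Editor's note] To make the serialisation argument above concrete, here is a minimal userspace analogy -- not driver code. One thread stands in for schedule-in, which runs under engine->active.lock, and the other for the old schedule-out path, which runs from process_csb() without that lock and can therefore interleave its scrub with a freshly pinned image. All names in the sketch are invented for the illustration.

#include <pthread.h>
#include <stdio.h>
#include <string.h>

static pthread_mutex_t active_lock = PTHREAD_MUTEX_INITIALIZER;
static char context_image[64] = "ring buffer: 0xA000";

/* Stand-in for schedule-in: every write to the "context image" happens
 * with the lock held, serialised against other locked writers. */
static void *schedule_in(void *arg)
{
        (void)arg;
        for (int i = 0; i < 100000; i++) {
                pthread_mutex_lock(&active_lock);
                strcpy(context_image, "ring buffer: 0xA000");
                pthread_mutex_unlock(&active_lock);
        }
        return NULL;
}

/* Stand-in for the old schedule-out path: it scrubs the image without
 * taking the lock, so its write can land on top of a fresh image --
 * the "who overwrote the ring address?" corruption the smoketest caught. */
static void *schedule_out_unlocked(void *arg)
{
        (void)arg;
        for (int i = 0; i < 100000; i++)
                strcpy(context_image, "default state");
        return NULL;
}

int main(void)
{
        pthread_t in, out;

        pthread_create(&in, NULL, schedule_in, NULL);
        pthread_create(&out, NULL, schedule_out_unlocked, NULL);
        pthread_join(in, NULL);
        pthread_join(out, NULL);

        /* The final contents depend on thread interleaving; moving the
         * scrub into the locked path (as this commit does) removes the race. */
        printf("final context image: %s\n", context_image);
        return 0;
}

Compile with cc -pthread; the fix in this commit corresponds to moving the scrub into the locked thread.
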
1 parent cee7fb4 commit 98ae6fb

1 file changed (+62 −56)

drivers/gpu/drm/i915/gt/intel_lrc.c

Lines changed: 62 additions & 56 deletions
@@ -990,6 +990,59 @@ static void intel_engine_context_out(struct intel_engine_cs *engine)
         write_sequnlock_irqrestore(&engine->stats.lock, flags);
 }
 
+static void restore_default_state(struct intel_context *ce,
+                                  struct intel_engine_cs *engine)
+{
+        u32 *regs = ce->lrc_reg_state;
+
+        if (engine->pinned_default_state)
+                memcpy(regs, /* skip restoring the vanilla PPHWSP */
+                       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
+                       engine->context_size - PAGE_SIZE);
+
+        execlists_init_reg_state(regs, ce, engine, ce->ring, false);
+}
+
+static void reset_active(struct i915_request *rq,
+                         struct intel_engine_cs *engine)
+{
+        struct intel_context * const ce = rq->hw_context;
+        u32 head;
+
+        /*
+         * The executing context has been cancelled. We want to prevent
+         * further execution along this context and propagate the error on
+         * to anything depending on its results.
+         *
+         * In __i915_request_submit(), we apply the -EIO and remove the
+         * requests' payloads for any banned requests. But first, we must
+         * rewind the context back to the start of the incomplete request so
+         * that we do not jump back into the middle of the batch.
+         *
+         * We preserve the breadcrumbs and semaphores of the incomplete
+         * requests so that inter-timeline dependencies (i.e. other timelines)
+         * remain correctly ordered. And we defer to __i915_request_submit()
+         * so that all asynchronous waits are correctly handled.
+         */
+        GEM_TRACE("%s(%s): { rq=%llx:%lld }\n",
+                  __func__, engine->name, rq->fence.context, rq->fence.seqno);
+
+        /* On resubmission of the active request, payload will be scrubbed */
+        if (i915_request_completed(rq))
+                head = rq->tail;
+        else
+                head = active_request(ce->timeline, rq)->head;
+        ce->ring->head = intel_ring_wrap(ce->ring, head);
+        intel_ring_update_space(ce->ring);
+
+        /* Scrub the context image to prevent replaying the previous batch */
+        restore_default_state(ce, engine);
+        __execlists_update_reg_state(ce, engine);
+
+        /* We've switched away, so this should be a no-op, but intent matters */
+        ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
+}
+
 static inline struct intel_engine_cs *
 __execlists_schedule_in(struct i915_request *rq)
 {
@@ -998,6 +1051,9 @@ __execlists_schedule_in(struct i915_request *rq)
 
         intel_context_get(ce);
 
+        if (unlikely(i915_gem_context_is_banned(ce->gem_context)))
+                reset_active(rq, engine);
+
         if (ce->tag) {
                 /* Use a fixed tag for OA and friends */
                 ce->lrc_desc |= (u64)ce->tag << 32;
@@ -1047,72 +1103,22 @@ static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
         tasklet_schedule(&ve->base.execlists.tasklet);
 }
 
-static void restore_default_state(struct intel_context *ce,
-                                  struct intel_engine_cs *engine)
-{
-        u32 *regs = ce->lrc_reg_state;
-
-        if (engine->pinned_default_state)
-                memcpy(regs, /* skip restoring the vanilla PPHWSP */
-                       engine->pinned_default_state + LRC_STATE_PN * PAGE_SIZE,
-                       engine->context_size - PAGE_SIZE);
-
-        execlists_init_reg_state(regs, ce, engine, ce->ring, false);
-}
-
-static void reset_active(struct i915_request *rq,
-                         struct intel_engine_cs *engine)
-{
-        struct intel_context * const ce = rq->hw_context;
-        u32 head;
-
-        /*
-         * The executing context has been cancelled. We want to prevent
-         * further execution along this context and propagate the error on
-         * to anything depending on its results.
-         *
-         * In __i915_request_submit(), we apply the -EIO and remove the
-         * requests' payloads for any banned requests. But first, we must
-         * rewind the context back to the start of the incomplete request so
-         * that we do not jump back into the middle of the batch.
-         *
-         * We preserve the breadcrumbs and semaphores of the incomplete
-         * requests so that inter-timeline dependencies (i.e. other timelines)
-         * remain correctly ordered. And we defer to __i915_request_submit()
-         * so that all asynchronous waits are correctly handled.
-         */
-        GEM_TRACE("%s(%s): { rq=%llx:%lld }\n",
-                  __func__, engine->name, rq->fence.context, rq->fence.seqno);
-
-        /* On resubmission of the active request, payload will be scrubbed */
-        if (i915_request_completed(rq))
-                head = rq->tail;
-        else
-                head = active_request(ce->timeline, rq)->head;
-        ce->ring->head = intel_ring_wrap(ce->ring, head);
-        intel_ring_update_space(ce->ring);
-
-        /* Scrub the context image to prevent replaying the previous batch */
-        restore_default_state(ce, engine);
-        __execlists_update_reg_state(ce, engine);
-
-        /* We've switched away, so this should be a no-op, but intent matters */
-        ce->lrc_desc |= CTX_DESC_FORCE_RESTORE;
-}
-
 static inline void
 __execlists_schedule_out(struct i915_request *rq,
                          struct intel_engine_cs * const engine)
 {
         struct intel_context * const ce = rq->hw_context;
 
+        /*
+         * NB process_csb() is not under the engine->active.lock and hence
+         * schedule_out can race with schedule_in meaning that we should
+         * refrain from doing non-trivial work here.
+         */
+
         intel_engine_context_out(engine);
         execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
         intel_gt_pm_put(engine->gt);
 
-        if (unlikely(i915_gem_context_is_banned(ce->gem_context)))
-                reset_active(rq, engine);
-
         /*
          * If this is part of a virtual engine, its next request may
          * have been blocked waiting for access to the active context.