Skip to content

Commit e8dbb56

Browse files
tursulindanvet
authored andcommitted
drm/i915: Fail too long user submissions by default
A new Kconfig option CONFIG_DRM_I915_REQUEST_TIMEOUT is added, defaulting to 20s, and this timeout is applied to all users contexts using the previously added watchdog facility. Result of this is that any user submission will simply fail after this timeout, either causing a reset (for non-preemptable), or incomplete results. This can have an effect that workloads which used to work fine will suddenly start failing. Even workloads comprised of short batches but in long dependency chains can be terminated. And because of lack of agreement on usefulness and safety of fence error propagation this partial execution can be invisible to userspace even if it is "listening" to returned fence status. Another interaction is with hangcheck where care needs to be taken timeout is not set lower or close to three times the heartbeat interval. Otherwise a hang in any application can cause complete termination of all submissions from unrelated clients. Any users modifying the per engine heartbeat intervals therefore need to be aware of this potential denial of service to avoid inadvertently enabling it. Given all this I am personally not convinced the scheme is a good idea. Intuitively it feels object importers would be better positioned to enforce the time they are willing to wait for something to complete. v2: * Improved commit message and Kconfig text. * Pull in some helper code from patch which got dropped. v3: * Bump timeout to 20s to see if it helps Tigerlake. Signed-off-by: Tvrtko Ursulin <[email protected]> Cc: Daniel Vetter <[email protected]> Acked-by: Matthew Auld <[email protected]> Signed-off-by: Daniel Vetter <[email protected]> Link: https://patchwork.freedesktop.org/patch/msgid/[email protected]
1 parent 9b4d059 commit e8dbb56

File tree

4 files changed

+66
-1
lines changed

4 files changed

+66
-1
lines changed

drivers/gpu/drm/i915/Kconfig.profile

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,17 @@
1+
config DRM_I915_REQUEST_TIMEOUT
2+
int "Default timeout for requests (ms)"
3+
default 20000 # milliseconds
4+
help
5+
Configures the default timeout after which any user submissions will
6+
be forcefully terminated.
7+
8+
Beware setting this value lower, or close to heartbeat interval
9+
rounded to whole seconds times three, in order to avoid allowing
10+
misbehaving applications causing total rendering failure in unrelated
11+
clients.
12+
13+
May be 0 to disable the timeout.
14+
115
config DRM_I915_FENCE_TIMEOUT
216
int "Timeout for unsignaled foreign fences (ms, jiffy granularity)"
317
default 10000 # milliseconds

drivers/gpu/drm/i915/gem/i915_gem_context.c

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,8 @@ static void intel_context_set_gem(struct intel_context *ce,
232232
if (ctx->sched.priority >= I915_PRIORITY_NORMAL &&
233233
intel_engine_has_timeslices(ce->engine))
234234
__set_bit(CONTEXT_USE_SEMAPHORES, &ce->flags);
235+
236+
intel_context_set_watchdog_us(ce, ctx->watchdog.timeout_us);
235237
}
236238

237239
static void __free_engines(struct i915_gem_engines *e, unsigned int count)
@@ -790,6 +792,40 @@ static void __assign_timeline(struct i915_gem_context *ctx,
790792
context_apply_all(ctx, __apply_timeline, timeline);
791793
}
792794

795+
static int __apply_watchdog(struct intel_context *ce, void *timeout_us)
796+
{
797+
return intel_context_set_watchdog_us(ce, (uintptr_t)timeout_us);
798+
}
799+
800+
static int
801+
__set_watchdog(struct i915_gem_context *ctx, unsigned long timeout_us)
802+
{
803+
int ret;
804+
805+
ret = context_apply_all(ctx, __apply_watchdog,
806+
(void *)(uintptr_t)timeout_us);
807+
if (!ret)
808+
ctx->watchdog.timeout_us = timeout_us;
809+
810+
return ret;
811+
}
812+
813+
static void __set_default_fence_expiry(struct i915_gem_context *ctx)
814+
{
815+
struct drm_i915_private *i915 = ctx->i915;
816+
int ret;
817+
818+
if (!IS_ACTIVE(CONFIG_DRM_I915_REQUEST_TIMEOUT))
819+
return;
820+
821+
/* Default expiry for user fences. */
822+
ret = __set_watchdog(ctx, CONFIG_DRM_I915_REQUEST_TIMEOUT * 1000);
823+
if (ret)
824+
drm_notice(&i915->drm,
825+
"Failed to configure default fence expiry! (%d)",
826+
ret);
827+
}
828+
793829
static struct i915_gem_context *
794830
i915_gem_create_context(struct drm_i915_private *i915, unsigned int flags)
795831
{
@@ -834,6 +870,8 @@ i915_gem_create_context(struct drm_i915_private *i915, unsigned int flags)
834870
intel_timeline_put(timeline);
835871
}
836872

873+
__set_default_fence_expiry(ctx);
874+
837875
trace_i915_context_create(ctx);
838876

839877
return ctx;

drivers/gpu/drm/i915/gem/i915_gem_context_types.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,10 @@ struct i915_gem_context {
154154
*/
155155
atomic_t active_count;
156156

157+
struct {
158+
u64 timeout_us;
159+
} watchdog;
160+
157161
/**
158162
* @hang_timestamp: The last time(s) this context caused a GPU hang
159163
*/

drivers/gpu/drm/i915/gt/intel_context_param.h

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,18 @@
66
#ifndef INTEL_CONTEXT_PARAM_H
77
#define INTEL_CONTEXT_PARAM_H
88

9-
struct intel_context;
9+
#include <linux/types.h>
10+
11+
#include "intel_context.h"
1012

1113
int intel_context_set_ring_size(struct intel_context *ce, long sz);
1214
long intel_context_get_ring_size(struct intel_context *ce);
1315

16+
static inline int
17+
intel_context_set_watchdog_us(struct intel_context *ce, u64 timeout_us)
18+
{
19+
ce->watchdog.timeout_us = timeout_us;
20+
return 0;
21+
}
22+
1423
#endif /* INTEL_CONTEXT_PARAM_H */

0 commit comments

Comments
 (0)