Skip to content

Commit 99cdb8b

Browse files
Christian Brauner authored and kees committed
seccomp: notify about unused filter
We've been making heavy use of the seccomp notifier to intercept and handle certain syscalls for containers. This patch allows a syscall supervisor listening on a given notifier to be notified when a seccomp filter has become unused. A container is often managed by a singleton supervisor process, the so-called "monitor". This monitor process has an event loop which has various event handlers registered. If the user specified a seccomp profile that included a notifier for various syscalls then we also register a seccomp notify event handler. For any container using a separate pid namespace the lifecycle of the seccomp notifier is bound to the init process of the pid namespace, i.e. when the init process exits the filter must be unused. If a new process attaches to a container we force it to assume a seccomp profile. This can either be the same seccomp profile as the container was started with or a modified one. If the attaching process makes use of the seccomp notifier we will register a new seccomp notifier handler in the monitor's event loop. However, when the attaching process exits we can't simply delete the handler since other child processes could've been created (daemons spawned etc.) that have inherited the seccomp filter and so we need to keep the seccomp notifier fd alive in the event loop. But this is problematic since we don't get a notification when the seccomp filter has become unused and so we currently never remove the seccomp notifier fd from the event loop and just keep accumulating fds in the event loop. We've had this issue for a while but it has recently become more pressing as more and larger users make use of this. To fix this, we introduce a new "users" reference counter that tracks any tasks and dependent filters making use of a filter. When a notifier is registered, waiting tasks will be notified that the filter is now empty by receiving a (E)POLLHUP event. The concept this patch introduces is the same as for signal_struct, i.e. 
reference counting for life-cycle management is decoupled from reference counting tasks using the object. There's probably some trickery possible but the second counter is just the correct way of doing this IMHO and has precedent. Cc: Tycho Andersen <[email protected]> Cc: Kees Cook <[email protected]> Cc: Matt Denton <[email protected]> Cc: Sargun Dhillon <[email protected]> Cc: Jann Horn <[email protected]> Cc: Chris Palmer <[email protected]> Cc: Aleksa Sarai <[email protected]> Cc: Robert Sesek <[email protected]> Cc: Jeffrey Vander Stoep <[email protected]> Cc: Linux Containers <[email protected]> Signed-off-by: Christian Brauner <[email protected]> Link: https://lore.kernel.org/r/[email protected] Signed-off-by: Kees Cook <[email protected]>
1 parent 76194c4 commit 99cdb8b

File tree

1 file changed

+39
-5
lines changed

1 file changed

+39
-5
lines changed

kernel/seccomp.c

Lines changed: 39 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,14 @@ struct notification {
110110
* attached task, once for the dependent filter, and if
111111
* requested for the user notifier. When @refs reaches zero,
112112
* the filter can be freed.
113+
* @users: A filter's @users count is incremented for each directly
114+
* attached task (filter installation, fork(), thread_sync),
115+
* and once for the dependent filter (tracked in filter->prev).
116+
* When it reaches zero it indicates that no direct or indirect
117+
* users of that filter exist. No new tasks can get associated with
118+
* this filter after reaching 0. The @users count is always smaller
119+
* or equal to @refs. Hence, reaching 0 for @users does not mean
120+
* the filter can be freed.
113121
* @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
114122
* @prev: points to a previously installed, or inherited, filter
115123
* @prog: the BPF program to evaluate
@@ -129,6 +137,7 @@ struct notification {
129137
*/
130138
struct seccomp_filter {
131139
refcount_t refs;
140+
refcount_t users;
132141
bool log;
133142
struct seccomp_filter *prev;
134143
struct bpf_prog *prog;
@@ -376,6 +385,15 @@ static inline void seccomp_filter_free(struct seccomp_filter *filter)
376385
}
377386
}
378387

388+
static void __seccomp_filter_orphan(struct seccomp_filter *orig)
389+
{
390+
while (orig && refcount_dec_and_test(&orig->users)) {
391+
if (waitqueue_active(&orig->wqh))
392+
wake_up_poll(&orig->wqh, EPOLLHUP);
393+
orig = orig->prev;
394+
}
395+
}
396+
379397
static void __put_seccomp_filter(struct seccomp_filter *orig)
380398
{
381399
/* Clean up single-reference branches iteratively. */
@@ -386,10 +404,18 @@ static void __put_seccomp_filter(struct seccomp_filter *orig)
386404
}
387405
}
388406

407+
static void __seccomp_filter_release(struct seccomp_filter *orig)
408+
{
409+
/* Notify about any unused filters in the task's former filter tree. */
410+
__seccomp_filter_orphan(orig);
411+
/* Finally drop all references to the task's former tree. */
412+
__put_seccomp_filter(orig);
413+
}
414+
389415
/**
390-
* seccomp_filter_release - Detach the task from its filter tree
391-
* and drop its reference count during
392-
* exit.
416+
* seccomp_filter_release - Detach the task from its filter tree,
417+
* drop its reference count, and notify
418+
* about unused filters
393419
*
394420
* This function should only be called when the task is exiting as
395421
* it detaches it from its filter tree. As such, READ_ONCE() and
@@ -401,7 +427,7 @@ void seccomp_filter_release(struct task_struct *tsk)
401427

402428
/* Detach task from its filter tree. */
403429
tsk->seccomp.filter = NULL;
404-
__put_seccomp_filter(orig);
430+
__seccomp_filter_release(orig);
405431
}
406432

407433
/**
@@ -428,12 +454,15 @@ static inline void seccomp_sync_threads(unsigned long flags)
428454

429455
/* Get a task reference for the new leaf node. */
430456
get_seccomp_filter(caller);
457+
431458
/*
432459
* Drop the task reference to the shared ancestor since
433460
* current's path will hold a reference. (This also
434461
* allows a put before the assignment.)
435462
*/
436-
__put_seccomp_filter(thread->seccomp.filter);
463+
__seccomp_filter_release(thread->seccomp.filter);
464+
465+
/* Make our new filter tree visible. */
437466
smp_store_release(&thread->seccomp.filter,
438467
caller->seccomp.filter);
439468
atomic_set(&thread->seccomp.filter_count,
@@ -502,6 +531,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
502531
}
503532

504533
refcount_set(&sfilter->refs, 1);
534+
refcount_set(&sfilter->users, 1);
505535
init_waitqueue_head(&sfilter->wqh);
506536

507537
return sfilter;
@@ -606,6 +636,7 @@ void get_seccomp_filter(struct task_struct *tsk)
606636
if (!orig)
607637
return;
608638
__get_seccomp_filter(orig);
639+
refcount_inc(&orig->users);
609640
}
610641

611642
static void seccomp_init_siginfo(kernel_siginfo_t *info, int syscall, int reason)
@@ -1234,6 +1265,9 @@ static __poll_t seccomp_notify_poll(struct file *file,
12341265

12351266
mutex_unlock(&filter->notify_lock);
12361267

1268+
if (refcount_read(&filter->users) == 0)
1269+
ret |= EPOLLHUP;
1270+
12371271
return ret;
12381272
}
12391273

0 commit comments

Comments
 (0)