Skip to content

Commit beddc9f

Browse files
benzea and jmberg-intel
authored and committed
um: Add SECCOMP support detection and initialization
This detects seccomp support, sets the global using_seccomp variable and initializes the exec registers. The support is only enabled if the seccomp= kernel parameter is set to either "on" or "auto". With "auto" a fallback to ptrace mode will happen if initialization fails. Signed-off-by: Benjamin Berg <[email protected]> Signed-off-by: Benjamin Berg <[email protected]> Link: https://patch.msgid.link/[email protected] [extend help with Kconfig text from v2, use exit syscall instead of libc, remove unneeded mctx_offset assignment, disable on 32-bit for now] Signed-off-by: Johannes Berg <[email protected]>
1 parent 406d17c commit beddc9f

File tree

2 files changed

+195
-4
lines changed

2 files changed

+195
-4
lines changed

arch/um/os-Linux/registers.c

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@
1414

1515
/* This is set once at boot time and not changed thereafter */
1616

17-
static unsigned long exec_regs[MAX_REG_NR];
18-
static unsigned long *exec_fp_regs;
17+
unsigned long exec_regs[MAX_REG_NR];
18+
unsigned long *exec_fp_regs;
1919

2020
int init_pid_registers(int pid)
2121
{

arch/um/os-Linux/start_up.c

Lines changed: 193 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
// SPDX-License-Identifier: GPL-2.0
22
/*
3+
* Copyright (C) 2021 Benjamin Berg <[email protected]>
34
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
45
*/
56

@@ -24,6 +25,13 @@
2425
#include <kern_util.h>
2526
#include <mem_user.h>
2627
#include <ptrace_user.h>
28+
#include <stdbool.h>
29+
#include <stub-data.h>
30+
#include <sys/prctl.h>
31+
#include <linux/seccomp.h>
32+
#include <linux/filter.h>
33+
#include <sysdep/mcontext.h>
34+
#include <sysdep/stub.h>
2735
#include <registers.h>
2836
#include <skas.h>
2937
#include "internal.h"
@@ -224,6 +232,140 @@ static void __init check_ptrace(void)
224232
check_sysemu();
225233
}
226234

235+
/*
 * Register state that init_seccomp() fills in when the seccomp probe
 * succeeds. exec_regs and exec_fp_regs are defined (non-static) in
 * arch/um/os-Linux/registers.c; host_fp_size is defined elsewhere.
 */
extern unsigned long host_fp_size;
236+
extern unsigned long exec_regs[MAX_REG_NR];
237+
extern unsigned long *exec_fp_regs;
238+
239+
/*
 * Shared anonymous mapping used only by the seccomp probe: init_seccomp()
 * maps and unmaps it, seccomp_helper() uses its sigstack, and
 * sigsys_handler() records mctx_offset in it.
 */
__initdata static struct stub_data *seccomp_test_stub_data;
240+
241+
/*
 * SIGSYS handler installed by seccomp_helper() for the seccomp probe.
 *
 * @sig:  signal number (unused)
 * @info: signal information (unused)
 * @p:    signal context, interpreted as a ucontext_t
 *
 * Records where the mcontext sits relative to the start of the probe's
 * signal stack and then terminates the child with status 0. Does not return
 * on success of the exit syscall.
 */
static void __init sigsys_handler(int sig, siginfo_t *info, void *p)
{
	ucontext_t *ctx = p;
	unsigned long mctx_addr = (unsigned long)&ctx->uc_mcontext;
	unsigned long stack_base =
		(unsigned long)&seccomp_test_stub_data->sigstack[0];

	/* Remember the mcontext position as an offset into the signal stack */
	seccomp_test_stub_data->mctx_offset = mctx_addr - stack_base;

	/*
	 * Use the raw exit syscall so that libc does not get a chance to
	 * clear memory (mctx_offset in particular).
	 */
	syscall(__NR_exit, 0);
}
252+
253+
/*
 * Child side of the seccomp probe, started through clone() by
 * init_seccomp().
 *
 * @data: clone() argument (unused)
 *
 * Installs sigsys_handler() for SIGSYS on the probe's signal stack, installs
 * a seccomp filter that traps clock_nanosleep and allows every other
 * syscall, and then calls sleep(0) to provoke the trap.
 *
 * Exit status seen by the parent:
 *   0 - set by sigsys_handler(): filter installed and trap delivered
 *   1 - the SIGSYS handler could not be installed
 *   2 - the seccomp filter could not be installed
 *   3 - sleep(0) returned without the handler terminating the child
 *
 * The child shares the parent's address space (CLONE_VM | CLONE_VFORK), so
 * it leaves via _exit() only; libc exit() would run the parent's exit
 * handlers and flush its stdio buffers from the child.
 */
static int __init seccomp_helper(void *data)
{
	static struct sock_filter filter[] = {
		/* Load the syscall number */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
			 offsetof(struct seccomp_data, nr)),
		/* clock_nanosleep: skip one insn (-> TRAP), else fall through */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_clock_nanosleep, 1, 0),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRAP),
	};
	static struct sock_fprog prog = {
		.len = ARRAY_SIZE(filter),
		.filter = filter,
	};
	struct sigaction sa;

	set_sigstack(seccomp_test_stub_data->sigstack,
		     sizeof(seccomp_test_stub_data->sigstack));

	sa.sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO;
	sa.sa_sigaction = (void *) sigsys_handler;
	sa.sa_restorer = NULL;
	/* sa is on the stack: do not hand an indeterminate mask to the kernel */
	sigemptyset(&sa.sa_mask);
	if (sigaction(SIGSYS, &sa, NULL) < 0)
		_exit(1);

	prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
	if (syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
		    SECCOMP_FILTER_FLAG_TSYNC, &prog) != 0)
		_exit(2);

	/* Expected to end up in the trapped clock_nanosleep syscall */
	sleep(0);

	/* Never reached. */
	_exit(3);
}
287+
288+
/*
 * Probe whether seccomp-based userspace can be used on this host and, if so,
 * capture the default startup registers.
 *
 * A helper child (seccomp_helper()) installs a seccomp filter and provokes a
 * trapped syscall; its SIGSYS handler records the mcontext location in the
 * shared seccomp_test_stub_data mapping and exits with status 0. On success
 * the register state is read back with get_stub_state() and stored in
 * host_fp_size, exec_regs and exec_fp_regs.
 *
 * exec_regs and exec_fp_regs are only written once the register state has
 * been fetched successfully, so a failed probe leaves them untouched for the
 * caller's fallback path.
 *
 * Return: true if seccomp works and the registers were captured, false
 * otherwise (including on 32-bit builds, where the probe is not attempted).
 */
static bool __init init_seccomp(void)
{
	struct uml_pt_regs *regs = NULL;
	unsigned long *fp_regs = NULL;
	unsigned long fp_size = 0;
	unsigned long sp;
	bool ok = false;
	int pid;
	int status;
	int n;
	int r;

	/* doesn't work on 32-bit right now */
	if (!IS_ENABLED(CONFIG_64BIT))
		return false;

	/*
	 * We check that we can install a seccomp filter and then exit(0)
	 * from a trapped syscall.
	 *
	 * Note that we cannot verify that no seccomp filter already exists
	 * for a syscall that results in the process/thread to be killed.
	 */

	os_info("Checking that seccomp filters can be installed...");

	seccomp_test_stub_data = mmap(NULL, sizeof(*seccomp_test_stub_data),
				      PROT_READ | PROT_WRITE,
				      MAP_SHARED | MAP_ANON, -1, 0);
	if (seccomp_test_stub_data == MAP_FAILED)
		fatal_perror("check_seccomp : mmap failed");

	/* Use the syscall data area as stack, we just need something */
	sp = (unsigned long)&seccomp_test_stub_data->syscall_data +
	     sizeof(seccomp_test_stub_data->syscall_data) -
	     sizeof(void *);
	pid = clone(seccomp_helper, (void *)sp, CLONE_VFORK | CLONE_VM, NULL);

	if (pid < 0)
		fatal_perror("check_seccomp : clone failed");

	CATCH_EINTR(n = waitpid(pid, &status, __WCLONE));
	if (n < 0)
		fatal_perror("check_seccomp : waitpid failed");

	if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
		/* Status 2 is the helper's "filter could not be installed" */
		if (WIFEXITED(status) && WEXITSTATUS(status) == 2)
			os_info("missing\n");
		else
			os_info("error\n");
		goto out;
	}

	/* First pass: only learn host_fp_size from the mcontext. */
	regs = calloc(1, sizeof(*regs));
	if (!regs)
		goto out_error;

	r = get_stub_state(regs, seccomp_test_stub_data, &fp_size);
	free(regs);
	regs = NULL;
	if (r)
		goto out_regs;

	host_fp_size = fp_size;

	/* Repeat with the correct size */
	regs = calloc(1, sizeof(*regs) + host_fp_size);
	fp_regs = malloc(host_fp_size);
	if (!regs || !fp_regs)
		goto out_error;

	r = get_stub_state(regs, seccomp_test_stub_data, NULL);
	if (r)
		goto out_regs;

	/* Store as the default startup registers */
	memcpy(exec_regs, regs->gp, sizeof(exec_regs));
	memcpy(fp_regs, regs->fp, host_fp_size);
	exec_fp_regs = fp_regs;
	fp_regs = NULL;	/* now owned by exec_fp_regs */

	os_info("OK\n");
	ok = true;
	goto out;

out_regs:
	os_info("failed to fetch registers: %d\n", r);
	goto out;

out_error:
	os_info("error\n");

out:
	free(fp_regs);
	free(regs);
	munmap(seccomp_test_stub_data, sizeof(*seccomp_test_stub_data));
	return ok;
}
367+
368+
227369
static void __init check_coredump_limit(void)
228370
{
229371
struct rlimit lim;
@@ -278,6 +420,44 @@ void __init get_host_cpu_features(
278420
}
279421
}
280422

423+
static int seccomp_config __initdata;
424+
425+
static int __init uml_seccomp_config(char *line, int *add)
426+
{
427+
*add = 0;
428+
429+
if (strcmp(line, "off") == 0)
430+
seccomp_config = 0;
431+
else if (strcmp(line, "auto") == 0)
432+
seccomp_config = 1;
433+
else if (strcmp(line, "on") == 0)
434+
seccomp_config = 2;
435+
else
436+
fatal("Invalid seccomp option '%s', expected on/auto/off\n",
437+
line);
438+
439+
return 0;
440+
}
441+
442+
/*
 * Associates the "seccomp=" command line option with uml_seccomp_config()
 * and supplies the help text shown for it. The text below is emitted at
 * runtime and must stay in sync with the values the handler accepts
 * (on/auto/off).
 */
__uml_setup("seccomp=", uml_seccomp_config,
443+
"seccomp=<on/auto/off>\n"
444+
" Configure whether or not SECCOMP is used. With SECCOMP, userspace\n"
445+
" processes work collaboratively with the kernel instead of being\n"
446+
" traced using ptrace. All syscalls from the application are caught and\n"
447+
" redirected using a signal. This signal handler in turn is permitted to\n"
448+
" do the selected set of syscalls to communicate with the UML kernel and\n"
449+
" do the required memory management.\n"
450+
"\n"
451+
" This method is overall faster than the ptrace based userspace, primarily\n"
452+
" because it reduces the number of context switches for (minor) page faults.\n"
453+
"\n"
454+
" However, the SECCOMP filter is not (yet) restrictive enough to prevent\n"
455+
" userspace from reading and writing all physical memory. Userspace\n"
456+
" processes could also trick the stub into disabling SIGALRM which\n"
457+
" prevents it from being interrupted for scheduling purposes.\n"
458+
"\n"
459+
" This is insecure and should only be used with a trusted userspace\n\n"
460+
);
281461

282462
void __init os_early_checks(void)
283463
{
@@ -286,13 +466,24 @@ void __init os_early_checks(void)
286466
/* Print out the core dump limits early */
287467
check_coredump_limit();
288468

289-
check_ptrace();
290-
291469
/* Need to check this early because mmapping happens before the
292470
* kernel is running.
293471
*/
294472
check_tmpexec();
295473

474+
if (seccomp_config) {
475+
if (init_seccomp()) {
476+
using_seccomp = 1;
477+
return;
478+
}
479+
480+
if (seccomp_config == 2)
481+
fatal("SECCOMP userspace requested but not functional!\n");
482+
}
483+
484+
using_seccomp = 0;
485+
check_ptrace();
486+
296487
pid = start_ptraced_child();
297488
if (init_pid_registers(pid))
298489
fatal("Failed to initialize default registers");

0 commit comments

Comments
 (0)