1
1
// SPDX-License-Identifier: GPL-2.0
2
2
/*
3
+ * Copyright (C) 2021 Benjamin Berg <[email protected] >
3
4
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
4
5
*/
5
6
24
25
#include <kern_util.h>
25
26
#include <mem_user.h>
26
27
#include <ptrace_user.h>
28
+ #include <stdbool.h>
29
+ #include <stub-data.h>
30
+ #include <sys/prctl.h>
31
+ #include <linux/seccomp.h>
32
+ #include <linux/filter.h>
33
+ #include <sysdep/mcontext.h>
34
+ #include <sysdep/stub.h>
27
35
#include <registers.h>
28
36
#include <skas.h>
29
37
#include "internal.h"
@@ -224,6 +232,140 @@ static void __init check_ptrace(void)
224
232
check_sysemu ();
225
233
}
226
234
235
+ extern unsigned long host_fp_size ;
236
+ extern unsigned long exec_regs [MAX_REG_NR ];
237
+ extern unsigned long * exec_fp_regs ;
238
+
239
+ __initdata static struct stub_data * seccomp_test_stub_data ;
240
+
241
+ static void __init sigsys_handler (int sig , siginfo_t * info , void * p )
242
+ {
243
+ ucontext_t * uc = p ;
244
+
245
+ /* Stow away the location of the mcontext in the stack */
246
+ seccomp_test_stub_data -> mctx_offset = (unsigned long )& uc -> uc_mcontext -
247
+ (unsigned long )& seccomp_test_stub_data -> sigstack [0 ];
248
+
249
+ /* Prevent libc from clearing memory (mctx_offset in particular) */
250
+ syscall (__NR_exit , 0 );
251
+ }
252
+
253
+ static int __init seccomp_helper (void * data )
254
+ {
255
+ static struct sock_filter filter [] = {
256
+ BPF_STMT (BPF_LD | BPF_W | BPF_ABS ,
257
+ offsetof(struct seccomp_data , nr )),
258
+ BPF_JUMP (BPF_JMP | BPF_JEQ | BPF_K , __NR_clock_nanosleep , 1 , 0 ),
259
+ BPF_STMT (BPF_RET | BPF_K , SECCOMP_RET_ALLOW ),
260
+ BPF_STMT (BPF_RET | BPF_K , SECCOMP_RET_TRAP ),
261
+ };
262
+ static struct sock_fprog prog = {
263
+ .len = ARRAY_SIZE (filter ),
264
+ .filter = filter ,
265
+ };
266
+ struct sigaction sa ;
267
+
268
+ set_sigstack (seccomp_test_stub_data -> sigstack ,
269
+ sizeof (seccomp_test_stub_data -> sigstack ));
270
+
271
+ sa .sa_flags = SA_ONSTACK | SA_NODEFER | SA_SIGINFO ;
272
+ sa .sa_sigaction = (void * ) sigsys_handler ;
273
+ sa .sa_restorer = NULL ;
274
+ if (sigaction (SIGSYS , & sa , NULL ) < 0 )
275
+ exit (1 );
276
+
277
+ prctl (PR_SET_NO_NEW_PRIVS , 1 , 0 , 0 , 0 );
278
+ if (syscall (__NR_seccomp , SECCOMP_SET_MODE_FILTER ,
279
+ SECCOMP_FILTER_FLAG_TSYNC , & prog ) != 0 )
280
+ exit (2 );
281
+
282
+ sleep (0 );
283
+
284
+ /* Never reached. */
285
+ _exit (3 );
286
+ }
287
+
288
+ static bool __init init_seccomp (void )
289
+ {
290
+ int pid ;
291
+ int status ;
292
+ int n ;
293
+ unsigned long sp ;
294
+
295
+ /* doesn't work on 32-bit right now */
296
+ if (!IS_ENABLED (CONFIG_64BIT ))
297
+ return false;
298
+
299
+ /*
300
+ * We check that we can install a seccomp filter and then exit(0)
301
+ * from a trapped syscall.
302
+ *
303
+ * Note that we cannot verify that no seccomp filter already exists
304
+ * for a syscall that results in the process/thread to be killed.
305
+ */
306
+
307
+ os_info ("Checking that seccomp filters can be installed..." );
308
+
309
+ seccomp_test_stub_data = mmap (0 , sizeof (* seccomp_test_stub_data ),
310
+ PROT_READ | PROT_WRITE ,
311
+ MAP_SHARED | MAP_ANON , 0 , 0 );
312
+
313
+ /* Use the syscall data area as stack, we just need something */
314
+ sp = (unsigned long )& seccomp_test_stub_data -> syscall_data +
315
+ sizeof (seccomp_test_stub_data -> syscall_data ) -
316
+ sizeof (void * );
317
+ pid = clone (seccomp_helper , (void * )sp , CLONE_VFORK | CLONE_VM , NULL );
318
+
319
+ if (pid < 0 )
320
+ fatal_perror ("check_seccomp : clone failed" );
321
+
322
+ CATCH_EINTR (n = waitpid (pid , & status , __WCLONE ));
323
+ if (n < 0 )
324
+ fatal_perror ("check_seccomp : waitpid failed" );
325
+
326
+ if (WIFEXITED (status ) && WEXITSTATUS (status ) == 0 ) {
327
+ struct uml_pt_regs * regs ;
328
+ unsigned long fp_size ;
329
+ int r ;
330
+
331
+ /* Fill in the host_fp_size from the mcontext. */
332
+ regs = calloc (1 , sizeof (struct uml_pt_regs ));
333
+ get_stub_state (regs , seccomp_test_stub_data , & fp_size );
334
+ host_fp_size = fp_size ;
335
+ free (regs );
336
+
337
+ /* Repeat with the correct size */
338
+ regs = calloc (1 , sizeof (struct uml_pt_regs ) + host_fp_size );
339
+ r = get_stub_state (regs , seccomp_test_stub_data , NULL );
340
+
341
+ /* Store as the default startup registers */
342
+ exec_fp_regs = malloc (host_fp_size );
343
+ memcpy (exec_regs , regs -> gp , sizeof (exec_regs ));
344
+ memcpy (exec_fp_regs , regs -> fp , host_fp_size );
345
+
346
+ munmap (seccomp_test_stub_data , sizeof (* seccomp_test_stub_data ));
347
+
348
+ free (regs );
349
+
350
+ if (r ) {
351
+ os_info ("failed to fetch registers: %d\n" , r );
352
+ return false;
353
+ }
354
+
355
+ os_info ("OK\n" );
356
+ return true;
357
+ }
358
+
359
+ if (WIFEXITED (status ) && WEXITSTATUS (status ) == 2 )
360
+ os_info ("missing\n" );
361
+ else
362
+ os_info ("error\n" );
363
+
364
+ munmap (seccomp_test_stub_data , sizeof (* seccomp_test_stub_data ));
365
+ return false;
366
+ }
367
+
368
+
227
369
static void __init check_coredump_limit (void )
228
370
{
229
371
struct rlimit lim ;
@@ -278,6 +420,44 @@ void __init get_host_cpu_features(
278
420
}
279
421
}
280
422
423
+ static int seccomp_config __initdata ;
424
+
425
+ static int __init uml_seccomp_config (char * line , int * add )
426
+ {
427
+ * add = 0 ;
428
+
429
+ if (strcmp (line , "off" ) == 0 )
430
+ seccomp_config = 0 ;
431
+ else if (strcmp (line , "auto" ) == 0 )
432
+ seccomp_config = 1 ;
433
+ else if (strcmp (line , "on" ) == 0 )
434
+ seccomp_config = 2 ;
435
+ else
436
+ fatal ("Invalid seccomp option '%s', expected on/auto/off\n" ,
437
+ line );
438
+
439
+ return 0 ;
440
+ }
441
+
442
/*
 * Ties the "seccomp=" prefix to uml_seccomp_config() and supplies the
 * descriptive text for the option.
 * NOTE(review): __uml_setup() is defined elsewhere; presumably it
 * registers a command line option and the string is its help text -
 * confirm against the macro definition.
 */
__uml_setup("seccomp=", uml_seccomp_config,
"seccomp=<on/auto/off>\n"
" Configure whether or not SECCOMP is used. With SECCOMP, userspace\n"
" processes work collaboratively with the kernel instead of being\n"
" traced using ptrace. All syscalls from the application are caught and\n"
" redirected using a signal. This signal handler in turn is permitted to\n"
" do the selected set of syscalls to communicate with the UML kernel and\n"
" do the required memory management.\n"
"\n"
" This method is overall faster than the ptrace based userspace, primarily\n"
" because it reduces the number of context switches for (minor) page faults.\n"
"\n"
" However, the SECCOMP filter is not (yet) restrictive enough to prevent\n"
" userspace from reading and writing all physical memory. Userspace\n"
" processes could also trick the stub into disabling SIGALRM which\n"
" prevents it from being interrupted for scheduling purposes.\n"
"\n"
" This is insecure and should only be used with a trusted userspace\n\n"
);
281
461
282
462
void __init os_early_checks (void )
283
463
{
@@ -286,13 +466,24 @@ void __init os_early_checks(void)
286
466
/* Print out the core dump limits early */
287
467
check_coredump_limit ();
288
468
289
- check_ptrace ();
290
-
291
469
/* Need to check this early because mmapping happens before the
292
470
* kernel is running.
293
471
*/
294
472
check_tmpexec ();
295
473
474
+ if (seccomp_config ) {
475
+ if (init_seccomp ()) {
476
+ using_seccomp = 1 ;
477
+ return ;
478
+ }
479
+
480
+ if (seccomp_config == 2 )
481
+ fatal ("SECCOMP userspace requested but not functional!\n" );
482
+ }
483
+
484
+ using_seccomp = 0 ;
485
+ check_ptrace ();
486
+
296
487
pid = start_ptraced_child ();
297
488
if (init_pid_registers (pid ))
298
489
fatal ("Failed to initialize default registers" );
0 commit comments