@@ -322,30 +322,6 @@ static int clone_parent(jmp_buf *env, int jmpval)
322322 return clone (child_func , ca .stack_ptr , CLONE_PARENT | SIGCHLD , & ca );
323323}
324324
325- /* Returns the clone(2) flag for a namespace, given the name of a namespace. */
326- static int nsflag (char * name )
327- {
328- if (!strcmp (name , "cgroup" ))
329- return CLONE_NEWCGROUP ;
330- else if (!strcmp (name , "ipc" ))
331- return CLONE_NEWIPC ;
332- else if (!strcmp (name , "mnt" ))
333- return CLONE_NEWNS ;
334- else if (!strcmp (name , "net" ))
335- return CLONE_NEWNET ;
336- else if (!strcmp (name , "pid" ))
337- return CLONE_NEWPID ;
338- else if (!strcmp (name , "user" ))
339- return CLONE_NEWUSER ;
340- else if (!strcmp (name , "uts" ))
341- return CLONE_NEWUTS ;
342- else if (!strcmp (name , "time" ))
343- return CLONE_NEWTIME ;
344-
345- /* If we don't recognise a name, fallback to 0. */
346- return 0 ;
347- }
348-
349325static uint32_t readint32 (char * buf )
350326{
351327 return * (uint32_t * ) buf ;
@@ -444,35 +420,67 @@ void nl_free(struct nlconfig_t *config)
444420 free (config -> data );
445421}
446422
447- void join_namespaces (char * nslist )
448- {
449- int num = 0 , i ;
450- char * saveptr = NULL ;
451- char * namespace = strtok_r (nslist , "," , & saveptr );
452- struct namespace_t {
453- int fd ;
454- char type [PATH_MAX ];
455- char path [PATH_MAX ];
456- } * namespaces = NULL ;
423+ struct namespace_t {
424+ int fd ;
425+ char type [PATH_MAX ];
426+ char path [PATH_MAX ];
427+ };
457428
458- if (!namespace || !strlen (namespace ) || !strlen (nslist ))
459- bail ("ns paths are empty" );
429+ typedef int nsset_t ;
430+
431+ static struct nstype_t {
432+ int type ;
433+ char * name ;
434+ } all_ns_types [] = {
435+ { CLONE_NEWCGROUP , "cgroup" },
436+ { CLONE_NEWIPC , "ipc" },
437+ { CLONE_NEWNS , "mnt" },
438+ { CLONE_NEWNET , "net" },
439+ { CLONE_NEWPID , "pid" },
440+ { CLONE_NEWTIME , "time" },
441+ { CLONE_NEWUSER , "user" },
442+ { CLONE_NEWUTS , "uts" },
443+ { }, /* null terminator */
444+ };
460445
446+ /* Returns the clone(2) flag for a namespace, given the name of a namespace. */
447+ static int nstype (char * name )
448+ {
449+ for (struct nstype_t * ns = all_ns_types ; ns -> name != NULL ; ns ++ )
450+ if (!strcmp (name , ns -> name ))
451+ return ns -> type ;
461452 /*
462- * We have to open the file descriptors first, since after
463- * we join the mnt namespace we might no longer be able to
464- * access the paths.
453+ * setns(2) lets us join namespaces without knowing the type, but
454+ * namespaces usually require special handling of some kind (so joining
455+ * a namespace without knowing its type or joining a new namespace type
456+ * without corresponding handling could result in broken behaviour) and
457+ * the rest of runc doesn't allow unknown namespace types anyway.
465458 */
459+ bail ("unknown namespace type %s" , name );
460+ }
461+
462+ static nsset_t __open_namespaces (char * nsspec , struct namespace_t * * ns_list , size_t * ns_len )
463+ {
464+ int len = 0 ;
465+ nsset_t ns_to_join = 0 ;
466+ char * namespace , * saveptr = NULL ;
467+ struct namespace_t * namespaces = NULL ;
468+
469+ namespace = strtok_r (nsspec , "," , & saveptr );
470+
471+ if (!namespace || !strlen (namespace ) || !strlen (nsspec ))
472+ bail ("ns paths are empty" );
473+
466474 do {
467475 int fd ;
468476 char * path ;
469477 struct namespace_t * ns ;
470478
471479 /* Resize the namespace array. */
472- namespaces = realloc (namespaces , ++ num * sizeof (struct namespace_t ));
480+ namespaces = realloc (namespaces , ++ len * sizeof (struct namespace_t ));
473481 if (!namespaces )
474482 bail ("failed to reallocate namespace array" );
475- ns = & namespaces [num - 1 ];
483+ ns = & namespaces [len - 1 ];
476484
477485 /* Split 'ns:path'. */
478486 path = strstr (namespace , ":" );
@@ -488,38 +496,145 @@ void join_namespaces(char *nslist)
488496 strncpy (ns -> type , namespace , PATH_MAX - 1 );
489497 strncpy (ns -> path , path , PATH_MAX - 1 );
490498 ns -> path [PATH_MAX - 1 ] = '\0' ;
491- } while ((namespace = strtok_r (NULL , "," , & saveptr )) != NULL );
492499
493- /*
494- * The ordering in which we join namespaces is important. We should
495- * always join the user namespace *first*. This is all guaranteed
496- * from the container_linux.go side of this, so we're just going to
497- * follow the order given to us.
498- */
500+ ns_to_join |= nstype (ns -> type );
501+ } while ((namespace = strtok_r (NULL , "," , & saveptr )) != NULL );
499502
500- for (i = 0 ; i < num ; i ++ ) {
501- struct namespace_t * ns = & namespaces [i ];
502- int flag = nsflag (ns -> type );
503+ * ns_list = namespaces ;
504+ * ns_len = len ;
505+ return ns_to_join ;
506+ }
503507
504- write_log (DEBUG , "setns(%#x) into %s namespace (with path %s)" , flag , ns -> type , ns -> path );
505- if (setns (ns -> fd , flag ) < 0 )
508+ /*
509+ * Try to join all namespaces that are in the "allow" nsset, and return the
510+ * set we were able to successfully join. If a permission error is returned
511+ * from setns(2), the namespace is skipped (non-permission errors are fatal).
512+ */
513+ static nsset_t __join_namespaces (nsset_t allow , struct namespace_t * ns_list , size_t ns_len )
514+ {
515+ nsset_t joined = 0 ;
516+
517+ for (size_t i = 0 ; i < ns_len ; i ++ ) {
518+ struct namespace_t * ns = & ns_list [i ];
519+ int type = nstype (ns -> type );
520+ int err , saved_errno ;
521+
522+ if (!(type & allow ))
523+ continue ;
524+
525+ err = setns (ns -> fd , type );
526+ saved_errno = errno ;
527+ write_log (DEBUG , "setns(%#x) into %s namespace (with path %s): %s" ,
528+ type , ns -> type , ns -> path , strerror (errno ));
529+ if (err < 0 ) {
530+ /* Skip permission errors. */
531+ if (saved_errno == EPERM )
532+ continue ;
506533 bail ("failed to setns into %s namespace" , ns -> type );
534+ }
535+ joined |= type ;
507536
508537 /*
509538 * If we change user namespaces, make sure we switch to root in the
510539 * namespace (this matches the logic for unshare(CLONE_NEWUSER)), lots
511540 * of things can break if we aren't the right user. See
512541 * <https://github.com/opencontainers/runc/issues/4466> for one example.
513542 */
514- if (flag == CLONE_NEWUSER ) {
543+ if (type == CLONE_NEWUSER ) {
515544 if (setresuid (0 , 0 , 0 ) < 0 )
516545 bail ("failed to become root in user namespace" );
517546 }
518547
519548 close (ns -> fd );
549+ ns -> fd = -1 ;
550+ }
551+ return joined ;
552+ }
553+
554+ static char * strappend (char * dst , char * src )
555+ {
556+ if (!dst )
557+ return strdup (src );
558+
559+ size_t len = strlen (dst ) + strlen (src ) + 1 ;
560+ dst = realloc (dst , len );
561+ strncat (dst , src , len );
562+ return dst ;
563+ }
564+
565+ static char * nsset_to_str (nsset_t nsset )
566+ {
567+ char * str = NULL ;
568+ for (struct nstype_t * ns = all_ns_types ; ns -> name != NULL ; ns ++ ) {
569+ if (ns -> type & nsset ) {
570+ if (str )
571+ str = strappend (str , ", " );
572+ str = strappend (str , ns -> name );
573+ }
574+ }
575+ return str ? : strdup ("" );
576+ }
577+
578+ static void __close_namespaces (nsset_t to_join , nsset_t joined , struct namespace_t * ns_list , size_t ns_len )
579+ {
580+ /* We expect to have joined every namespace. */
581+ nsset_t failed_to_join = to_join & ~joined ;
582+
583+ /* Double-check that we used up (and thus joined) all of the nsfds. */
584+ for (size_t i = 0 ; i < ns_len ; i ++ ) {
585+ struct namespace_t * ns = & ns_list [i ];
586+ int type = nstype (ns -> type );
587+
588+ if (ns -> fd < 0 )
589+ continue ;
590+
591+ failed_to_join |= type ;
592+ write_log (FATAL , "failed to setns(%#x) into %s namespace (with path %s): %s" ,
593+ type , ns -> type , ns -> path , strerror (EPERM ));
594+ close (ns -> fd );
595+ ns -> fd = -1 ;
520596 }
521597
522- free (namespaces );
598+ /* Make sure we joined the namespaces we planned to. */
599+ if (failed_to_join )
600+ bail ("failed to join {%s} namespaces: %s" , nsset_to_str (failed_to_join ), strerror (EPERM ));
601+
602+ free (ns_list );
603+ }
604+
605+ void join_namespaces (char * nsspec )
606+ {
607+ nsset_t to_join = 0 , joined = 0 ;
608+ struct namespace_t * ns_list ;
609+ size_t ns_len ;
610+
611+ /*
612+ * We have to open the file descriptors first, since after we join the
613+ * mnt or user namespaces we might no longer be able to access the
614+ * paths.
615+ */
616+ to_join = __open_namespaces (nsspec , & ns_list , & ns_len );
617+
618+ /*
619+ * We first try to join all non-userns namespaces, so that we join any
620+ * namespaces we might not be able to join once we switch credentials to the
621+ * container's userns. We then join the user namespace, and then try to
622+ * join any remaining namespaces (this last step is needed for rootless
623+ * containers -- we don't get setns(2) permissions until we join the userns
624+ * and get CAP_SYS_ADMIN).
625+ *
626+ * Splitting the joins this way is necessary for containers that are
627+ * configured to join some externally-created namespace but are also
628+ * configured to join an unrelated user namespace.
629+ *
630+ * This is similar to what nsenter(1) seems to do in practice.
631+ */
632+ joined |= __join_namespaces (to_join & ~(joined | CLONE_NEWUSER ), ns_list , ns_len );
633+ joined |= __join_namespaces (CLONE_NEWUSER , ns_list , ns_len );
634+ joined |= __join_namespaces (to_join & ~(joined | CLONE_NEWUSER ), ns_list , ns_len );
635+
636+ /* Verify that we joined all of the namespaces. */
637+ __close_namespaces (to_join , joined , ns_list , ns_len );
523638}
524639
525640static inline int sane_kill (pid_t pid , int signum )
0 commit comments