diff --git a/Dockerfile b/Dockerfile index c971448ccbc..fd9be94c098 100644 --- a/Dockerfile +++ b/Dockerfile @@ -6,6 +6,7 @@ RUN echo 'deb http://httpredir.debian.org/debian jessie-backports main' > /etc/a RUN apt-get update && apt-get install -y \ build-essential \ curl \ + sudo \ gawk \ iptables \ jq \ @@ -22,6 +23,12 @@ RUN apt-get update && apt-get install -y \ --no-install-recommends \ && apt-get clean +# Add a dummy user for the rootless integration tests. While runC does +# not require an entry in /etc/passwd to operate, one of the tests uses +# `git clone` -- and `git clone` does not allow you to clone a +# repository if the current uid does not have an entry in /etc/passwd. +RUN useradd -u1000 -m -d/home/rootless -s/bin/bash rootless + # install bats RUN cd /tmp \ && git clone https://github.com/sstephenson/bats.git \ diff --git a/Makefile b/Makefile index b82884af6cb..1cecca176f8 100644 --- a/Makefile +++ b/Makefile @@ -4,7 +4,7 @@ SOURCES := $(shell find . 2>&1 | grep -E '.*\.(c|h|go)$$') PREFIX := $(DESTDIR)/usr/local -BINDIR := $(PREFIX)/sbin +BINDIR := $(PREFIX)/bin GIT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null) GIT_BRANCH_CLEAN := $(shell echo $(GIT_BRANCH) | sed -e "s/[^[:alnum:]]/-/g") RUNC_IMAGE := runc_dev$(if $(GIT_BRANCH_CLEAN),:$(GIT_BRANCH_CLEAN)) @@ -79,10 +79,10 @@ runcimage: docker build -t $(RUNC_IMAGE) . 
test: - make unittest integration + make unittest integration rootlessintegration localtest: - make localunittest localintegration + make localunittest localintegration localrootlessintegration unittest: runcimage docker run -e TESTFLAGS -t --privileged --rm -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) make localunittest @@ -96,6 +96,13 @@ integration: runcimage localintegration: all bats -t tests/integration${TESTFLAGS} +rootlessintegration: runcimage + docker run -e TESTFLAGS -t --privileged --rm -v $(CURDIR):/go/src/$(PROJECT) --cap-drop=ALL -u rootless $(RUNC_IMAGE) make localintegration + +# FIXME: This should not be separate from rootlessintegration's method of running. +localrootlessintegration: all + sudo -u rootless -H PATH="${PATH}" bats -t tests/integration${TESTFLAGS} + shell: all docker run -e TESTFLAGS -ti --privileged --rm -v $(CURDIR):/go/src/$(PROJECT) $(RUNC_IMAGE) bash diff --git a/checkpoint.go b/checkpoint.go index dd7704f6161..78977d71a35 100644 --- a/checkpoint.go +++ b/checkpoint.go @@ -39,6 +39,11 @@ checkpointed.`, if err := checkArgs(context, 1, exactArgs); err != nil { return err } + // XXX: Currently this is untested with rootless containers. 
+ if isRootless() { + return fmt.Errorf("runc checkpoint requires root") + } + container, err := getContainer(context) if err != nil { return err diff --git a/exec.go b/exec.go index 84061e6b705..22f2689abcc 100644 --- a/exec.go +++ b/exec.go @@ -90,9 +90,6 @@ following will output a list of processes running in the container: if err := checkArgs(context, 1, minArgs); err != nil { return err } - if os.Geteuid() != 0 { - return fmt.Errorf("runc should be run as root") - } if err := revisePidFile(context); err != nil { return err } diff --git a/libcontainer/cgroups/fs/apply_raw.go b/libcontainer/cgroups/fs/apply_raw.go index d316313c28e..22d82acb4e2 100644 --- a/libcontainer/cgroups/fs/apply_raw.go +++ b/libcontainer/cgroups/fs/apply_raw.go @@ -267,25 +267,8 @@ func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) { }, nil } -func (raw *cgroupData) parentPath(subsystem, mountpoint, root string) (string, error) { - // Use GetThisCgroupDir instead of GetInitCgroupDir, because the creating - // process could in container and shared pid namespace with host, and - // /proc/1/cgroup could point to whole other world of cgroups. - initPath, err := cgroups.GetThisCgroupDir(subsystem) - if err != nil { - return "", err - } - // This is needed for nested containers, because in /proc/self/cgroup we - // see pathes from host, which don't exist in container. - relDir, err := filepath.Rel(root, initPath) - if err != nil { - return "", err - } - return filepath.Join(mountpoint, relDir), nil -} - func (raw *cgroupData) path(subsystem string) (string, error) { - mnt, root, err := cgroups.FindCgroupMountpointAndRoot(subsystem) + mnt, err := cgroups.FindCgroupMountpoint(subsystem) // If we didn't mount the subsystem, there is no point we make the path. 
if err != nil { return "", err @@ -297,7 +280,10 @@ func (raw *cgroupData) path(subsystem string) (string, error) { return filepath.Join(raw.root, filepath.Base(mnt), raw.innerPath), nil } - parentPath, err := raw.parentPath(subsystem, mnt, root) + // Use GetOwnCgroupPath instead of GetInitCgroupPath, because the creating + // process could be in a container and share a pid namespace with the host, + // and /proc/1/cgroup could point to a whole other world of cgroups. + parentPath, err := cgroups.GetOwnCgroupPath(subsystem) if err != nil { return "", err } diff --git a/libcontainer/cgroups/rootless/rootless.go b/libcontainer/cgroups/rootless/rootless.go new file mode 100644 index 00000000000..b1efbfd9997 --- /dev/null +++ b/libcontainer/cgroups/rootless/rootless.go @@ -0,0 +1,128 @@ +// +build linux + +package rootless + +import ( + "fmt" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/configs/validate" +) + +// TODO: This is copied from libcontainer/cgroups/fs, which duplicates this code +// needlessly. We should probably export this list. + +var subsystems = []subsystem{ + &fs.CpusetGroup{}, + &fs.DevicesGroup{}, + &fs.MemoryGroup{}, + &fs.CpuGroup{}, + &fs.CpuacctGroup{}, + &fs.PidsGroup{}, + &fs.BlkioGroup{}, + &fs.HugetlbGroup{}, + &fs.NetClsGroup{}, + &fs.NetPrioGroup{}, + &fs.PerfEventGroup{}, + &fs.FreezerGroup{}, + &fs.NameGroup{GroupName: "name=systemd"}, +} + +type subsystem interface { + // Name returns the name of the subsystem. + Name() string + + // Returns the stats, as 'stats', corresponding to the cgroup under 'path'. + GetStats(path string, stats *cgroups.Stats) error +} + +// The noop cgroup manager is used for rootless containers, because we currently +// cannot manage cgroups if we are in a rootless setup. This manager is chosen +// by factory if we are in rootless mode.
We error out if any cgroup options are +// set in the config -- this may change in the future with upcoming kernel features +// like the cgroup namespace. + +type Manager struct { + Cgroups *configs.Cgroup + Paths map[string]string +} + +func (m *Manager) Apply(pid int) error { + // If there are no cgroup settings, there's nothing to do. + if m.Cgroups == nil { + return nil + } + + // We can't set paths. + // TODO(cyphar): Implement the case where the runner of a rootless container + // owns their own cgroup, which would allow us to set up a + // cgroup for each path. + if m.Cgroups.Paths != nil { + return fmt.Errorf("cannot change cgroup path in rootless container") + } + + // We load the paths into the manager. + paths := make(map[string]string) + for _, sys := range subsystems { + name := sys.Name() + + path, err := cgroups.GetOwnCgroupPath(name) + if err != nil { + // Ignore paths we couldn't resolve. + continue + } + + paths[name] = path + } + + m.Paths = paths + return nil +} + +func (m *Manager) GetPaths() map[string]string { + return m.Paths +} + +func (m *Manager) Set(container *configs.Config) error { + // We have to re-do the validation here, since someone might decide to + // update a rootless container. + return validate.New().Validate(container) +} + +func (m *Manager) GetPids() ([]int, error) { + dir, err := cgroups.GetOwnCgroupPath("devices") + if err != nil { + return nil, err + } + return cgroups.GetPids(dir) +} + +func (m *Manager) GetAllPids() ([]int, error) { + dir, err := cgroups.GetOwnCgroupPath("devices") + if err != nil { + return nil, err + } + return cgroups.GetAllPids(dir) +} + +func (m *Manager) GetStats() (*cgroups.Stats, error) { + // TODO(cyphar): We can make this work if we figure out a way to allow usage + // of cgroups with a rootless container. While this doesn't + // actually require write access to a cgroup directory, the + // statistics are not useful if they can be affected by + // non-container processes. 
+ return nil, fmt.Errorf("cannot get cgroup stats in rootless container") +} + +func (m *Manager) Freeze(state configs.FreezerState) error { + // TODO(cyphar): We can make this work if we figure out a way to allow usage + // of cgroups with a rootless container. + return fmt.Errorf("cannot use freezer cgroup in rootless container") +} + +func (m *Manager) Destroy() error { + // We don't have to do anything here because we didn't do any setup. + return nil +} diff --git a/libcontainer/cgroups/systemd/apply_systemd.go b/libcontainer/cgroups/systemd/apply_systemd.go index 2872bfac78a..456c57d975d 100644 --- a/libcontainer/cgroups/systemd/apply_systemd.go +++ b/libcontainer/cgroups/systemd/apply_systemd.go @@ -426,7 +426,7 @@ func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) { return "", err } - initPath, err := cgroups.GetInitCgroupDir(subsystem) + initPath, err := cgroups.GetInitCgroup(subsystem) if err != nil { return "", err } diff --git a/libcontainer/cgroups/utils.go b/libcontainer/cgroups/utils.go index 52fc87eb3e6..5db37344983 100644 --- a/libcontainer/cgroups/utils.go +++ b/libcontainer/cgroups/utils.go @@ -109,7 +109,7 @@ type Mount struct { Subsystems []string } -func (m Mount) GetThisCgroupDir(cgroups map[string]string) (string, error) { +func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) { if len(m.Subsystems) == 0 { return "", fmt.Errorf("no subsystem for mount") } @@ -203,8 +203,8 @@ func GetAllSubsystems() ([]string, error) { return subsystems, nil } -// GetThisCgroupDir returns the relative path to the cgroup docker is running in. -func GetThisCgroupDir(subsystem string) (string, error) { +// GetOwnCgroup returns the relative path to the cgroup docker is running in. 
+func GetOwnCgroup(subsystem string) (string, error) { cgroups, err := ParseCgroupFile("/proc/self/cgroup") if err != nil { return "", err @@ -213,8 +213,16 @@ func GetThisCgroupDir(subsystem string) (string, error) { return getControllerPath(subsystem, cgroups) } -func GetInitCgroupDir(subsystem string) (string, error) { +func GetOwnCgroupPath(subsystem string) (string, error) { + cgroup, err := GetOwnCgroup(subsystem) + if err != nil { + return "", err + } + return getCgroupPathHelper(subsystem, cgroup) +} + +func GetInitCgroup(subsystem string) (string, error) { cgroups, err := ParseCgroupFile("/proc/1/cgroup") if err != nil { return "", err @@ -223,6 +231,31 @@ func GetInitCgroupDir(subsystem string) (string, error) { return getControllerPath(subsystem, cgroups) } +func GetInitCgroupPath(subsystem string) (string, error) { + cgroup, err := GetInitCgroup(subsystem) + if err != nil { + return "", err + } + + return getCgroupPathHelper(subsystem, cgroup) +} + +func getCgroupPathHelper(subsystem, cgroup string) (string, error) { + mnt, root, err := FindCgroupMountpointAndRoot(subsystem) + if err != nil { + return "", err + } + + // This is needed for nested containers, because in /proc/self/cgroup we + // see paths from the host, which don't exist in the container. + relCgroup, err := filepath.Rel(root, cgroup) + if err != nil { + return "", err + } + + return filepath.Join(mnt, relCgroup), nil +} + func readProcsFile(dir string) ([]int, error) { f, err := os.Open(filepath.Join(dir, CgroupProcesses)) if err != nil { diff --git a/libcontainer/configs/config.go b/libcontainer/configs/config.go index 890cd7d19c7..98f4b8585f3 100644 --- a/libcontainer/configs/config.go +++ b/libcontainer/configs/config.go @@ -183,6 +183,9 @@ type Config struct { // NoNewKeyring will not allocated a new session keyring for the container. It will use the // callers keyring in this case.
NoNewKeyring bool `json:"no_new_keyring"` + + // Rootless specifies whether the container is a rootless container. + Rootless bool `json:"rootless"` } type Hooks struct { diff --git a/libcontainer/configs/config_unix.go b/libcontainer/configs/config_unix.go index a60554a7b96..84463995d05 100644 --- a/libcontainer/configs/config_unix.go +++ b/libcontainer/configs/config_unix.go @@ -4,38 +4,50 @@ package configs import "fmt" -// HostUID gets the root uid for the process on host which could be non-zero -// when user namespaces are enabled. -func (c Config) HostUID() (int, error) { +// HostUID gets the translated uid for the process on host which could be +// different when user namespaces are enabled. +func (c Config) HostUID(containerId int) (int, error) { if c.Namespaces.Contains(NEWUSER) { if c.UidMappings == nil { - return -1, fmt.Errorf("User namespaces enabled, but no user mappings found.") + return -1, fmt.Errorf("User namespaces enabled, but no uid mappings found.") } - id, found := c.hostIDFromMapping(0, c.UidMappings) + id, found := c.hostIDFromMapping(containerId, c.UidMappings) if !found { - return -1, fmt.Errorf("User namespaces enabled, but no root user mapping found.") + return -1, fmt.Errorf("User namespaces enabled, but no user mapping found.") } return id, nil } - // Return default root uid 0 - return 0, nil + // Return unchanged id. + return containerId, nil } -// HostGID gets the root gid for the process on host which could be non-zero +// HostRootUID gets the root uid for the process on host which could be non-zero // when user namespaces are enabled. -func (c Config) HostGID() (int, error) { +func (c Config) HostRootUID() (int, error) { + return c.HostUID(0) +} + +// HostGID gets the translated gid for the process on host which could be +// different when user namespaces are enabled. 
+func (c Config) HostGID(containerId int) (int, error) { if c.Namespaces.Contains(NEWUSER) { if c.GidMappings == nil { return -1, fmt.Errorf("User namespaces enabled, but no gid mappings found.") } - id, found := c.hostIDFromMapping(0, c.GidMappings) + id, found := c.hostIDFromMapping(containerId, c.GidMappings) if !found { - return -1, fmt.Errorf("User namespaces enabled, but no root group mapping found.") + return -1, fmt.Errorf("User namespaces enabled, but no group mapping found.") } return id, nil } - // Return default root gid 0 - return 0, nil + // Return unchanged id. + return containerId, nil +} + +// HostRootGID gets the root gid for the process on host which could be non-zero +// when user namespaces are enabled. +func (c Config) HostRootGID() (int, error) { + return c.HostGID(0) } // Utility function that gets a host ID for a container ID from user namespace map diff --git a/libcontainer/configs/config_unix_test.go b/libcontainer/configs/config_unix_test.go index dc01cd0132f..7f966152d6d 100644 --- a/libcontainer/configs/config_unix_test.go +++ b/libcontainer/configs/config_unix_test.go @@ -65,11 +65,11 @@ func TestRemoveNamespace(t *testing.T) { } } -func TestHostUIDNoUSERNS(t *testing.T) { +func TestHostRootUIDNoUSERNS(t *testing.T) { config := &Config{ Namespaces: Namespaces{}, } - uid, err := config.HostUID() + uid, err := config.HostRootUID() if err != nil { t.Fatal(err) } @@ -78,7 +78,7 @@ func TestHostUIDNoUSERNS(t *testing.T) { } } -func TestHostUIDWithUSERNS(t *testing.T) { +func TestHostRootUIDWithUSERNS(t *testing.T) { config := &Config{ Namespaces: Namespaces{{Type: NEWUSER}}, UidMappings: []IDMap{ @@ -89,7 +89,7 @@ func TestHostUIDWithUSERNS(t *testing.T) { }, }, } - uid, err := config.HostUID() + uid, err := config.HostRootUID() if err != nil { t.Fatal(err) } @@ -98,11 +98,11 @@ func TestHostUIDWithUSERNS(t *testing.T) { } } -func TestHostGIDNoUSERNS(t *testing.T) { +func TestHostRootGIDNoUSERNS(t *testing.T) { config := &Config{ 
Namespaces: Namespaces{}, } - uid, err := config.HostGID() + uid, err := config.HostRootGID() if err != nil { t.Fatal(err) } @@ -111,7 +111,7 @@ func TestHostGIDNoUSERNS(t *testing.T) { } } -func TestHostGIDWithUSERNS(t *testing.T) { +func TestHostRootGIDWithUSERNS(t *testing.T) { config := &Config{ Namespaces: Namespaces{{Type: NEWUSER}}, GidMappings: []IDMap{ @@ -122,7 +122,7 @@ func TestHostGIDWithUSERNS(t *testing.T) { }, }, } - uid, err := config.HostGID() + uid, err := config.HostRootGID() if err != nil { t.Fatal(err) } diff --git a/libcontainer/configs/validate/rootless.go b/libcontainer/configs/validate/rootless.go new file mode 100644 index 00000000000..0cebfaf801a --- /dev/null +++ b/libcontainer/configs/validate/rootless.go @@ -0,0 +1,117 @@ +package validate + +import ( + "fmt" + "os" + "reflect" + "strings" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +var ( + geteuid = os.Geteuid + getegid = os.Getegid +) + +func (v *ConfigValidator) rootless(config *configs.Config) error { + if err := rootlessMappings(config); err != nil { + return err + } + if err := rootlessMount(config); err != nil { + return err + } + // Currently, cgroups cannot effectively be used in rootless containers. + // The new cgroup namespace doesn't really help us either because it doesn't + // have nice interactions with the user namespace (we're working with upstream + // to fix this). + if err := rootlessCgroup(config); err != nil { + return err + } + + // XXX: We currently can't verify the user config at all, because + // configs.Config doesn't store the user-related configs. So this + // has to be verified by setupUser() in init_linux.go. 
+ + return nil +} + +func rootlessMappings(config *configs.Config) error { + rootuid, err := config.HostRootUID() + if err != nil { + return fmt.Errorf("failed to get root uid from uidMappings: %v", err) + } + if euid := geteuid(); euid != 0 { + if !config.Namespaces.Contains(configs.NEWUSER) { + return fmt.Errorf("rootless containers require user namespaces") + } + if rootuid != euid { + return fmt.Errorf("rootless containers cannot map container root to a different host user") + } + } + + rootgid, err := config.HostRootGID() + if err != nil { + return fmt.Errorf("failed to get root gid from gidMappings: %v", err) + } + + // Similar to the above test, we need to make sure that we aren't trying to + // map to a group ID that we don't have the right to be. + if rootgid != getegid() { + return fmt.Errorf("rootless containers cannot map container root to a different host group") + } + + // We can only map one user and group inside a container (our own). + if len(config.UidMappings) != 1 || config.UidMappings[0].Size != 1 { + return fmt.Errorf("rootless containers cannot map more than one user") + } + if len(config.GidMappings) != 1 || config.GidMappings[0].Size != 1 { + return fmt.Errorf("rootless containers cannot map more than one group") + } + + return nil +} + +// cgroup verifies that the user isn't trying to set any cgroup limits or paths. +func rootlessCgroup(config *configs.Config) error { + // Nothing set at all. + if config.Cgroups == nil || config.Cgroups.Resources == nil { + return nil + } + + // Used for comparing to the zero value. + left := reflect.ValueOf(*config.Cgroups.Resources) + right := reflect.Zero(left.Type()) + + // This is all we need to do, since specconv won't add cgroup options in + // rootless mode. 
+ if !reflect.DeepEqual(left.Interface(), right.Interface()) { + return fmt.Errorf("cannot specify resource limits in rootless container") + } + + return nil +} + +// mount verifies that the user isn't trying to set up any mounts they don't have +// the rights to do. In addition, it makes sure that no mount has a `uid=` or +// `gid=` option that doesn't resolve to root. +func rootlessMount(config *configs.Config) error { + // XXX: We could whitelist allowed devices at this point, but I'm not + // convinced that's a good idea. The kernel is the best arbiter of + // access control. + + for _, mount := range config.Mounts { + // Check that the options list doesn't contain any uid= or gid= entries + // that don't resolve to root. + for _, opt := range strings.Split(mount.Data, ",") { + if strings.HasPrefix(opt, "uid=") && opt != "uid=0" { + return fmt.Errorf("cannot specify uid= mount options in rootless containers where argument isn't 0") + } + if strings.HasPrefix(opt, "gid=") && opt != "gid=0" { + return fmt.Errorf("cannot specify gid= mount options in rootless containers where argument isn't 0") + } + } + } + + return nil +} diff --git a/libcontainer/configs/validate/rootless_test.go b/libcontainer/configs/validate/rootless_test.go new file mode 100644 index 00000000000..23d678d97ea --- /dev/null +++ b/libcontainer/configs/validate/rootless_test.go @@ -0,0 +1,195 @@ +package validate + +import ( + "testing" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +func init() { + geteuid = func() int { return 1337 } + getegid = func() int { return 7331 } +} + +func rootlessConfig() *configs.Config { + return &configs.Config{ + Rootfs: "/var", + Rootless: true, + Namespaces: configs.Namespaces( + []configs.Namespace{ + {Type: configs.NEWUSER}, + }, + ), + UidMappings: []configs.IDMap{ + { + HostID: geteuid(), + ContainerID: 0, + Size: 1, + }, + }, + GidMappings: []configs.IDMap{ + { + HostID: getegid(), + ContainerID: 0, + Size: 1, + }, + }, + } +} + +func 
TestValidateRootless(t *testing.T) { + validator := New() + + config := rootlessConfig() + if err := validator.Validate(config); err != nil { + t.Errorf("Expected error to not occur: %+v", err) + } +} + +/* rootlessMappings() */ + +func TestValidateRootlessUserns(t *testing.T) { + validator := New() + + config := rootlessConfig() + config.Namespaces = nil + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur if user namespaces not set") + } +} + +func TestValidateRootlessMappingUid(t *testing.T) { + validator := New() + + config := rootlessConfig() + config.UidMappings = nil + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur if no uid mappings provided") + } + + config = rootlessConfig() + config.UidMappings[0].HostID = geteuid() + 1 + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur if geteuid() != mapped uid") + } + + config = rootlessConfig() + config.UidMappings[0].Size = 1024 + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur if more than one uid mapped") + } + + config = rootlessConfig() + config.UidMappings = append(config.UidMappings, configs.IDMap{ + HostID: geteuid() + 1, + ContainerID: 0, + Size: 1, + }) + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur if more than one uid extent mapped") + } +} + +func TestValidateRootlessMappingGid(t *testing.T) { + validator := New() + + config := rootlessConfig() + config.GidMappings = nil + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur if no gid mappings provided") + } + + config = rootlessConfig() + config.GidMappings[0].HostID = getegid() + 1 + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur if getegid() != mapped gid") + } + + config = rootlessConfig() + config.GidMappings[0].Size = 1024 + if err := validator.Validate(config); err == nil { + 
t.Errorf("Expected error to occur if more than one gid mapped") + } + + config = rootlessConfig() + config.GidMappings = append(config.GidMappings, configs.IDMap{ + HostID: getegid() + 1, + ContainerID: 0, + Size: 1, + }) + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur if more than one gid extent mapped") + } +} + +/* rootlessMount() */ + +func TestValidateRootlessMountUid(t *testing.T) { + config := rootlessConfig() + validator := New() + + config.Mounts = []*configs.Mount{ + { + Source: "devpts", + Destination: "/dev/pts", + Device: "devpts", + }, + } + + if err := validator.Validate(config); err != nil { + t.Errorf("Expected error to not occur when uid= not set in mount options: %+v", err) + } + + config.Mounts[0].Data = "uid=5" + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur when setting uid=5 in mount options") + } + + config.Mounts[0].Data = "uid=0" + if err := validator.Validate(config); err != nil { + t.Errorf("Expected error to not occur when setting uid=0 in mount options: %+v", err) + } +} + +func TestValidateRootlessMountGid(t *testing.T) { + config := rootlessConfig() + validator := New() + + config.Mounts = []*configs.Mount{ + { + Source: "devpts", + Destination: "/dev/pts", + Device: "devpts", + }, + } + + if err := validator.Validate(config); err != nil { + t.Errorf("Expected error to not occur when gid= not set in mount options: %+v", err) + } + + config.Mounts[0].Data = "gid=5" + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur when setting gid=5 in mount options") + } + + config.Mounts[0].Data = "gid=0" + if err := validator.Validate(config); err != nil { + t.Errorf("Expected error to not occur when setting gid=0 in mount options: %+v", err) + } +} + +/* rootlessCgroup() */ + +func TestValidateRootlessCgroup(t *testing.T) { + validator := New() + + config := rootlessConfig() + config.Cgroups = &configs.Cgroup{ + Resources: 
&configs.Resources{ + PidsLimit: 1337, + }, + } + if err := validator.Validate(config); err == nil { + t.Errorf("Expected error to occur if cgroup limits set") + } +} diff --git a/libcontainer/configs/validate/validator.go b/libcontainer/configs/validate/validator.go index f076f506a24..0dd580ac901 100644 --- a/libcontainer/configs/validate/validator.go +++ b/libcontainer/configs/validate/validator.go @@ -40,6 +40,11 @@ func (v *ConfigValidator) Validate(config *configs.Config) error { if err := v.sysctl(config); err != nil { return err } + if config.Rootless { + if err := v.rootless(config); err != nil { + return err + } + } return nil } diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 28dff866be3..faecc4683e5 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -51,6 +51,9 @@ type State struct { // Platform specific fields below here + // Specifies if the container was started under the rootless mode. + Rootless bool `json:"rootless"` + // Path to all the cgroups setup for a container. Key is cgroup subsystem name // with the value as the path. 
CgroupPaths map[string]string `json:"cgroup_paths"` @@ -304,11 +307,11 @@ func (c *linuxContainer) Signal(s os.Signal, all bool) error { } func (c *linuxContainer) createExecFifo() error { - rootuid, err := c.Config().HostUID() + rootuid, err := c.Config().HostRootUID() if err != nil { return err } - rootgid, err := c.Config().HostGID() + rootgid, err := c.Config().HostRootGID() if err != nil { return err } @@ -452,6 +455,7 @@ func (c *linuxContainer) newInitConfig(process *Process) *initConfig { PassedFilesCount: len(process.ExtraFiles), ContainerId: c.ID(), NoNewPrivileges: c.config.NoNewPrivileges, + Rootless: c.config.Rootless, AppArmorProfile: c.config.AppArmorProfile, ProcessLabel: c.config.ProcessLabel, Rlimits: c.config.Rlimits, @@ -516,10 +520,18 @@ func (c *linuxContainer) Resume() error { } func (c *linuxContainer) NotifyOOM() (<-chan struct{}, error) { + // XXX(cyphar): This requires cgroups. + if c.config.Rootless { + return nil, fmt.Errorf("cannot get OOM notifications from rootless container") + } return notifyOnOOM(c.cgroupManager.GetPaths()) } func (c *linuxContainer) NotifyMemoryPressure(level PressureLevel) (<-chan struct{}, error) { + // XXX(cyphar): This requires cgroups. + if c.config.Rootless { + return nil, fmt.Errorf("cannot get memory pressure notifications from rootless container") + } return notifyMemoryPressure(c.cgroupManager.GetPaths(), level) } @@ -622,6 +634,13 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error { c.m.Lock() defer c.m.Unlock() + // TODO(avagin): Figure out how to make this work nicely. CRIU 2.0 has + // support for doing unprivileged dumps, but the setup of + // rootless containers might make this complicated. 
+ if c.config.Rootless { + return fmt.Errorf("cannot checkpoint a rootless container") + } + if err := c.checkCriuVersion("1.5.2"); err != nil { return err } @@ -791,6 +810,13 @@ func (c *linuxContainer) restoreNetwork(req *criurpc.CriuReq, criuOpts *CriuOpts func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { c.m.Lock() defer c.m.Unlock() + + // TODO(avagin): Figure out how to make this work nicely. CRIU doesn't have + // support for unprivileged restore at the moment. + if c.config.Rootless { + return fmt.Errorf("cannot restore a rootless container") + } + if err := c.checkCriuVersion("1.5.2"); err != nil { return err } @@ -918,6 +944,7 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error { } func (c *linuxContainer) criuApplyCgroups(pid int, req *criurpc.CriuReq) error { + // XXX: Do we need to deal with this case? AFAIK criu still requires root. if err := c.cgroupManager.Apply(pid); err != nil { return err } @@ -1314,6 +1341,7 @@ func (c *linuxContainer) currentState() (*State, error) { InitProcessStartTime: startTime, Created: c.created, }, + Rootless: c.config.Rootless, CgroupPaths: c.cgroupManager.GetPaths(), NamespacePaths: make(map[configs.NamespaceType]string), ExternalDescriptors: externalDescriptors, @@ -1441,19 +1469,34 @@ func (c *linuxContainer) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Na Type: GidmapAttr, Value: b, }) - // check if we have CAP_SETGID to setgroup properly - pid, err := capability.NewPid(os.Getpid()) - if err != nil { - return nil, err - } - if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) { - r.AddData(&Boolmsg{ - Type: SetgroupAttr, - Value: true, - }) + // The following only applies if we are root. 
+ if !c.config.Rootless { + // check if we have CAP_SETGID to setgroup properly + pid, err := capability.NewPid(os.Getpid()) + if err != nil { + return nil, err + } + if !pid.Get(capability.EFFECTIVE, capability.CAP_SETGID) { + r.AddData(&Boolmsg{ + Type: SetgroupAttr, + Value: true, + }) + } } } } + // write oom_score_adj + r.AddData(&Bytemsg{ + Type: OomScoreAdjAttr, + Value: []byte(fmt.Sprintf("%d", c.config.OomScoreAdj)), + }) + + // write rootless + r.AddData(&Boolmsg{ + Type: RootlessAttr, + Value: c.config.Rootless, + }) + return bytes.NewReader(r.Serialize()), nil } diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go index d553287553a..6a0f8558373 100644 --- a/libcontainer/factory_linux.go +++ b/libcontainer/factory_linux.go @@ -15,6 +15,7 @@ import ( "github.com/docker/docker/pkg/mount" "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fs" + "github.com/opencontainers/runc/libcontainer/cgroups/rootless" "github.com/opencontainers/runc/libcontainer/cgroups/systemd" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs/validate" @@ -73,6 +74,20 @@ func Cgroupfs(l *LinuxFactory) error { return nil } +// RootlessCgroups is an options func to configure a LinuxFactory to +// return containers that use the "rootless" cgroup manager, which will +// fail to do any operations not possible to do with an unprivileged user. +// It should only be used in conjunction with rootless containers. +func RootlessCgroups(l *LinuxFactory) error { + l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { + return &rootless.Manager{ + Cgroups: config, + Paths: paths, + } + } + return nil +} + // TmpfsRoot is an option func to mount LinuxFactory.Root to tmpfs. 
func TmpfsRoot(l *LinuxFactory) error { mounted, err := mount.Mounted(l.Root) @@ -149,11 +164,11 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err if err := l.Validator.Validate(config); err != nil { return nil, newGenericError(err, ConfigInvalid) } - uid, err := config.HostUID() + uid, err := config.HostRootUID() if err != nil { return nil, newGenericError(err, SystemError) } - gid, err := config.HostGID() + gid, err := config.HostRootGID() if err != nil { return nil, newGenericError(err, SystemError) } @@ -169,6 +184,9 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err if err := os.Chown(containerRoot, uid, gid); err != nil { return nil, newGenericError(err, SystemError) } + if config.Rootless { + RootlessCgroups(l) + } c := &linuxContainer{ id: id, root: containerRoot, @@ -195,6 +213,10 @@ func (l *LinuxFactory) Load(id string) (Container, error) { processStartTime: state.InitProcessStartTime, fds: state.ExternalDescriptors, } + // We have to use the RootlessManager. + if state.Rootless { + RootlessCgroups(l) + } c := &linuxContainer{ initProcess: r, initProcessStartTime: state.InitProcessStartTime, diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index 39b83a4eb10..99cc02cbd02 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -6,10 +6,8 @@ import ( "encoding/json" "fmt" "io" - "io/ioutil" "net" "os" - "strconv" "strings" "syscall" "unsafe" @@ -60,6 +58,7 @@ type initConfig struct { ContainerId string `json:"containerid"` Rlimits []configs.Rlimit `json:"rlimits"` CreateConsole bool `json:"create_console"` + Rootless bool `json:"rootless"` } type initer interface { @@ -231,18 +230,21 @@ func syncParentHooks(pipe io.ReadWriter) error { func setupUser(config *initConfig) error { // Set up defaults. 
defaultExecUser := user.ExecUser{ - Uid: syscall.Getuid(), - Gid: syscall.Getgid(), + Uid: 0, + Gid: 0, Home: "/", } + passwdPath, err := user.GetPasswdPath() if err != nil { return err } + groupPath, err := user.GetGroupPath() if err != nil { return err } + execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath) if err != nil { return err @@ -255,22 +257,49 @@ func setupUser(config *initConfig) error { return err } } + + if config.Rootless { + if execUser.Uid != 0 { + return fmt.Errorf("cannot run as a non-root user in a rootless container") + } + + if execUser.Gid != 0 { + return fmt.Errorf("cannot run as a non-root group in a rootless container") + } + + // We cannot set any additional groups in a rootless container and thus we + // bail if the user asked us to do so. TODO: We currently can't do this + // earlier, but if libcontainer.Process.User was typesafe this might work. + if len(addGroups) > 0 { + return fmt.Errorf("cannot set any additional groups in a rootless container") + } + } + // before we change to the container's user make sure that the processes STDIO // is correctly owned by the user that we are switching to. - if err := fixStdioPermissions(execUser); err != nil { + if err := fixStdioPermissions(config, execUser); err != nil { return err } - suppGroups := append(execUser.Sgids, addGroups...) - if err := syscall.Setgroups(suppGroups); err != nil { - return err + + // This isn't allowed in an unprivileged user namespace since Linux 3.19. + // There's nothing we can do about /etc/group entries, so we silently + // ignore setting groups here (since the user didn't explicitly ask us to + // set the group). + if !config.Rootless { + suppGroups := append(execUser.Sgids, addGroups...) 
+ if err := syscall.Setgroups(suppGroups); err != nil { + return err + } } if err := system.Setgid(execUser.Gid); err != nil { return err } + if err := system.Setuid(execUser.Uid); err != nil { return err } + // if we didn't get HOME already, set it based on the user's HOME if envHome := os.Getenv("HOME"); envHome == "" { if err := os.Setenv("HOME", execUser.Home); err != nil { @@ -283,7 +312,7 @@ func setupUser(config *initConfig) error { // fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified user. // The ownership needs to match because it is created outside of the container and needs to be // localized. -func fixStdioPermissions(u *user.ExecUser) error { +func fixStdioPermissions(config *initConfig, u *user.ExecUser) error { var null syscall.Stat_t if err := syscall.Stat("/dev/null", &null); err != nil { return err @@ -297,10 +326,20 @@ func fixStdioPermissions(u *user.ExecUser) error { if err := syscall.Fstat(int(fd), &s); err != nil { return err } + // Skip chown of /dev/null if it was used as one of the STDIO fds. if s.Rdev == null.Rdev { continue } + + // Skip chown if s.Gid is actually an unmapped gid in the host. While + // this is a bit dodgy if it just so happens that the console _is_ + // owned by overflow_gid, there's no way for us to disambiguate this as + // a userspace program. + if _, err := config.Config.HostGID(int(s.Gid)); err != nil { + continue + } + // We only change the uid owner (as it is possible for the mount to // prefer a different gid, and there's no reason for us to change it). 
// The reason why we don't just leave the default uid=X mount setup is @@ -369,12 +408,6 @@ func setupRlimits(limits []configs.Rlimit, pid int) error { return nil } -func setOomScoreAdj(oomScoreAdj int, pid int) error { - path := fmt.Sprintf("/proc/%d/oom_score_adj", pid) - - return ioutil.WriteFile(path, []byte(strconv.Itoa(oomScoreAdj)), 0600) -} - const _P_PID = 1 type siginfo struct { diff --git a/libcontainer/message_linux.go b/libcontainer/message_linux.go index a189c7244bf..bc725a227d6 100644 --- a/libcontainer/message_linux.go +++ b/libcontainer/message_linux.go @@ -11,12 +11,15 @@ import ( // list of known message types we want to send to bootstrap program // The number is randomly chosen to not conflict with known netlink types const ( - InitMsg uint16 = 62000 - CloneFlagsAttr uint16 = 27281 - NsPathsAttr uint16 = 27282 - UidmapAttr uint16 = 27283 - GidmapAttr uint16 = 27284 - SetgroupAttr uint16 = 27285 + InitMsg uint16 = 62000 + CloneFlagsAttr uint16 = 27281 + NsPathsAttr uint16 = 27282 + UidmapAttr uint16 = 27283 + GidmapAttr uint16 = 27284 + SetgroupAttr uint16 = 27285 + OomScoreAdjAttr uint16 = 27286 + RootlessAttr uint16 = 27287 + // When syscall.NLA_HDRLEN is in gccgo, take this out. syscall_NLA_HDRLEN = (syscall.SizeofNlAttr + syscall.NLA_ALIGNTO - 1) & ^(syscall.NLA_ALIGNTO - 1) ) diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index 51bd1e3eccc..0ad68834388 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -72,18 +72,23 @@ struct nlconfig_t { char *namespaces; size_t namespaces_len; uint8_t is_setgroup; + uint8_t is_rootless; + char *oom_score_adj; + size_t oom_score_adj_len; }; /* * List of netlink message types sent to us as part of bootstrapping the init. * These constants are defined in libcontainer/message_linux.go. 
*/ -#define INIT_MSG 62000 +#define INIT_MSG 62000 #define CLONE_FLAGS_ATTR 27281 #define NS_PATHS_ATTR 27282 -#define UIDMAP_ATTR 27283 -#define GIDMAP_ATTR 27284 +#define UIDMAP_ATTR 27283 +#define GIDMAP_ATTR 27284 #define SETGROUP_ATTR 27285 +#define OOM_SCORE_ADJ_ATTR 27286 +#define ROOTLESS_ATTR 27287 /* * Use the raw syscall for versions of glibc which don't include a function for @@ -172,6 +177,7 @@ static void update_setgroups(int pid, enum policy_t setgroup) policy = "deny"; break; case SETGROUPS_DEFAULT: + default: /* Nothing to do. */ return; } @@ -186,7 +192,7 @@ static void update_setgroups(int pid, enum policy_t setgroup) } } -static void update_uidmap(int pid, char *map, int map_len) +static void update_uidmap(int pid, char *map, size_t map_len) { if (map == NULL || map_len <= 0) return; @@ -195,7 +201,7 @@ static void update_uidmap(int pid, char *map, int map_len) bail("failed to update /proc/%d/uid_map", pid); } -static void update_gidmap(int pid, char *map, int map_len) +static void update_gidmap(int pid, char *map, size_t map_len) { if (map == NULL || map_len <= 0) return; @@ -204,6 +210,15 @@ static void update_gidmap(int pid, char *map, int map_len) bail("failed to update /proc/%d/gid_map", pid); } +static void update_oom_score_adj(char *data, size_t len) +{ + if (data == NULL || len <= 0) + return; + + if (write_file(data, len, "/proc/self/oom_score_adj") < 0) + bail("failed to update /proc/self/oom_score_adj"); +} + /* A dummy function that just jumps to the given jumpval. 
*/ static int child_func(void *arg) __attribute__ ((noinline)); static int child_func(void *arg) @@ -317,6 +332,13 @@ static void nl_parse(int fd, struct nlconfig_t *config) case CLONE_FLAGS_ATTR: config->cloneflags = readint32(current); break; + case ROOTLESS_ATTR: + config->is_rootless = readint8(current); + break; + case OOM_SCORE_ADJ_ATTR: + config->oom_score_adj = current; + config->oom_score_adj_len = payload_len; + break; case NS_PATHS_ATTR: config->namespaces = current; config->namespaces_len = payload_len; @@ -425,14 +447,32 @@ void nsexec(void) if (pipenum == -1) return; - /* make the process non-dumpable */ - if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) != 0) { - bail("failed to set process as non-dumpable"); - } - /* Parse all of the netlink configuration. */ nl_parse(pipenum, &config); + /* Set oom_score_adj. This has to be done before !dumpable because + * /proc/self/oom_score_adj is not writeable unless you're a privileged + * user (if !dumpable is set). All children inherit their parent's + * oom_score_adj value on fork(2) so this will always be propagated + * properly. + */ + update_oom_score_adj(config.oom_score_adj, config.oom_score_adj_len); + + /* + * Make the process non-dumpable, to avoid various race conditions that + * could cause processes in namespaces we're joining to access host + * resources (or potentially execute code). + * + * However, if the number of namespaces we are joining is 0, we are not + * going to be switching to a different security context. Thus setting + * ourselves to be non-dumpable only breaks things (like rootless + * containers), which is the recommendation from the kernel folks. + */ + if (config.namespaces) { + if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) + bail("failed to set process as non-dumpable"); + } + /* Pipe so we can tell the child when we've finished setting up.
*/ if (socketpair(AF_LOCAL, SOCK_STREAM, 0, sync_child_pipe) < 0) bail("failed to setup sync pipe between parent and child"); @@ -540,9 +580,21 @@ void nsexec(void) exit(ret); case SYNC_USERMAP_PLS: - /* Enable setgroups(2) if we've been asked to. */ + /* + * Enable setgroups(2) if we've been asked to. But we also + * have to explicitly disable setgroups(2) if we're + * creating a rootless container (this is required since + * Linux 3.19). + */ + if (config.is_rootless && config.is_setgroup) { + kill(child, SIGKILL); + bail("cannot allow setgroup in an unprivileged user namespace setup"); + } + if (config.is_setgroup) update_setgroups(child, SETGROUPS_ALLOW); + if (config.is_rootless) + update_setgroups(child, SETGROUPS_DENY); /* Set up mappings. */ update_uidmap(child, config.uidmap, config.uidmap_len); @@ -681,6 +733,11 @@ void nsexec(void) * clone_parent rant). So signal our parent to hook us up. */ + /* Switching is only necessary if we joined namespaces. */ + if (config.namespaces) { + if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0) + bail("failed to set process as dumpable"); + } s = SYNC_USERMAP_PLS; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) bail("failed to sync with parent: write(SYNC_USERMAP_PLS)"); @@ -691,6 +748,11 @@ void nsexec(void) bail("failed to sync with parent: read(SYNC_USERMAP_ACK)"); if (s != SYNC_USERMAP_ACK) bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s); + /* Switching is only necessary if we joined namespaces. 
*/ + if (config.namespaces) { + if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) + bail("failed to set process as non-dumpable"); + } } /* @@ -774,8 +836,10 @@ void nsexec(void) if (setgid(0) < 0) bail("setgid failed"); - if (setgroups(0, NULL) < 0) - bail("setgroups failed"); + if (!config.is_rootless && config.is_setgroup) { + if (setgroups(0, NULL) < 0) + bail("setgroups failed"); + } s = SYNC_CHILD_READY; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 0f79a3811b7..bfe99551d4e 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -80,15 +80,12 @@ func (p *setnsProcess) start() (err error) { if err = p.execSetns(); err != nil { return newSystemErrorWithCause(err, "executing setns process") } - if len(p.cgroupPaths) > 0 { + // We can't join cgroups if we're in a rootless container. + if !p.config.Rootless && len(p.cgroupPaths) > 0 { if err := cgroups.EnterPid(p.cgroupPaths, p.pid()); err != nil { return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid()) } } - // set oom_score_adj - if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil { - return newSystemErrorWithCause(err, "setting oom score") - } // set rlimits, this has to be done here because we lose permissions // to raise the limits once we enter a user-namespace if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { @@ -257,8 +254,9 @@ func (p *initProcess) start() error { return newSystemErrorWithCausef(err, "getting pipe fds for pid %d", p.pid()) } p.setExternalDescriptors(fds) - // Do this before syncing with child so that no children - // can escape the cgroup + // Do this before syncing with child so that no children can escape the + // cgroup. We don't need to worry about not doing this and not being root + // because we'd be using the rootless cgroup manager in that case.
if err := p.manager.Apply(p.pid()); err != nil { return newSystemErrorWithCause(err, "applying cgroup configuration for process") } @@ -285,10 +283,6 @@ func (p *initProcess) start() error { if err := p.manager.Set(p.config.Config); err != nil { return newSystemErrorWithCause(err, "setting cgroup config for ready process") } - // set oom_score_adj - if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil { - return newSystemErrorWithCause(err, "setting oom score for ready process") - } // set rlimits, this has to be done here because we lose permissions // to raise the limits once we enter a user-namespace if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { @@ -432,6 +426,12 @@ func getPipeFds(pid int) ([]string, error) { f := filepath.Join(dirPath, strconv.Itoa(i)) target, err := os.Readlink(f) if err != nil { + // Ignore permission errors, for rootless containers and other + // non-dumpable processes. if we can't get the fd for a particular + // file, there's not much we can do. 
+ if os.IsPermission(err) { + continue + } return fds, err } fds[i] = target diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index 2635fd6f99c..b4948687e27 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -348,7 +348,7 @@ func getCgroupMounts(m *configs.Mount) ([]*configs.Mount, error) { var binds []*configs.Mount for _, mm := range mounts { - dir, err := mm.GetThisCgroupDir(cgroupPaths) + dir, err := mm.GetOwnCgroup(cgroupPaths) if err != nil { return nil, err } diff --git a/libcontainer/specconv/example.go b/libcontainer/specconv/example.go new file mode 100644 index 00000000000..9a4460ce7fb --- /dev/null +++ b/libcontainer/specconv/example.go @@ -0,0 +1,227 @@ +package specconv + +import ( + "os" + "runtime" + "strings" + + "github.com/opencontainers/runtime-spec/specs-go" +) + +func sPtr(s string) *string { return &s } + +// Example returns an example spec file, with many options set so a user can +// see what a standard spec file looks like. 
+func Example() *specs.Spec { + return &specs.Spec{ + Version: specs.Version, + Platform: specs.Platform{ + OS: runtime.GOOS, + Arch: runtime.GOARCH, + }, + Root: specs.Root{ + Path: "rootfs", + Readonly: true, + }, + Process: specs.Process{ + Terminal: true, + User: specs.User{}, + Args: []string{ + "sh", + }, + Env: []string{ + "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", + "TERM=xterm", + }, + Cwd: "/", + NoNewPrivileges: true, + Capabilities: &specs.LinuxCapabilities{ + Bounding: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Permitted: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Inheritable: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Ambient: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + Effective: []string{ + "CAP_AUDIT_WRITE", + "CAP_KILL", + "CAP_NET_BIND_SERVICE", + }, + }, + Rlimits: []specs.LinuxRlimit{ + { + Type: "RLIMIT_NOFILE", + Hard: uint64(1024), + Soft: uint64(1024), + }, + }, + }, + Hostname: "runc", + Mounts: []specs.Mount{ + { + Destination: "/proc", + Type: "proc", + Source: "proc", + Options: nil, + }, + { + Destination: "/dev", + Type: "tmpfs", + Source: "tmpfs", + Options: []string{"nosuid", "strictatime", "mode=755", "size=65536k"}, + }, + { + Destination: "/dev/pts", + Type: "devpts", + Source: "devpts", + Options: []string{"nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620", "gid=5"}, + }, + { + Destination: "/dev/shm", + Type: "tmpfs", + Source: "shm", + Options: []string{"nosuid", "noexec", "nodev", "mode=1777", "size=65536k"}, + }, + { + Destination: "/dev/mqueue", + Type: "mqueue", + Source: "mqueue", + Options: []string{"nosuid", "noexec", "nodev"}, + }, + { + Destination: "/sys", + Type: "sysfs", + Source: "sysfs", + Options: []string{"nosuid", "noexec", "nodev", "ro"}, + }, + { + Destination: "/sys/fs/cgroup", + Type: "cgroup", + Source: "cgroup", + Options: 
[]string{"nosuid", "noexec", "nodev", "relatime", "ro"}, + }, + }, + Linux: &specs.Linux{ + MaskedPaths: []string{ + "/proc/kcore", + "/proc/latency_stats", + "/proc/timer_list", + "/proc/timer_stats", + "/proc/sched_debug", + "/sys/firmware", + }, + ReadonlyPaths: []string{ + "/proc/asound", + "/proc/bus", + "/proc/fs", + "/proc/irq", + "/proc/sys", + "/proc/sysrq-trigger", + }, + Resources: &specs.LinuxResources{ + Devices: []specs.LinuxDeviceCgroup{ + { + Allow: false, + Access: "rwm", + }, + }, + }, + Namespaces: []specs.LinuxNamespace{ + { + Type: "pid", + }, + { + Type: "network", + }, + { + Type: "ipc", + }, + { + Type: "uts", + }, + { + Type: "mount", + }, + }, + }, + } +} + +// ToRootless modifies the given spec in-place so that it works with rootless +// containers: the network namespace is dropped, a user namespace mapping the +// current user to root is added, and mounts and cgroup resources are adjusted. +func ToRootless(spec *specs.Spec) { + var namespaces []specs.LinuxNamespace + + // Remove networkns from the spec. + for _, ns := range spec.Linux.Namespaces { + switch ns.Type { + case specs.NetworkNamespace, specs.UserNamespace: + // Do nothing. + default: + namespaces = append(namespaces, ns) + } + } + // Add userns to the spec. + namespaces = append(namespaces, specs.LinuxNamespace{ + Type: specs.UserNamespace, + }) + spec.Linux.Namespaces = namespaces + + // Add mappings for the current user. + spec.Linux.UIDMappings = []specs.LinuxIDMapping{{ + HostID: uint32(os.Geteuid()), + ContainerID: 0, + Size: 1, + }} + spec.Linux.GIDMappings = []specs.LinuxIDMapping{{ + HostID: uint32(os.Getegid()), + ContainerID: 0, + Size: 1, + }} + + // Fix up mounts. + var mounts []specs.Mount + for _, mount := range spec.Mounts { + // Ignore all mounts that are under /sys. + if strings.HasPrefix(mount.Destination, "/sys") { + continue + } + + // Remove all gid= and uid= mappings.
+ var options []string + for _, option := range mount.Options { + if !strings.HasPrefix(option, "gid=") && !strings.HasPrefix(option, "uid=") { + options = append(options, option) + } + } + + mount.Options = options + mounts = append(mounts, mount) + } + // Add the sysfs mount as an rbind. + mounts = append(mounts, specs.Mount{ + Source: "/sys", + Destination: "/sys", + Type: "none", + Options: []string{"rbind", "nosuid", "noexec", "nodev", "ro"}, + }) + spec.Mounts = mounts + + // Remove cgroup settings. + spec.Linux.Resources = nil +} diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index 52b3ca112d1..1575ae03793 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -145,6 +145,7 @@ type CreateOpts struct { NoPivotRoot bool NoNewKeyring bool Spec *specs.Spec + Rootless bool } // CreateLibcontainerConfig creates a new libcontainer configuration from a @@ -175,6 +176,7 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { Hostname: spec.Hostname, Labels: append(labels, fmt.Sprintf("bundle=%s", cwd)), NoNewKeyring: opts.NoNewKeyring, + Rootless: opts.Rootless, } exists := false @@ -208,7 +210,7 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { if err := setupUserNamespace(spec, config); err != nil { return nil, err } - c, err := createCgroupConfig(opts.CgroupName, opts.UseSystemdCgroup, spec) + c, err := createCgroupConfig(opts) if err != nil { return nil, err } @@ -264,8 +266,14 @@ func createLibcontainerMount(cwd string, m specs.Mount) *configs.Mount { } } -func createCgroupConfig(name string, useSystemdCgroup bool, spec *specs.Spec) (*configs.Cgroup, error) { - var myCgroupPath string +func createCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) { + var ( + myCgroupPath string + + spec = opts.Spec + useSystemdCgroup = opts.UseSystemdCgroup + name = opts.CgroupName + ) c := &configs.Cgroup{ Resources: &configs.Resources{}, @@ 
-301,9 +309,14 @@ func createCgroupConfig(name string, useSystemdCgroup bool, spec *specs.Spec) (* c.Path = myCgroupPath } - c.Resources.AllowedDevices = allowedDevices - if spec.Linux == nil { - return c, nil + // In rootless containers, any attempt to make cgroup changes will fail. + // libcontainer will validate this and we shouldn't add any cgroup options + // the user didn't specify. + if !opts.Rootless { + c.Resources.AllowedDevices = allowedDevices + } + if spec.Linux == nil { + return c, nil } r := spec.Linux.Resources if r == nil { @@ -340,8 +353,10 @@ func createCgroupConfig(name string, useSystemdCgroup bool, spec *specs.Spec) (* } c.Resources.Devices = append(c.Resources.Devices, dd) } - // append the default allowed devices to the end of the list - c.Resources.Devices = append(c.Resources.Devices, allowedDevices...) + if !opts.Rootless { + // append the default allowed devices to the end of the list + c.Resources.Devices = append(c.Resources.Devices, allowedDevices...) + } if r.Memory != nil { if r.Memory.Limit != nil { c.Resources.Memory = *r.Memory.Limit @@ -595,11 +610,11 @@ func setupUserNamespace(spec *specs.Spec, config *configs.Config) error { for _, m := range spec.Linux.GIDMappings { config.GidMappings = append(config.GidMappings, create(m)) } - rootUID, err := config.HostUID() + rootUID, err := config.HostRootUID() if err != nil { return err } - rootGID, err := config.HostGID() + rootGID, err := config.HostRootGID() if err != nil { return err } diff --git a/libcontainer/specconv/spec_linux_test.go b/libcontainer/specconv/spec_linux_test.go index baa2638adac..f7292f36829 100644 --- a/libcontainer/specconv/spec_linux_test.go +++ b/libcontainer/specconv/spec_linux_test.go @@ -5,6 +5,7 @@ package specconv import ( "testing" + "github.com/opencontainers/runc/libcontainer/configs/validate" "github.com/opencontainers/runtime-spec/specs-go" ) @@ -16,7 +17,13 @@ func TestLinuxCgroupsPathSpecified(t *testing.T) { CgroupsPath: cgroupsPath, } - cgroup,
err := createCgroupConfig("ContainerID", false, spec) + opts := &CreateOpts{ + CgroupName: "ContainerID", + UseSystemdCgroup: false, + Spec: spec, + } + + cgroup, err := createCgroupConfig(opts) if err != nil { t.Errorf("Couldn't create Cgroup config: %v", err) } @@ -28,8 +35,13 @@ func TestLinuxCgroupsPathSpecified(t *testing.T) { func TestLinuxCgroupsPathNotSpecified(t *testing.T) { spec := &specs.Spec{} + opts := &CreateOpts{ + CgroupName: "ContainerID", + UseSystemdCgroup: false, + Spec: spec, + } - cgroup, err := createCgroupConfig("ContainerID", false, spec) + cgroup, err := createCgroupConfig(opts) if err != nil { t.Errorf("Couldn't create Cgroup config: %v", err) } @@ -39,6 +51,27 @@ func TestLinuxCgroupsPathNotSpecified(t *testing.T) { } } +func TestSpecconvExampleValidate(t *testing.T) { + spec := Example() + spec.Root.Path = "/" + + opts := &CreateOpts{ + CgroupName: "ContainerID", + UseSystemdCgroup: false, + Spec: spec, + } + + config, err := CreateLibcontainerConfig(opts) + if err != nil { + t.Errorf("Couldn't create libcontainer config: %v", err) + } + + validator := validate.New() + if err := validator.Validate(config); err != nil { + t.Errorf("Expected specconv to produce valid container config: %v", err) + } +} + func TestDupNamespaces(t *testing.T) { spec := &specs.Spec{ Linux: &specs.Linux{ @@ -62,3 +95,26 @@ func TestDupNamespaces(t *testing.T) { t.Errorf("Duplicated namespaces should be forbidden") } } + +func TestRootlessSpecconvValidate(t *testing.T) { + spec := Example() + spec.Root.Path = "/" + ToRootless(spec) + + opts := &CreateOpts{ + CgroupName: "ContainerID", + UseSystemdCgroup: false, + Spec: spec, + Rootless: true, + } + + config, err := CreateLibcontainerConfig(opts) + if err != nil { + t.Errorf("Couldn't create libcontainer config: %v", err) + } + + validator := validate.New() + if err := validator.Validate(config); err != nil { + t.Errorf("Expected specconv to produce valid rootless container config: %v", err) + } +} diff --git 
a/list.go b/list.go index c7550a2a853..1c3b9aa8352 100644 --- a/list.go +++ b/list.go @@ -7,12 +7,14 @@ import ( "io/ioutil" "os" "path/filepath" + "syscall" "text/tabwriter" "time" "encoding/json" "github.com/opencontainers/runc/libcontainer" + "github.com/opencontainers/runc/libcontainer/user" "github.com/opencontainers/runc/libcontainer/utils" "github.com/urfave/cli" ) @@ -38,6 +40,8 @@ type containerState struct { Created time.Time `json:"created"` // Annotations is the user defined annotations added to the config. Annotations map[string]string `json:"annotations,omitempty"` + // The owner of the state directory (the owner of the container). + Owner string `json:"owner"` } var listCommand = cli.Command{ @@ -85,14 +89,15 @@ To list containers created using a non-default value for "--root": switch context.String("format") { case "table": w := tabwriter.NewWriter(os.Stdout, 12, 1, 3, ' ', 0) - fmt.Fprint(w, "ID\tPID\tSTATUS\tBUNDLE\tCREATED\n") + fmt.Fprint(w, "ID\tPID\tSTATUS\tBUNDLE\tCREATED\tOWNER\n") for _, item := range s { - fmt.Fprintf(w, "%s\t%d\t%s\t%s\t%s\n", + fmt.Fprintf(w, "%s\t%d\t%s\t%s\t%s\t%s\n", item.ID, item.InitProcessPid, item.Status, item.Bundle, - item.Created.Format(time.RFC3339Nano)) + item.Created.Format(time.RFC3339Nano), + item.Owner) } if err := w.Flush(); err != nil { return err @@ -126,6 +131,13 @@ func getContainers(context *cli.Context) ([]containerState, error) { var s []containerState for _, item := range list { if item.IsDir() { + // This cast is safe on Linux. 
+ stat := item.Sys().(*syscall.Stat_t) + owner, err := user.LookupUid(int(stat.Uid)) + if err != nil { + owner.Name = fmt.Sprintf("#%d", stat.Uid) // fall back to the numeric uid; string(uid) would yield a rune + } + container, err := factory.Load(item.Name()) if err != nil { fmt.Fprintf(os.Stderr, "load container %s: %v\n", item.Name(), err) @@ -155,6 +167,7 @@ func getContainers(context *cli.Context) ([]containerState, error) { Rootfs: state.BaseState.Config.Rootfs, Created: state.BaseState.Created, Annotations: annotations, + Owner: owner.Name, }) } } diff --git a/ps.go b/ps.go index b8a1b111b4c..6e0c7376a1b 100644 --- a/ps.go +++ b/ps.go @@ -28,6 +28,11 @@ var psCommand = cli.Command{ if err := checkArgs(context, 1, minArgs); err != nil { return err } + // XXX: Currently not supported with rootless containers. + if isRootless() { + return fmt.Errorf("runc ps requires root") + } + container, err := getContainer(context) if err != nil { return err diff --git a/restore.go b/restore.go index afc604653f7..06f635f130d 100644 --- a/restore.go +++ b/restore.go @@ -3,6 +3,7 @@ package main import ( + "fmt" "os" "syscall" @@ -86,6 +87,11 @@ using the runc checkpoint command.`, if err := checkArgs(context, 1, exactArgs); err != nil { return err } + // XXX: Currently this is untested with rootless containers.
+ if isRootless() { + return fmt.Errorf("runc restore requires root") + } + imagePath := context.String("image-path") id := context.Args().First() if id == "" { diff --git a/spec.go b/spec.go index 1b55c6b4c2b..9024ad4cd7f 100644 --- a/spec.go +++ b/spec.go @@ -10,6 +10,7 @@ import ( "runtime" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/specconv" "github.com/opencontainers/runtime-spec/specs-go" "github.com/urfave/cli" ) @@ -63,156 +64,20 @@ container on your host.`, Value: "", Usage: "path to the root of the bundle directory", }, + cli.BoolFlag{ + Name: "rootless", + Usage: "generate a configuration for a rootless container", + }, }, Action: func(context *cli.Context) error { if err := checkArgs(context, 0, exactArgs); err != nil { return err } - spec := specs.Spec{ - Version: specs.Version, - Platform: specs.Platform{ - OS: runtime.GOOS, - Arch: runtime.GOARCH, - }, - Root: specs.Root{ - Path: "rootfs", - Readonly: true, - }, - Process: specs.Process{ - Terminal: true, - User: specs.User{}, - Args: []string{ - "sh", - }, - Env: []string{ - "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", - "TERM=xterm", - }, - Cwd: "/", - NoNewPrivileges: true, - Capabilities: &specs.LinuxCapabilities{ - Bounding: []string{ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE", - }, - Permitted: []string{ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE", - }, - Inheritable: []string{ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE", - }, - Ambient: []string{ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE", - }, - Effective: []string{ - "CAP_AUDIT_WRITE", - "CAP_KILL", - "CAP_NET_BIND_SERVICE", - }, - }, - Rlimits: []specs.LinuxRlimit{ - { - Type: "RLIMIT_NOFILE", - Hard: uint64(1024), - Soft: uint64(1024), - }, - }, - }, - Hostname: "runc", - Mounts: []specs.Mount{ - { - Destination: "/proc", - Type: "proc", - Source: "proc", - Options: nil, - }, - { - Destination: 
"/dev", - Type: "tmpfs", - Source: "tmpfs", - Options: []string{"nosuid", "strictatime", "mode=755", "size=65536k"}, - }, - { - Destination: "/dev/pts", - Type: "devpts", - Source: "devpts", - Options: []string{"nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620", "gid=5"}, - }, - { - Destination: "/dev/shm", - Type: "tmpfs", - Source: "shm", - Options: []string{"nosuid", "noexec", "nodev", "mode=1777", "size=65536k"}, - }, - { - Destination: "/dev/mqueue", - Type: "mqueue", - Source: "mqueue", - Options: []string{"nosuid", "noexec", "nodev"}, - }, - { - Destination: "/sys", - Type: "sysfs", - Source: "sysfs", - Options: []string{"nosuid", "noexec", "nodev", "ro"}, - }, - { - Destination: "/sys/fs/cgroup", - Type: "cgroup", - Source: "cgroup", - Options: []string{"nosuid", "noexec", "nodev", "relatime", "ro"}, - }, - }, - Linux: &specs.Linux{ - MaskedPaths: []string{ - "/proc/kcore", - "/proc/latency_stats", - "/proc/timer_list", - "/proc/timer_stats", - "/proc/sched_debug", - "/sys/firmware", - }, - ReadonlyPaths: []string{ - "/proc/asound", - "/proc/bus", - "/proc/fs", - "/proc/irq", - "/proc/sys", - "/proc/sysrq-trigger", - }, - Resources: &specs.LinuxResources{ - Devices: []specs.LinuxDeviceCgroup{ - { - Allow: false, - Access: "rwm", - }, - }, - }, - Namespaces: []specs.LinuxNamespace{ - { - Type: "pid", - }, - { - Type: "network", - }, - { - Type: "ipc", - }, - { - Type: "uts", - }, - { - Type: "mount", - }, - }, - }, + spec := specconv.Example() + + rootless := context.Bool("rootless") + if rootless { + specconv.ToRootless(spec) } checkNoFile := func(name string) error { @@ -234,7 +99,7 @@ container on your host.`, if err := checkNoFile(specConfig); err != nil { return err } - data, err := json.MarshalIndent(&spec, "", "\t") + data, err := json.MarshalIndent(spec, "", "\t") if err != nil { return err } diff --git a/tests/integration/cgroups.bats b/tests/integration/cgroups.bats index 9ab6f432beb..90095a7ea87 100644 --- 
a/tests/integration/cgroups.bats +++ b/tests/integration/cgroups.bats @@ -28,7 +28,9 @@ function check_cgroup_value() { } @test "runc update --kernel-memory (initialized)" { - requires cgroups_kmem + # XXX: currently cgroups require root containers. + requires cgroups_kmem root + # Add cgroup path sed -i 's/\("linux": {\)/\1\n "cgroupsPath": "\/runc-cgroups-integration-test",/' ${BUSYBOX_BUNDLE}/config.json @@ -56,7 +58,9 @@ EOF } @test "runc update --kernel-memory (uninitialized)" { - requires cgroups_kmem + # XXX: currently cgroups require root containers. + requires cgroups_kmem root + # Add cgroup path sed -i 's/\("linux": {\)/\1\n "cgroupsPath": "\/runc-cgroups-integration-test",/' ${BUSYBOX_BUNDLE}/config.json diff --git a/tests/integration/checkpoint.bats b/tests/integration/checkpoint.bats index 34d1b0363f7..cd969a80695 100644 --- a/tests/integration/checkpoint.bats +++ b/tests/integration/checkpoint.bats @@ -12,7 +12,8 @@ function teardown() { } @test "checkpoint and restore" { - requires criu + # XXX: currently criu requires root containers. + requires criu root # criu does not work with external terminals so.. # setting terminal and root:readonly: to false @@ -58,8 +59,9 @@ function teardown() { [[ "${output}" == *"running"* ]] } -@test "checkpoint(pre-dump) and restore" { - requires criu +@test "checkpoint --pre-dump and restore" { + # XXX: currently criu requires root containers. + requires criu root # criu does not work with external terminals so..
# setting terminal and root:readonly: to false diff --git a/tests/integration/delete.bats b/tests/integration/delete.bats index cdadd7dcce5..2c11e79b691 100644 --- a/tests/integration/delete.bats +++ b/tests/integration/delete.bats @@ -22,11 +22,13 @@ function teardown() { testcontainer test_busybox running runc kill test_busybox KILL + [ "$status" -eq 0 ] # wait for busybox to be in the destroyed state retry 10 1 eval "__runc state test_busybox | grep -q 'stopped'" # delete test_busybox runc delete test_busybox + [ "$status" -eq 0 ] runc state test_busybox [ "$status" -ne 0 ] diff --git a/tests/integration/events.bats b/tests/integration/events.bats index 182b721b8af..23500733b45 100644 --- a/tests/integration/events.bats +++ b/tests/integration/events.bats @@ -12,6 +12,9 @@ function teardown() { } @test "events --stats" { + # XXX: currently cgroups require root containers. + requires root + # run busybox detached runc run -d --console-socket $CONSOLE_SOCKET test_busybox [ "$status" -eq 0 ] @@ -27,6 +30,9 @@ function teardown() { } @test "events --interval default " { + # XXX: currently cgroups require root containers. + requires root + # run busybox detached runc run -d --console-socket $CONSOLE_SOCKET test_busybox [ "$status" -eq 0 ] @@ -54,6 +60,9 @@ function teardown() { } @test "events --interval 1s " { + # XXX: currently cgroups require root containers. + requires root + # run busybox detached runc run -d --console-socket $CONSOLE_SOCKET test_busybox [ "$status" -eq 0 ] @@ -80,6 +89,9 @@ function teardown() { } @test "events --interval 100ms " { + # XXX: currently cgroups require root containers. 
+ requires root + # run busybox detached runc run -d --console-socket $CONSOLE_SOCKET test_busybox [ "$status" -eq 0 ] diff --git a/tests/integration/exec.bats b/tests/integration/exec.bats index ba60ea17183..f172f9bd88f 100644 --- a/tests/integration/exec.bats +++ b/tests/integration/exec.bats @@ -112,6 +112,9 @@ function teardown() { } @test "runc exec --user" { + # --user can't work in rootless containers + requires root + # run busybox detached runc run -d --console-socket $CONSOLE_SOCKET test_busybox [ "$status" -eq 0 ] diff --git a/tests/integration/help.bats b/tests/integration/help.bats index ca404f342af..163de2d35cf 100644 --- a/tests/integration/help.bats +++ b/tests/integration/help.bats @@ -57,6 +57,7 @@ load helpers [ "$status" -eq 0 ] [[ ${lines[1]} =~ runc\ resume+ ]] + # We don't use runc_spec here, because we're just testing the help page. runc spec -h [ "$status" -eq 0 ] [[ ${lines[1]} =~ runc\ spec+ ]] diff --git a/tests/integration/helpers.bash b/tests/integration/helpers.bash index e4c2cb93611..fc8c2904223 100644 --- a/tests/integration/helpers.bash +++ b/tests/integration/helpers.bash @@ -4,7 +4,7 @@ INTEGRATION_ROOT=$(dirname "$(readlink -f "$BASH_SOURCE")") RUNC="${INTEGRATION_ROOT}/../../runc" RECVTTY="${INTEGRATION_ROOT}/../../contrib/cmd/recvtty/recvtty" -GOPATH="${INTEGRATION_ROOT}/../../../.." +GOPATH="$(mktemp -d --tmpdir runc-integration-gopath.XXXXXX)" # Test data path. TESTDATA="${INTEGRATION_ROOT}/testdata" @@ -27,7 +27,7 @@ KERNEL_MINOR="${KERNEL_VERSION#$KERNEL_MAJOR.}" KERNEL_MINOR="${KERNEL_MINOR%%.*}" # Root state path. -ROOT="$BATS_TMPDIR/runc" +ROOT=$(mktemp -d "$BATS_TMPDIR/runc.XXXXXX") # Path to console socket. CONSOLE_SOCKET="$BATS_TMPDIR/console.sock" @@ -40,6 +40,9 @@ CGROUP_CPU_BASE_PATH=$(grep "cgroup" /proc/self/mountinfo | gawk 'toupper($NF) ~ KMEM="${CGROUP_MEMORY_BASE_PATH}/memory.kmem.limit_in_bytes" RT_PERIOD="${CGROUP_CPU_BASE_PATH}/cpu.rt_period_us" +# Check if we're in rootless mode. 
+ROOTLESS=$(id -u) + # Wrapper for runc. function runc() { run __runc "$@" @@ -55,6 +58,17 @@ function __runc() { "$RUNC" --root "$ROOT" "$@" } +# Wrapper for runc spec. +function runc_spec() { + local args="" + + if [ "$ROOTLESS" -ne 0 ]; then + args+="--rootless" + fi + + runc spec $args "$@" +} + # Fails the current test, providing the error given. function fail() { echo "$@" >&2 @@ -68,7 +82,12 @@ function requires() { case $var in criu) if [ ! -e "$CRIU" ]; then - skip "Test requires ${var}." + skip "test requires ${var}" + fi + ;; + root) + if [ "$ROOTLESS" -ne 0 ]; then + skip "test requires ${var}" fi ;; cgroups_kmem) @@ -179,18 +198,18 @@ function setup_busybox() { if [ ! -e $BUSYBOX_IMAGE ]; then curl -o $BUSYBOX_IMAGE -sSL 'https://github.com/docker-library/busybox/raw/a0558a9006ce0dd6f6ec5d56cfd3f32ebeeb815f/glibc/busybox.tar.xz' fi - tar -C "$BUSYBOX_BUNDLE"/rootfs -xf "$BUSYBOX_IMAGE" + tar --exclude './dev/*' -C "$BUSYBOX_BUNDLE"/rootfs -xf "$BUSYBOX_IMAGE" cd "$BUSYBOX_BUNDLE" - runc spec + runc_spec } function setup_hello() { setup_recvtty run mkdir "$HELLO_BUNDLE" run mkdir "$HELLO_BUNDLE"/rootfs - tar -C "$HELLO_BUNDLE"/rootfs -xf "$HELLO_IMAGE" + tar --exclude './dev/*' -C "$HELLO_BUNDLE"/rootfs -xf "$HELLO_IMAGE" cd "$HELLO_BUNDLE" - runc spec + runc_spec sed -i 's;"sh";"/hello";' config.json } diff --git a/tests/integration/kill.bats b/tests/integration/kill.bats index a049de65708..74246fadbad 100644 --- a/tests/integration/kill.bats +++ b/tests/integration/kill.bats @@ -13,7 +13,6 @@ function teardown() { @test "kill detached busybox" { - # run busybox detached runc run -d --console-socket $CONSOLE_SOCKET test_busybox [ "$status" -eq 0 ] diff --git a/tests/integration/pause.bats b/tests/integration/pause.bats index 2f46a6cae07..30d98b57dbb 100644 --- a/tests/integration/pause.bats +++ b/tests/integration/pause.bats @@ -12,6 +12,9 @@ function teardown() { } @test "runc pause and resume" { + # XXX: currently cgroups require root containers. 
+ requires root + # run busybox detached runc run -d --console-socket $CONSOLE_SOCKET test_busybox [ "$status" -eq 0 ] @@ -34,6 +37,9 @@ function teardown() { } @test "runc pause and resume with nonexist container" { + # XXX: currently cgroups require root containers. + requires root + # run test_busybox detached runc run -d --console-socket $CONSOLE_SOCKET test_busybox [ "$status" -eq 0 ] diff --git a/tests/integration/ps.bats b/tests/integration/ps.bats index 7a200150daa..c000af66304 100644 --- a/tests/integration/ps.bats +++ b/tests/integration/ps.bats @@ -12,6 +12,9 @@ function teardown() { } @test "ps" { + # ps is not supported, it requires cgroups + requires root + # start busybox detached runc run -d --console-socket $CONSOLE_SOCKET test_busybox [ "$status" -eq 0 ] @@ -24,10 +27,13 @@ function teardown() { runc ps test_busybox [ "$status" -eq 0 ] [[ ${lines[0]} =~ UID\ +PID\ +PPID\ +C\ +STIME\ +TTY\ +TIME\ +CMD+ ]] - [[ "${lines[1]}" == *"root"*[0-9]* ]] + [[ "${lines[1]}" == *"$(id -un 2>/dev/null)"*[0-9]* ]] } @test "ps -f json" { + # ps is not supported, it requires cgroups + requires root + # start busybox detached runc run -d --console-socket $CONSOLE_SOCKET test_busybox [ "$status" -eq 0 ] @@ -43,6 +49,9 @@ function teardown() { } @test "ps -e -x" { + # ps is not supported, it requires cgroups + requires root + # start busybox detached runc run -d --console-socket $CONSOLE_SOCKET test_busybox [ "$status" -eq 0 ] diff --git a/tests/integration/spec.bats b/tests/integration/spec.bats index 79bb6907651..e9f28fbfdd2 100644 --- a/tests/integration/spec.bats +++ b/tests/integration/spec.bats @@ -26,7 +26,7 @@ function teardown() { [ ! -e config.json ] # test generation of spec does not return an error - runc spec + runc_spec [ "$status" -eq 0 ] # test generation of spec created our config.json (spec) @@ -51,7 +51,7 @@ function teardown() { [ ! 
-e "$HELLO_BUNDLE"/config.json ] # test generation of spec does not return an error - runc spec --bundle "$HELLO_BUNDLE" + runc_spec --bundle "$HELLO_BUNDLE" [ "$status" -eq 0 ] # test generation of spec created our config.json (spec) diff --git a/tests/integration/start_detached.bats b/tests/integration/start_detached.bats index 605fde225a7..08036ddd9e2 100644 --- a/tests/integration/start_detached.bats +++ b/tests/integration/start_detached.bats @@ -23,6 +23,9 @@ function teardown() { } @test "runc run detached ({u,g}id != 0)" { + # cannot start containers as another user in rootless setup + requires root + # replace "uid": 0 with "uid": 1000 # and do a similar thing for gid. sed -i 's;"uid": 0;"uid": 1000;g' config.json diff --git a/tests/integration/start_hello.bats b/tests/integration/start_hello.bats index 6de65e07e71..2e935728085 100644 --- a/tests/integration/start_hello.bats +++ b/tests/integration/start_hello.bats @@ -21,6 +21,9 @@ function teardown() { } @test "runc run ({u,g}id != 0)" { + # cannot start containers as another user in rootless setup + requires root + # replace "uid": 0 with "uid": 1000 # and do a similar thing for gid. 
sed -i 's;"uid": 0;"uid": 1000;g' config.json diff --git a/tests/integration/state.bats b/tests/integration/state.bats index eed2eb3c4ac..3772c1e5a91 100644 --- a/tests/integration/state.bats +++ b/tests/integration/state.bats @@ -11,7 +11,37 @@ function teardown() { teardown_busybox } -@test "state" { +@test "state (kill + delete)" { + runc state test_busybox + [ "$status" -ne 0 ] + + # run busybox detached + runc run -d --console-socket $CONSOLE_SOCKET test_busybox + [ "$status" -eq 0 ] + + # check state + wait_for_container 15 1 test_busybox + + testcontainer test_busybox running + + runc kill test_busybox KILL + [ "$status" -eq 0 ] + + # wait for busybox to be in the destroyed state + retry 10 1 eval "__runc state test_busybox | grep -q 'stopped'" + + # delete test_busybox + runc delete test_busybox + [ "$status" -eq 0 ] + + runc state test_busybox + [ "$status" -ne 0 ] +} + +@test "state (pause + resume)" { + # XXX: pause and resume require cgroups. + requires root + runc state test_busybox [ "$status" -ne 0 ] @@ -37,14 +67,4 @@ function teardown() { # test state of busybox is back to running testcontainer test_busybox running - - runc kill test_busybox KILL - # wait for busybox to be in the destroyed state - retry 10 1 eval "__runc state test_busybox | grep -q 'stopped'" - - # delete test_busybox - runc delete test_busybox - - runc state test_busybox - [ "$status" -ne 0 ] } diff --git a/tests/integration/tty.bats b/tests/integration/tty.bats index b9a1f108e20..9e817dbf873 100644 --- a/tests/integration/tty.bats +++ b/tests/integration/tty.bats @@ -24,6 +24,10 @@ function teardown() { } @test "runc run [tty owner]" { + # tty chmod is not doable in rootless containers. + # TODO: this can be made as a change to the gid test. + requires root + # Replace sh script with stat. 
sed -i 's/"sh"/"sh", "-c", "stat -c %u:%g $(tty) | tr : \\\\\\\\n"/' config.json @@ -36,6 +40,9 @@ function teardown() { } @test "runc run [tty owner] ({u,g}id != 0)" { + # tty chmod is not doable in rootless containers. + requires root + # replace "uid": 0 with "uid": 1000 # and do a similar thing for gid. sed -i 's;"uid": 0;"uid": 1000;g' config.json @@ -72,6 +79,10 @@ function teardown() { } @test "runc exec [tty owner]" { + # tty chmod is not doable in rootless containers. + # TODO: this can be made as a change to the gid test. + requires root + # run busybox detached runc run -d --console-socket $CONSOLE_SOCKET test_busybox [ "$status" -eq 0 ] @@ -90,6 +101,9 @@ function teardown() { } @test "runc exec [tty owner] ({u,g}id != 0)" { + # tty chmod is not doable in rootless containers. + requires root + # replace "uid": 0 with "uid": 1000 # and do a similar thing for gid. sed -i 's;"uid": 0;"uid": 1000;g' config.json diff --git a/tests/integration/update.bats b/tests/integration/update.bats index 9aaf1b9c35f..4a6bf7fc492 100644 --- a/tests/integration/update.bats +++ b/tests/integration/update.bats @@ -50,7 +50,11 @@ function check_cgroup_value() { # TODO: test rt cgroup updating @test "update" { - requires cgroups_kmem + # XXX: currently cgroups require root containers. + # XXX: Also, this test should be split into separate sections so that we + # can skip kmem without skipping update tests overall. 
+ requires cgroups_kmem root + # run a few busyboxes detached runc run -d --console-socket $CONSOLE_SOCKET test_update [ "$status" -eq 0 ] diff --git a/utils.go b/utils.go index 1286fd6f2d7..98f93a4cfad 100644 --- a/utils.go +++ b/utils.go @@ -63,9 +63,6 @@ func setupSpec(context *cli.Context) (*specs.Spec, error) { if err != nil { return nil, err } - if os.Geteuid() != 0 { - return nil, fmt.Errorf("runc should be run as root") - } return spec, nil } diff --git a/utils_linux.go b/utils_linux.go index dcf156c8c5b..c6a8c028e6a 100644 --- a/utils_linux.go +++ b/utils_linux.go @@ -186,6 +186,11 @@ func createPidFile(path string, process *libcontainer.Process) error { return os.Rename(tmpName, path) } +// XXX: Currently we autodetect rootless mode. +func isRootless() bool { + return os.Geteuid() != 0 +} + func createContainer(context *cli.Context, id string, spec *specs.Spec) (libcontainer.Container, error) { config, err := specconv.CreateLibcontainerConfig(&specconv.CreateOpts{ CgroupName: id, @@ -193,6 +198,7 @@ func createContainer(context *cli.Context, id string, spec *specs.Spec) (libcont NoPivotRoot: context.Bool("no-pivot"), NoNewKeyring: context.Bool("no-new-keyring"), Spec: spec, + Rootless: isRootless(), }) if err != nil { return nil, err @@ -236,12 +242,12 @@ func (r *runner) run(config *specs.Process) (int, error) { for i := baseFd; i < baseFd+r.preserveFDs; i++ { process.ExtraFiles = append(process.ExtraFiles, os.NewFile(uintptr(i), "PreserveFD:"+strconv.Itoa(i))) } - rootuid, err := r.container.Config().HostUID() + rootuid, err := r.container.Config().HostRootUID() if err != nil { r.destroy() return -1, err } - rootgid, err := r.container.Config().HostGID() + rootgid, err := r.container.Config().HostRootGID() if err != nil { r.destroy() return -1, err