Skip to content

Commit 1e57e94

Browse files
committed
Use private cgroup namespaces for cgroup v2
1 parent bc1f782 commit 1e57e94

File tree

1 file changed

+52
-4
lines changed

1 file changed

+52
-4
lines changed

pkg/cluster/cluster.go

Lines changed: 52 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"fmt"
1010
"io"
1111
"os"
12+
"path"
1213
"regexp"
1314
"strconv"
1415
"strings"
@@ -284,10 +285,57 @@ func (c *Cluster) createMachineRunArgs(machine *Machine, name string, i int) []s
284285
"--tmpfs", "/tmp:exec,mode=777",
285286
}
286287
if docker.CgroupVersion() == "2" {
287-
runArgs = append(runArgs, "--cgroupns", "host",
288-
"--cgroup-parent", "bootloose.slice",
289-
"-v", "/sys/fs/cgroup:/sys/fs/cgroup:rw")
290-
288+
runArgs = append(runArgs, "--cgroupns", "private")
289+
290+
if !machine.spec.Privileged {
291+
// Non-privileged containers will have their /sys/fs/cgroup folder
292+
// mounted read-only, even when running in private cgroup
293+
// namespaces. This is a bummer for init systems. Containers could
294+
// probably remount the cgroup fs in read-write mode, but that would
295+
// require CAP_SYS_ADMIN _and_ custom logic in the container's
296+
// entry point. Podman has `--security-opt unmask=/sys/fs/cgroup`,
297+
// but that's not a thing for Docker. The only other way to get a
298+
// writable cgroup fs inside the container is to explicitly mount
299+
// it. Some references:
300+
// - https://github.com/moby/moby/issues/42275
301+
// - https://serverfault.com/a/1054414
302+
303+
// Docker will use cgroups like
304+
// <cgroup-parent>/docker-{{ContainerID}}.scope.
305+
//
306+
// Ideally, we could mount those to /sys/fs/cgroup inside the
307+
// containers. But there's a chicken-and-egg problem, as we only
308+
// know the container ID _after_ the container creation. As a
309+
// duct-tape solution, we mount our own cgroup as the root, which is
310+
// unrelated to the Docker-managed one:
311+
// <cgroup-parent>/cluster-{{ClusterID}}.scope/machine-{{MachineID}}.scope
312+
313+
// FIXME: How to clean this up? Especially when Docker is being run
314+
// on a different machine?
315+
316+
// Just assume that the cgroup fs is mounted at its default
317+
// location. We could try to figure this out via
318+
// /proc/self/mountinfo, but it's really not worth the hassle.
319+
const cgroupMountpoint = "/sys/fs/cgroup"
320+
321+
// Use this as the parent cgroup for everything. Note that if Docker
322+
// uses the systemd cgroup driver, the cgroup name has to end with
323+
// .slice. This is not a requirement for the cgroupfs driver; it
324+
// won't care. Hence, just always use the .slice suffix, no matter
325+
// if it's required or not.
326+
const cgroupParent = "bootloose.slice"
327+
328+
cg := path.Join(
329+
cgroupMountpoint, cgroupParent,
330+
fmt.Sprintf("cluster-%s.scope", c.spec.Cluster.Name),
331+
fmt.Sprintf("machine-%s.scope", name),
332+
)
333+
334+
runArgs = append(runArgs,
335+
"--cgroup-parent", cgroupParent,
336+
"-v", fmt.Sprintf("%s:%s:rw", cg, cgroupMountpoint),
337+
)
338+
}
291339
} else {
292340
runArgs = append(runArgs, "-v", "/sys/fs/cgroup:/sys/fs/cgroup:ro")
293341
}

0 commit comments

Comments
 (0)