|
9 | 9 | "fmt" |
10 | 10 | "io" |
11 | 11 | "os" |
| 12 | + "path" |
12 | 13 | "regexp" |
13 | 14 | "strconv" |
14 | 15 | "strings" |
@@ -284,10 +285,57 @@ func (c *Cluster) createMachineRunArgs(machine *Machine, name string, i int) []s |
284 | 285 | "--tmpfs", "/tmp:exec,mode=777", |
285 | 286 | } |
286 | 287 | if docker.CgroupVersion() == "2" { |
287 | | - runArgs = append(runArgs, "--cgroupns", "host", |
288 | | - "--cgroup-parent", "bootloose.slice", |
289 | | - "-v", "/sys/fs/cgroup:/sys/fs/cgroup:rw") |
290 | | - |
| 288 | + runArgs = append(runArgs, "--cgroupns", "private") |
| 289 | + |
| 290 | + if !machine.spec.Privileged { |
| 291 | + // Non-privileged containers will have their /sys/fs/cgroup folder |
| 292 | + // mounted read-only, even when running in private cgroup |
| 293 | + // namespaces. This is a bummer for init systems. Containers could |
| 294 | + // probably remount the cgroup fs in read-write mode, but that would |
| 295 | + // require CAP_SYS_ADMIN _and_ custom logic in the container's |
| 296 | + // entry point. Podman has `--security-opt unmask=/sys/fs/cgroup`, |
| 297 | + // but that's not a thing for Docker. The only other way to get a |
| 298 | + // writable cgroup fs inside the container is to explicitly mount |
| 299 | + // it. Some references: |
| 300 | + // - https://github.com/moby/moby/issues/42275 |
| 301 | + // - https://serverfault.com/a/1054414 |
| 302 | + |
| 303 | + // Docker will use cgroups like |
| 304 | + // <cgroup-parent>/docker-{{ContainerID}}.scope. |
| 305 | + // |
| 306 | + // Ideally, we could mount those to /sys/fs/cgroup inside the |
| 307 | + // containers. But there's a chicken-and-egg problem, as we only |
| 308 | + // know the container ID _after_ the container creation. As a |
| 309 | + // duct-tape solution, we mount our own cgroup as the root, which is |
| 310 | + // unrelated to the Docker-managed one: |
| 311 | + // <cgroup-parent>/cluster-{{ClusterID}}.scope/machine-{{MachineID}}.scope |
| 312 | + |
| 313 | + // FIXME: How to clean this up? Especially when Docker is being run |
| 314 | + // on a different machine? |
| 315 | + |
| 316 | + // Just assume that the cgroup fs is mounted at its default |
| 317 | + // location. We could try to figure this out via |
| 318 | + // /proc/self/mountinfo, but it's really not worth the hassle. |
| 319 | + const cgroupMountpoint = "/sys/fs/cgroup" |
| 320 | + |
| 321 | + // Use this as the parent cgroup for everything. Note that if Docker |
| 322 | + // uses the systemd cgroup driver, the cgroup name has to end with |
| 323 | + // .slice. This is not a requirement for the cgroupfs driver; it |
| 324 | + // won't care. Hence, just always use the .slice suffix, no matter |
| 325 | + // if it's required or not. |
| 326 | + const cgroupParent = "bootloose.slice" |
| 327 | + |
| 328 | + cg := path.Join( |
| 329 | + cgroupMountpoint, cgroupParent, |
| 330 | + fmt.Sprintf("cluster-%s.scope", c.spec.Cluster.Name), |
| 331 | + fmt.Sprintf("machine-%s.scope", name), |
| 332 | + ) |
| 333 | + |
| 334 | + runArgs = append(runArgs, |
| 335 | + "--cgroup-parent", cgroupParent, |
| 336 | + "-v", fmt.Sprintf("%s:%s:rw", cg, cgroupMountpoint), |
| 337 | + ) |
| 338 | + } |
291 | 339 | } else { |
292 | 340 | runArgs = append(runArgs, "-v", "/sys/fs/cgroup:/sys/fs/cgroup:ro") |
293 | 341 | } |
|
0 commit comments