@@ -16,6 +16,7 @@ import (
1616 "strconv"
1717 "strings"
1818 "sync"
19+ "syscall"
1920 "time"
2021
2122 "github.com/opencontainers/runtime-spec/specs-go"
@@ -310,18 +311,106 @@ func (p *setnsProcess) addIntoCgroupV2() error {
310311}
311312
312313func (p * setnsProcess ) addIntoCgroup () error {
314+ if p .cmd .SysProcAttr .UseCgroupFD {
315+ // We've used cgroupfd successfully, so the process is
316+ // already in the proper cgroup, nothing to do here.
317+ return nil
318+ }
313319 if cgroups .IsCgroup2UnifiedMode () {
314320 return p .addIntoCgroupV2 ()
315321 }
316322 return p .addIntoCgroupV1 ()
317323}
318324
325+ // prepareCgroupFD sets up p.cmd to use clone3 with CLONE_INTO_CGROUP
326+ // to join cgroup early, in p.cmd.Start. Returns an *os.File which
327+ // must be closed by the caller after p.Cmd.Start return.
328+ func (p * setnsProcess ) prepareCgroupFD () (* os.File , error ) {
329+ if ! cgroups .IsCgroup2UnifiedMode () {
330+ return nil , nil
331+ }
332+
333+ base := p .manager .Path ("" )
334+ if base == "" { // No cgroup to join.
335+ return nil , nil
336+ }
337+ sub := ""
338+ if p .process .SubCgroupPaths != nil {
339+ sub = p .process .SubCgroupPaths ["" ]
340+ }
341+ cgroup := path .Join (base , sub )
342+ if ! strings .HasPrefix (cgroup , base ) {
343+ return nil , fmt .Errorf ("bad sub cgroup path: %s" , sub )
344+ }
345+
346+ fd , err := os .OpenFile (cgroup , unix .O_PATH | unix .O_DIRECTORY | unix .O_CLOEXEC , 0 )
347+ if err != nil {
348+ if p .rootlessCgroups {
349+ return nil , nil
350+ }
351+ return nil , fmt .Errorf ("can't open cgroup: %w" , err )
352+ }
353+
354+ logrus .Debugf ("using CLONE_INTO_CGROUP %q" , cgroup )
355+ if p .cmd .SysProcAttr == nil {
356+ p .cmd .SysProcAttr = & syscall.SysProcAttr {}
357+ }
358+ p .cmd .SysProcAttr .UseCgroupFD = true
359+ p .cmd .SysProcAttr .CgroupFD = int (fd .Fd ())
360+
361+ return fd , nil
362+ }
363+
364+ // shouldRetryWithoutCgroupFD tells if the error returned from p.cmd.Start
365+ // could be caused by using cgroupfd.
366+ func (p * setnsProcess ) shouldRetryWithoutCgroupFD (err error ) bool {
367+ if err == nil || ! p .cmd .SysProcAttr .UseCgroupFD {
368+ return false
369+ }
370+ logrus .Debugf ("exec with CLONE_INTO_CGROUP failed: %v" , err )
371+
372+ switch {
373+ // Cgroup in which a domain controller is enabled.
374+ case errors .Is (err , unix .EBUSY ):
375+ return true
376+ // The cgroup is in the domain invalid state.
377+ case errors .Is (err , unix .EOPNOTSUPP ):
378+ return true
379+ // Rootless with no direct access to cgroup.
380+ case p .rootlessCgroups && errors .Is (err , unix .EACCES ):
381+ return true
382+ // No clone3 syscall (kernels < v5.3).
383+ case errors .Is (err , unix .ENOSYS ):
384+ return true
385+ // No CLONE_INTO_CGROUP flag support (kernels v5.3 to v5.7).
386+ case errors .Is (err , unix .E2BIG ):
387+ return true
388+ }
389+
390+ return false
391+ }
392+
319393func (p * setnsProcess ) start () (retErr error ) {
320394 defer p .comm .closeParent ()
321395
396+ fd , err := p .prepareCgroupFD ()
397+ if err != nil {
398+ return err
399+ }
400+
322401 // Get the "before" value of oom kill count.
323402 oom , _ := p .manager .OOMKillCount ()
324- err := p .startWithCPUAffinity ()
403+
404+ err = p .startWithCPUAffinity ()
405+ if fd != nil {
406+ fd .Close ()
407+ }
408+ if p .shouldRetryWithoutCgroupFD (err ) {
409+ // SysProcAttr.CgroupFD is never used when UseCgroupFD is unset.
410+ p .cmd .SysProcAttr .UseCgroupFD = false
411+ err = p .startWithCPUAffinity ()
412+ }
413+
325414 // Close the child-side of the pipes (controlled by child).
326415 p .comm .closeChild ()
327416 if err != nil {
0 commit comments