diff --git a/go.mod b/go.mod index a53e86b369c..e167aad9b2f 100644 --- a/go.mod +++ b/go.mod @@ -15,7 +15,7 @@ require ( github.com/moby/sys/userns v0.1.0 github.com/mrunalp/fileutils v0.5.1 github.com/opencontainers/cgroups v0.0.5 - github.com/opencontainers/runtime-spec v1.2.2-0.20250818071321-383cadbf08c0 + github.com/opencontainers/runtime-spec v1.2.2-0.20251022072015-5caf3047c341 github.com/opencontainers/selinux v1.12.0 github.com/seccomp/libseccomp-golang v0.11.1 github.com/sirupsen/logrus v1.9.3 @@ -32,3 +32,5 @@ require ( github.com/cpuguy83/go-md2man/v2 v2.0.7 // indirect github.com/russross/blackfriday/v2 v2.1.0 // indirect ) + +replace github.com/opencontainers/cgroups v0.0.5 => github.com/cyphar/oci-cgroups v0.0.0-20251025144234-7c34f099ee0b diff --git a/go.sum b/go.sum index 45bc4513954..f419bf9bf0e 100644 --- a/go.sum +++ b/go.sum @@ -11,6 +11,8 @@ github.com/cpuguy83/go-md2man/v2 v2.0.7 h1:zbFlGlXEAKlwXpmvle3d8Oe3YnkKIK4xSRTd3 github.com/cpuguy83/go-md2man/v2 v2.0.7/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= github.com/cyphar/filepath-securejoin v0.4.1 h1:JyxxyPEaktOD+GAnqIqTf9A8tHyAG22rowi7HkoSU1s= github.com/cyphar/filepath-securejoin v0.4.1/go.mod h1:Sdj7gXlvMcPZsbhwhQ33GguGLDGQL7h7bg04C/+u9jI= +github.com/cyphar/oci-cgroups v0.0.0-20251025144234-7c34f099ee0b h1:xzotBueLMeuibA0KuEOJ5X1qTNfpWFwWCDszJoXO+SI= +github.com/cyphar/oci-cgroups v0.0.0-20251025144234-7c34f099ee0b/go.mod h1:oWVzJsKK0gG9SCRBfTpnn16WcGEqDI8PAcpMGbqWxcs= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -44,10 +46,8 @@ github.com/moby/sys/userns v0.1.0 h1:tVLXkFOxVu9A64/yh59slHVv9ahO9UIev4JZusOLG/g github.com/moby/sys/userns v0.1.0/go.mod h1:IHUYgu/kao6N8YZlp9Cf444ySSvCmDlmzUcYfDHOl28= github.com/mrunalp/fileutils v0.5.1 h1:F+S7ZlNKnrwHfSwdlgNSkKo67ReVf8o9fel6C3dkm/Q= github.com/mrunalp/fileutils v0.5.1/go.mod h1:M1WthSahJixYnrXQl/DFQuteStB1weuxD2QJNHXfbSQ= -github.com/opencontainers/cgroups v0.0.5 h1:DRITAqcOnY0uSBzIpt1RYWLjh5DPDiqUs4fY6Y0ktls= -github.com/opencontainers/cgroups v0.0.5/go.mod h1:oWVzJsKK0gG9SCRBfTpnn16WcGEqDI8PAcpMGbqWxcs= -github.com/opencontainers/runtime-spec v1.2.2-0.20250818071321-383cadbf08c0 h1:RLn0YfUWkiqPGtgUANvJrcjIkCHGRl3jcz/c557M28M= -github.com/opencontainers/runtime-spec v1.2.2-0.20250818071321-383cadbf08c0/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= +github.com/opencontainers/runtime-spec v1.2.2-0.20251022072015-5caf3047c341 h1:fQ6LUhSWtHE2SdjVfrgANsFgQZtCNDTjUhussoMtX+8= +github.com/opencontainers/runtime-spec v1.2.2-0.20251022072015-5caf3047c341/go.mod h1:jwyrGlmzljRJv/Fgzds9SsS/C5hL+LL3ko9hs6T5lQ0= github.com/opencontainers/selinux v1.12.0 h1:6n5JV4Cf+4y0KNXW48TLj5DwfXpvWlxXplUkdTrmPb8= github.com/opencontainers/selinux v1.12.0/go.mod h1:BTPX+bjVbWGXw7ZZWUbdENt8w0htPSrlgOOysQaU62U= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go index f5a24777edf..e3362842031 100644 --- a/libcontainer/integration/exec_test.go +++ b/libcontainer/integration/exec_test.go @@ -525,20 +525,22 @@ func TestPidsSystemd(t *testing.T) { testPids(t, true) } +func mkPtr[T any](v T) *T { return &v } + func testPids(t *testing.T, systemd bool) { if testing.Short() { return } config := newTemplateConfig(t, &tParam{systemd: systemd}) - config.Cgroups.Resources.PidsLimit = -1 + config.Cgroups.Resources.PidsLimit = mkPtr[int64](-1) // Running multiple processes, expecting it to succeed with no pids limit. runContainerOk(t, config, "/bin/sh", "-c", "/bin/true | /bin/true | /bin/true | /bin/true") // Enforce a permissive limit. This needs to be fairly hand-wavey due to the // issues with running Go binaries with pids restrictions (see below). - config.Cgroups.Resources.PidsLimit = 64 + config.Cgroups.Resources.PidsLimit = mkPtr[int64](64) runContainerOk(t, config, "/bin/sh", "-c", ` /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | @@ -547,7 +549,7 @@ func testPids(t *testing.T, systemd bool) { // Enforce a restrictive limit. 64 * /bin/true + 1 * shell should cause // this to fail reliably. - config.Cgroups.Resources.PidsLimit = 64 + config.Cgroups.Resources.PidsLimit = mkPtr[int64](64) out, _, err := runContainer(t, config, "/bin/sh", "-c", ` /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | /bin/true | bin/true | /bin/true | diff --git a/man/runc-update.8.md b/man/runc-update.8.md index 0e95d85ded1..af23be35ca8 100644 --- a/man/runc-update.8.md +++ b/man/runc-update.8.md @@ -85,7 +85,8 @@ stdin. If this option is used, all other options are ignored. (i.e. use unlimited swap). **--pids-limit** _num_ -: Set the maximum number of processes allowed in the container. +: Set the maximum number of processes allowed in the container. Use **-1** to +unset the limit. **--l3-cache-schema** _value_ : Set the value for Intel RDT/CAT L3 cache schema. diff --git a/tests/integration/cgroups.bats b/tests/integration/cgroups.bats index a2df63e8002..42d20dec8d5 100644 --- a/tests/integration/cgroups.bats +++ b/tests/integration/cgroups.bats @@ -324,6 +324,36 @@ convert_hugetlb_size() { done } +# https://github.com/opencontainers/runc/issues/4014. +@test "runc run (pids.limit=0 is 0)" { + [ $EUID -ne 0 ] && requires rootless_cgroup + requires cgroups_pids + + set_cgroups_path + update_config '.linux.resources.pids.limit = 0' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_pids + [ "$status" -eq 0 ] + # systemd doesn't support TasksMax=0 so runc will silently remap it to 1. + check_cgroup_value "pids.max" "1" + check_systemd_value "TasksMax" "1" +} + +# https://github.com/opencontainers/runc/issues/4014. +@test "runc run (pids.limit=-1 means unlimited)" { + [ $EUID -ne 0 ] && requires rootless_cgroup + requires cgroups_pids + + set_cgroups_path + update_config '.linux.resources.pids.limit = -1' + + runc run -d --console-socket "$CONSOLE_SOCKET" test_pids + [ "$status" -eq 0 ] + check_cgroup_value "pids.max" "max" + # systemd < v227 shows UINT64_MAX instead of "infinity". + check_systemd_value "TasksMax" "infinity" "18446744073709551615" +} + @test "runc run (cgroup v2 resources.unified only)" { requires root cgroups_v2 @@ -390,6 +420,7 @@ convert_hugetlb_size() { set_cgroups_path # CPU shares of 3333 corresponds to CPU weight of 128. update_config ' .linux.resources.memory |= {"limit": 33554432} + | .linux.resources.pids.limit = 100 | .linux.resources.cpu |= { "shares": 3333, "quota": 40000, diff --git a/tests/integration/update.bats b/tests/integration/update.bats index 29d36fdfa2a..07f3eff1733 100644 --- a/tests/integration/update.bats +++ b/tests/integration/update.bats @@ -327,6 +327,37 @@ EOF check_cpu_shares 100 } +@test "update pids.limit" { + [ $EUID -ne 0 ] && requires rootless_cgroup + requires cgroups_pids + + runc run -d --console-socket "$CONSOLE_SOCKET" test_update + [ "$status" -eq 0 ] + + check_cgroup_value "pids.max" 20 + check_systemd_value "TasksMax" 20 + + runc update test_update --pids-limit 12345 + [ "$status" -eq 0 ] + + check_cgroup_value "pids.max" "12345" + check_systemd_value "TasksMax" "12345" + + runc update test_update --pids-limit -1 + [ "$status" -eq 0 ] + + check_cgroup_value "pids.max" "max" + # systemd < v227 shows UINT64_MAX instead of "infinity". + check_systemd_value "TasksMax" "infinity" "18446744073709551615" + + runc update test_update --pids-limit 0 + [ "$status" -eq 0 ] + + # systemd doesn't support TasksMax=0 so runc will silently remap it to 1. + check_cgroup_value "pids.max" "1" + check_systemd_value "TasksMax" "1" +} + @test "cpu burst" { [ $EUID -ne 0 ] && requires rootless_cgroup requires cgroups_cpu_burst diff --git a/update.go b/update.go index 8510c217cd5..e4abb1ef552 100644 --- a/update.go +++ b/update.go @@ -252,7 +252,9 @@ other options are ignored. } } - r.Pids.Limit = int64(context.Int("pids-limit")) + if context.IsSet("pids-limit") { + r.Pids.Limit = i64Ptr(int64(context.Int("pids-limit"))) + } } // Fix up values diff --git a/vendor/github.com/opencontainers/cgroups/config_linux.go b/vendor/github.com/opencontainers/cgroups/config_linux.go index 9bc58a3789b..3d29d938bf0 100644 --- a/vendor/github.com/opencontainers/cgroups/config_linux.go +++ b/vendor/github.com/opencontainers/cgroups/config_linux.go @@ -90,8 +90,8 @@ type Resources struct { // Cgroup's SCHED_IDLE value. CPUIdle *int64 `json:"cpu_idle,omitempty"` - // Process limit; set <= `0' to disable limit. - PidsLimit int64 `json:"pids_limit,omitempty"` + // Process limit; set < `0' to disable limit. `nil` means "keep current limit". + PidsLimit *int64 `json:"pids_limit,omitempty"` // Specifies per cgroup weight, range is from 10 to 1000. BlkioWeight uint16 `json:"blkio_weight,omitempty"` diff --git a/vendor/github.com/opencontainers/cgroups/fs/pids.go b/vendor/github.com/opencontainers/cgroups/fs/pids.go index 9319761e6ad..36bd339af82 100644 --- a/vendor/github.com/opencontainers/cgroups/fs/pids.go +++ b/vendor/github.com/opencontainers/cgroups/fs/pids.go @@ -19,19 +19,24 @@ func (s *PidsGroup) Apply(path string, _ *cgroups.Resources, pid int) error { } func (s *PidsGroup) Set(path string, r *cgroups.Resources) error { - if r.PidsLimit != 0 { - // "max" is the fallback value. - limit := "max" - - if r.PidsLimit > 0 { - limit = strconv.FormatInt(r.PidsLimit, 10) - } - - if err := cgroups.WriteFile(path, "pids.max", limit); err != nil { - return err - } + if r.PidsLimit == nil { + return nil } + // "max" is the fallback value. + val := "max" + if limit := *r.PidsLimit; limit > 0 { + val = strconv.FormatInt(limit, 10) + } else if limit == 0 { + // systemd doesn't support setting pids.max to "0", so when setting + // TasksMax we need to remap it to "1". We do the same thing here to + // avoid flip-flop behaviour between the fs and systemd drivers. In + // practice, the pids cgroup behaviour is basically identical. + val = "1" + } + if err := cgroups.WriteFile(path, "pids.max", val); err != nil { + return err + } return nil } diff --git a/vendor/github.com/opencontainers/cgroups/fs2/pids.go b/vendor/github.com/opencontainers/cgroups/fs2/pids.go index 9b82b90115e..f932259ad56 100644 --- a/vendor/github.com/opencontainers/cgroups/fs2/pids.go +++ b/vendor/github.com/opencontainers/cgroups/fs2/pids.go @@ -4,6 +4,7 @@ import ( "errors" "math" "os" + "strconv" "strings" "golang.org/x/sys/unix" @@ -13,19 +14,26 @@ import ( ) func isPidsSet(r *cgroups.Resources) bool { - return r.PidsLimit != 0 + return r.PidsLimit != nil } func setPids(dirPath string, r *cgroups.Resources) error { if !isPidsSet(r) { return nil } - if val := numToStr(r.PidsLimit); val != "" { - if err := cgroups.WriteFile(dirPath, "pids.max", val); err != nil { - return err - } + val := "max" + if limit := *r.PidsLimit; limit > 0 { + val = strconv.FormatInt(limit, 10) + } else if limit == 0 { + // systemd doesn't support setting pids.max to "0", so when setting + // TasksMax we need to remap it to "1". We do the same thing here to + // avoid flip-flop behaviour between the fs and systemd drivers. In + // practice, the pids cgroup behaviour is basically identical. + val = "1" + } + if err := cgroups.WriteFile(dirPath, "pids.max", val); err != nil { + return err } - return nil } diff --git a/vendor/github.com/opencontainers/cgroups/systemd/v1.go b/vendor/github.com/opencontainers/cgroups/systemd/v1.go index 5500a53ac0f..96e69bb8608 100644 --- a/vendor/github.com/opencontainers/cgroups/systemd/v1.go +++ b/vendor/github.com/opencontainers/cgroups/systemd/v1.go @@ -2,6 +2,7 @@ package systemd import ( "errors" + "math" "os" "path/filepath" "strings" @@ -97,9 +98,17 @@ func genV1ResourcesProperties(r *cgroups.Resources, cm *dbusConnManager) ([]syst newProp("BlockIOWeight", uint64(r.BlkioWeight))) } - if r.PidsLimit > 0 || r.PidsLimit == -1 { + if r.PidsLimit != nil { + var tasksMax uint64 + if limit := *r.PidsLimit; limit < 0 { + tasksMax = math.MaxUint64 // "infinity" + } else if limit == 0 { + tasksMax = 1 // systemd does not accept "0" for TasksMax + } else { + tasksMax = uint64(limit) + } properties = append(properties, - newProp("TasksMax", uint64(r.PidsLimit))) + newProp("TasksMax", tasksMax)) } err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems) diff --git a/vendor/github.com/opencontainers/cgroups/systemd/v2.go b/vendor/github.com/opencontainers/cgroups/systemd/v2.go index c2f2e87f3ba..f76c93e8444 100644 --- a/vendor/github.com/opencontainers/cgroups/systemd/v2.go +++ b/vendor/github.com/opencontainers/cgroups/systemd/v2.go @@ -176,6 +176,9 @@ func unifiedResToSystemdProps(cm *dbusConnManager, res map[string]string) (props return nil, fmt.Errorf("unified resource %q value conversion error: %w", k, err) } } + if num == 0 { + num = 1 // systemd does not accept "0" for TasksMax + } props = append(props, newProp("TasksMax", num)) @@ -256,9 +259,17 @@ func genV2ResourcesProperties(dirPath string, r *cgroups.Resources, cm *dbusConn addCPUQuota(cm, &properties, &r.CpuQuota, r.CpuPeriod) - if r.PidsLimit > 0 || r.PidsLimit == -1 { + if r.PidsLimit != nil { + var tasksMax uint64 + if limit := *r.PidsLimit; limit < 0 { + tasksMax = math.MaxUint64 // "infinity" + } else if limit == 0 { + tasksMax = 1 // systemd does not accept "0" for TasksMax + } else { + tasksMax = uint64(limit) + } properties = append(properties, - newProp("TasksMax", uint64(r.PidsLimit))) + newProp("TasksMax", tasksMax)) } err = addCpuset(cm, &properties, r.CpusetCpus, r.CpusetMems) diff --git a/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go b/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go index 36d28032e66..bce831f6f90 100644 --- a/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go +++ b/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go @@ -434,7 +434,7 @@ type LinuxCPU struct { // LinuxPids for Linux cgroup 'pids' resource management (Linux 4.3) type LinuxPids struct { // Maximum number of PIDs. Default is "no limit". - Limit int64 `json:"limit"` + Limit *int64 `json:"limit,omitempty"` } // LinuxNetwork identification and priority configuration @@ -688,6 +688,32 @@ type WindowsHyperV struct { UtilityVMPath string `json:"utilityVMPath,omitempty"` } +// IOMems contains information about iomem addresses that should be passed to the VM. +type IOMems struct { + // Guest Frame Number to map the iomem range. If GFN is not specified, the mapping will be done to the same Frame Number as was provided in FirstMFN. + FirstGFN *uint64 `json:"firstGFN,omitempty"` + // Physical page number of iomem regions. + FirstMFN *uint64 `json:"firstMFN"` + // Number of pages to be mapped. + NrMFNs *uint64 `json:"nrMFNs"` +} + +// Hardware configuration for the VM image +type HWConfig struct { + // Path to the container device-tree file that should be passed to the VM configuration. + DeviceTree string `json:"deviceTree,omitempty"` + // Number of virtual cpus for the VM. + VCPUs *uint32 `json:"vcpus,omitempty"` + // Maximum memory in bytes allocated to the VM. + Memory *uint64 `json:"memory,omitempty"` + // Host device tree nodes to passthrough to the VM. + DtDevs []string `json:"dtdevs,omitempty"` + // Allow auto-translated domains to access specific hardware I/O memory pages. + IOMems []IOMems `json:"iomems,omitempty"` + // Allows VM to access specific physical IRQs. + Irqs []uint32 `json:"irqs,omitempty"` +} + // VM contains information for virtual-machine-based containers. type VM struct { // Hypervisor specifies hypervisor-related configuration for virtual-machine-based containers. @@ -696,6 +722,8 @@ type VM struct { Kernel VMKernel `json:"kernel"` // Image specifies guest image related configuration for virtual-machine-based containers. Image VMImage `json:"image,omitempty"` + // Hardware configuration that should be passed to the VM. + HwConfig *HWConfig `json:"hwconfig,omitempty"` } // VMHypervisor contains information about the hypervisor to use for a virtual machine. diff --git a/vendor/modules.txt b/vendor/modules.txt index 3633df7018e..b82fba17212 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -51,7 +51,7 @@ github.com/moby/sys/userns # github.com/mrunalp/fileutils v0.5.1 ## explicit; go 1.13 github.com/mrunalp/fileutils -# github.com/opencontainers/cgroups v0.0.5 +# github.com/opencontainers/cgroups v0.0.5 => github.com/cyphar/oci-cgroups v0.0.0-20251025144234-7c34f099ee0b ## explicit; go 1.23.0 github.com/opencontainers/cgroups github.com/opencontainers/cgroups/devices @@ -62,7 +62,7 @@ github.com/opencontainers/cgroups/fscommon github.com/opencontainers/cgroups/internal/path github.com/opencontainers/cgroups/manager github.com/opencontainers/cgroups/systemd -# github.com/opencontainers/runtime-spec v1.2.2-0.20250818071321-383cadbf08c0 +# github.com/opencontainers/runtime-spec v1.2.2-0.20251022072015-5caf3047c341 ## explicit github.com/opencontainers/runtime-spec/specs-go github.com/opencontainers/runtime-spec/specs-go/features