diff --git a/common/pkg/cgroups/blkio_linux.go b/common/pkg/cgroups/blkio_linux.go index c5f085ecd7..4d85ba4a70 100644 --- a/common/pkg/cgroups/blkio_linux.go +++ b/common/pkg/cgroups/blkio_linux.go @@ -3,6 +3,10 @@ package cgroups import ( + "bufio" + "errors" + "fmt" + "os" "path/filepath" "strconv" "strings" @@ -22,56 +26,122 @@ func getBlkioHandler() *linuxBlkioHandler { // Apply set the specified constraints. func (c *linuxBlkioHandler) Apply(ctr *CgroupControl, res *cgroups.Resources) error { - man, err := fs2.NewManager(ctr.config, filepath.Join(cgroupRoot, ctr.config.Path)) - if err != nil { - return err + if ctr.cgroup2 { + man, err := fs2.NewManager(ctr.config, filepath.Join(cgroupRoot, ctr.config.Path)) + if err != nil { + return err + } + return man.Set(res) + } + path := filepath.Join(cgroupRoot, Blkio, ctr.config.Path) + return c.Blkio.Set(path, res) +} + +// Create the cgroup. +func (c *linuxBlkioHandler) Create(ctr *CgroupControl) (bool, error) { + if ctr.cgroup2 { + return false, nil } - return man.Set(res) + return ctr.createCgroupDirectory(Blkio) +} + +// Destroy the cgroup. +func (c *linuxBlkioHandler) Destroy(ctr *CgroupControl) error { + return rmDirRecursively(ctr.getCgroupv1Path(Blkio)) } // Stat fills a metrics structure with usage stats for the controller. 
func (c *linuxBlkioHandler) Stat(ctr *CgroupControl, m *cgroups.Stats) error { var ioServiceBytesRecursive []cgroups.BlkioStatEntry - // more details on the io.stat file format:X https://facebookmicrosites.github.io/cgroup2/docs/io-controller.html - values, err := readCgroup2MapFile(ctr, "io.stat") - if err != nil { - return err - } - for k, v := range values { - d := strings.Split(k, ":") - if len(d) != 2 { - continue - } - minor, err := strconv.ParseUint(d[0], 10, 0) + if ctr.cgroup2 { + // more details on the io.stat file format:X https://facebookmicrosites.github.io/cgroup2/docs/io-controller.html + values, err := readCgroup2MapFile(ctr, "io.stat") if err != nil { return err } - major, err := strconv.ParseUint(d[1], 10, 0) + for k, v := range values { + d := strings.Split(k, ":") + if len(d) != 2 { + continue + } + minor, err := strconv.ParseUint(d[0], 10, 0) + if err != nil { + return err + } + major, err := strconv.ParseUint(d[1], 10, 0) + if err != nil { + return err + } + + for _, item := range v { + d := strings.Split(item, "=") + if len(d) != 2 { + continue + } + op := d[0] + + // Accommodate the cgroup v1 naming + switch op { + case "rbytes": + op = "read" + case "wbytes": + op = "write" + } + + value, err := strconv.ParseUint(d[1], 10, 0) + if err != nil { + return err + } + + entry := cgroups.BlkioStatEntry{ + Op: op, + Major: major, + Minor: minor, + Value: value, + } + ioServiceBytesRecursive = append(ioServiceBytesRecursive, entry) + } + } + } else { + BlkioRoot := ctr.getCgroupv1Path(Blkio) + + p := filepath.Join(BlkioRoot, "blkio.throttle.io_service_bytes_recursive") + f, err := os.Open(p) if err != nil { - return err + if errors.Is(err, os.ErrNotExist) { + return nil + } + return fmt.Errorf("open %s: %w", p, err) } + defer f.Close() - for _, item := range v { - d := strings.Split(item, "=") + scanner := bufio.NewScanner(f) + for scanner.Scan() { + line := scanner.Text() + parts := strings.Fields(line) + if len(parts) < 3 { + continue + } + d := 
strings.Split(parts[0], ":") if len(d) != 2 { continue } - op := d[0] - - // Accommodate the cgroup v1 naming - switch op { - case "rbytes": - op = "read" - case "wbytes": - op = "write" + minor, err := strconv.ParseUint(d[0], 10, 0) + if err != nil { + return err } - - value, err := strconv.ParseUint(d[1], 10, 0) + major, err := strconv.ParseUint(d[1], 10, 0) if err != nil { return err } + op := parts[1] + + value, err := strconv.ParseUint(parts[2], 10, 0) + if err != nil { + return err + } entry := cgroups.BlkioStatEntry{ Op: op, Major: major, @@ -80,6 +150,9 @@ func (c *linuxBlkioHandler) Stat(ctr *CgroupControl, m *cgroups.Stats) error { } ioServiceBytesRecursive = append(ioServiceBytesRecursive, entry) } + if err := scanner.Err(); err != nil { + return fmt.Errorf("parse %s: %w", p, err) + } } m.BlkioStats.IoServiceBytesRecursive = ioServiceBytesRecursive return nil diff --git a/common/pkg/cgroups/cgroups_linux.go b/common/pkg/cgroups/cgroups_linux.go index 4d7c6c2e84..1c66a8d9cc 100644 --- a/common/pkg/cgroups/cgroups_linux.go +++ b/common/pkg/cgroups/cgroups_linux.go @@ -8,9 +8,11 @@ import ( "context" "errors" "fmt" + "maps" "math" "os" "path/filepath" + "slices" "strconv" "strings" "sync" @@ -20,6 +22,8 @@ import ( systemdDbus "github.com/coreos/go-systemd/v22/dbus" "github.com/godbus/dbus/v5" "github.com/opencontainers/cgroups" + "github.com/opencontainers/cgroups/fs2" + "github.com/sirupsen/logrus" "go.podman.io/storage/pkg/fileutils" "go.podman.io/storage/pkg/unshare" "golang.org/x/sys/unix" @@ -28,7 +32,9 @@ import ( var ( // ErrCgroupDeleted means the cgroup was deleted. ErrCgroupDeleted = errors.New("cgroup deleted") - ErrStatCgroup = errors.New("no cgroup available for gathering user statistics") + // ErrCgroupV1Rootless means the cgroup v1 were attempted to be used in rootless environment. 
+ ErrCgroupV1Rootless = errors.New("no support for CGroups V1 in rootless environments") + ErrStatCgroup = errors.New("no cgroup available for gathering user statistics") isUnifiedOnce sync.Once isUnified bool @@ -37,12 +43,23 @@ var ( // CgroupControl controls a cgroup hierarchy. type CgroupControl struct { + cgroup2 bool config *cgroups.Cgroup systemd bool + // List of additional cgroup subsystems joined that + // do not have a custom handler. + additionalControllers []controller +} + +type controller struct { + name string + symlink bool } type controllerHandler interface { + Create(*CgroupControl) (bool, error) Apply(*CgroupControl, *cgroups.Resources) error + Destroy(*CgroupControl) error Stat(*CgroupControl, *cgroups.Stats) error } @@ -75,33 +92,97 @@ func init() { } // getAvailableControllers get the available controllers. -func getAvailableControllers() ([]string, error) { - controllers := []string{} - controllersFile := filepath.Join(cgroupRoot, "cgroup.controllers") +func getAvailableControllers(exclude map[string]controllerHandler, cgroup2 bool) ([]controller, error) { + if cgroup2 { + controllers := []controller{} + controllersFile := filepath.Join(cgroupRoot, "cgroup.controllers") + + // rootless cgroupv2: check available controllers for current user, systemd or servicescope will inherit + if unshare.IsRootless() { + userSlice, err := getCgroupPathForCurrentProcess() + if err != nil { + return controllers, err + } + // userSlice already contains '/' so not adding here + basePath := cgroupRoot + userSlice + controllersFile = filepath.Join(basePath, "cgroup.controllers") + } + controllersFileBytes, err := os.ReadFile(controllersFile) + if err != nil { + return nil, fmt.Errorf("failed while reading controllers for cgroup v2: %w", err) + } + for controllerName := range strings.FieldsSeq(string(controllersFileBytes)) { + c := controller{ + name: controllerName, + symlink: false, + } + controllers = append(controllers, c) + } + return controllers, nil + } - 
// rootless cgroupv2: check available controllers for current user, systemd or servicescope will inherit + subsystems, _ := cgroupV1GetAllSubsystems() + controllers := []controller{} + // cgroupv1 and rootless: No subsystem is available: delegation is unsafe. if unshare.IsRootless() { - userSlice, err := getCgroupPathForCurrentProcess() + return controllers, nil + } + + for _, name := range subsystems { + if _, found := exclude[name]; found { + continue + } + fileInfo, err := os.Stat(cgroupRoot + "/" + name) if err != nil { - return controllers, err + continue + } + c := controller{ + name: name, + symlink: !fileInfo.IsDir(), } - // userSlice already contains '/' so not adding here - basePath := cgroupRoot + userSlice - controllersFile = filepath.Join(basePath, "cgroup.controllers") + controllers = append(controllers, c) } - controllersFileBytes, err := os.ReadFile(controllersFile) + + return controllers, nil +} + +// AvailableControllers get string:bool map of all the available controllers. +func AvailableControllers(exclude map[string]controllerHandler, cgroup2 bool) ([]string, error) { + availableControllers, err := getAvailableControllers(exclude, cgroup2) if err != nil { - return nil, fmt.Errorf("failed while reading controllers for cgroup v2: %w", err) + return nil, err } - for controllerName := range strings.FieldsSeq(string(controllersFileBytes)) { - controllers = append(controllers, controllerName) + controllerList := []string{} + for _, controller := range availableControllers { + controllerList = append(controllerList, controller.name) } - return controllers, nil + + return controllerList, nil } -// AvailableControllers get string:bool map of all the available controllers. 
-func AvailableControllers(exclude map[string]controllerHandler) ([]string, error) { - return getAvailableControllers() +func cgroupV1GetAllSubsystems() ([]string, error) { + f, err := os.Open("/proc/cgroups") + if err != nil { + return nil, err + } + defer f.Close() + + subsystems := []string{} + + s := bufio.NewScanner(f) + for s.Scan() { + text := s.Text() + if text[0] != '#' { + parts := strings.Fields(text) + if len(parts) >= 4 && parts[3] != "0" { + subsystems = append(subsystems, parts[0]) + } + } + } + if err := s.Err(); err != nil { + return nil, err + } + return subsystems, nil } func getCgroupPathForCurrentProcess() (string, error) { @@ -127,11 +208,51 @@ func getCgroupPathForCurrentProcess() (string, error) { return cgroupPath, nil } +// getCgroupv1Path is a helper function to get the cgroup v1 path. +func (c *CgroupControl) getCgroupv1Path(name string) string { + return filepath.Join(cgroupRoot, name, c.config.Path) +} + // initialize initializes the specified hierarchy. func (c *CgroupControl) initialize() (err error) { - if err := createCgroupv2Path(filepath.Join(cgroupRoot, c.config.Path)); err != nil { - return fmt.Errorf("creating cgroup path %s: %w", c.config.Path, err) + createdSoFar := map[string]controllerHandler{} + defer func() { + if err != nil { + for name, ctr := range createdSoFar { + if err := ctr.Destroy(c); err != nil { + logrus.Warningf("error cleaning up controller %s for %s", name, c.config.Path) + } + } + } + }() + if c.cgroup2 { + if err := createCgroupv2Path(filepath.Join(cgroupRoot, c.config.Path)); err != nil { + return fmt.Errorf("creating cgroup path %s: %w", c.config.Path, err) + } + } + for name, handler := range handlers { + created, err := handler.Create(c) + if err != nil { + return err + } + if created { + createdSoFar[name] = handler + } } + + if !c.cgroup2 { + // We won't need to do this for cgroup v2 + for _, ctr := range c.additionalControllers { + if ctr.symlink { + continue + } + path := 
c.getCgroupv1Path(ctr.name) + if err := os.MkdirAll(path, 0o755); err != nil { + return fmt.Errorf("creating cgroup path for %s: %w", ctr.name, err) + } + } + } + return nil } @@ -176,17 +297,26 @@ func readFileByKeyAsUint64(path, key string) (uint64, error) { // New creates a new cgroup control. func New(path string, resources *cgroups.Resources) (*CgroupControl, error) { - _, err := IsCgroup2UnifiedMode() + cgroup2, err := IsCgroup2UnifiedMode() if err != nil { return nil, err } control := &CgroupControl{ + cgroup2: cgroup2, config: &cgroups.Cgroup{ Path: path, Resources: resources, }, } + if !cgroup2 { + controllers, err := getAvailableControllers(handlers, false) + if err != nil { + return nil, err + } + control.additionalControllers = controllers + } + if err := control.initialize(); err != nil { return nil, err } @@ -196,11 +326,12 @@ func New(path string, resources *cgroups.Resources) (*CgroupControl, error) { // NewSystemd creates a new cgroup control. func NewSystemd(path string, resources *cgroups.Resources) (*CgroupControl, error) { - _, err := IsCgroup2UnifiedMode() + cgroup2, err := IsCgroup2UnifiedMode() if err != nil { return nil, err } control := &CgroupControl{ + cgroup2: cgroup2, systemd: true, config: &cgroups.Cgroup{ Path: path, @@ -214,16 +345,45 @@ func NewSystemd(path string, resources *cgroups.Resources) (*CgroupControl, erro // Load loads an existing cgroup control. 
func Load(path string) (*CgroupControl, error) { - _, err := IsCgroup2UnifiedMode() + cgroup2, err := IsCgroup2UnifiedMode() if err != nil { return nil, err } control := &CgroupControl{ + cgroup2: cgroup2, systemd: false, config: &cgroups.Cgroup{ Path: path, }, } + if !cgroup2 { + controllers, err := getAvailableControllers(handlers, false) + if err != nil { + return nil, err + } + control.additionalControllers = controllers + } + if !cgroup2 { + oneExists := false + // check that the cgroup exists at least under one controller + for name := range handlers { + p := control.getCgroupv1Path(name) + if err := fileutils.Exists(p); err == nil { + oneExists = true + break + } + } + + // if there is no controller at all, raise an error + if !oneExists { + if unshare.IsRootless() { + return nil, ErrCgroupV1Rootless + } + // compatible with the error code + // used by containerd/cgroups + return nil, ErrCgroupDeleted + } + } return control, nil } @@ -288,7 +448,26 @@ func (c *CgroupControl) DeleteByPathConn(path string, conn *systemdDbus.Conn) er if c.systemd { return systemdDestroyConn(path, conn) } - return rmDirRecursively(filepath.Join(cgroupRoot, c.config.Path)) + if c.cgroup2 { + return rmDirRecursively(filepath.Join(cgroupRoot, c.config.Path)) + } + var lastError error + for _, h := range handlers { + if err := h.Destroy(c); err != nil { + lastError = err + } + } + + for _, ctr := range c.additionalControllers { + if ctr.symlink { + continue + } + p := c.getCgroupv1Path(ctr.name) + if err := rmDirRecursively(p); err != nil { + lastError = fmt.Errorf("remove %s: %w", p, err) + } + } + return lastError } // DeleteByPath deletes the specified cgroup path. @@ -314,6 +493,36 @@ func (c *CgroupControl) Update(resources *cgroups.Resources) error { return nil } +// AddPid moves the specified pid to the cgroup. 
+func (c *CgroupControl) AddPid(pid int) error { + pidString := []byte(fmt.Sprintf("%d\n", pid)) + + if c.cgroup2 { + path := filepath.Join(cgroupRoot, c.config.Path) + return fs2.CreateCgroupPath(path, c.config) + } + + names := slices.Collect(maps.Keys(handlers)) + + for _, c := range c.additionalControllers { + if !c.symlink { + names = append(names, c.name) + } + } + + for _, n := range names { + // If we aren't using cgroup2, we won't write correctly to unified hierarchy + if !c.cgroup2 && n == "unified" { + continue + } + p := filepath.Join(c.getCgroupv1Path(n), "tasks") + if err := os.WriteFile(p, pidString, 0o644); err != nil { + return fmt.Errorf("write %s: %w", p, err) + } + } + return nil +} + // Stat returns usage statistics for the cgroup. func (c *CgroupControl) Stat() (*cgroups.Stats, error) { m := cgroups.Stats{} @@ -364,6 +573,23 @@ func readCgroup2MapFile(ctr *CgroupControl, name string) (map[string][]string, e return readCgroupMapPath(p) } +func (c *CgroupControl) createCgroupDirectory(controller string) (bool, error) { + cPath := c.getCgroupv1Path(controller) + err := fileutils.Exists(cPath) + if err == nil { + return false, nil + } + + if !errors.Is(err, os.ErrNotExist) { + return false, err + } + + if err := os.MkdirAll(cPath, 0o755); err != nil { + return false, fmt.Errorf("creating cgroup for %s: %w", controller, err) + } + return true, nil +} + var TestMode bool func createCgroupv2Path(path string) (deferredError error) { @@ -445,6 +671,32 @@ func cleanString(s string) string { return strings.Trim(s, "\n") } +func readAcct(ctr *CgroupControl, name string) (uint64, error) { + p := filepath.Join(ctr.getCgroupv1Path(CPUAcct), name) + return readFileAsUint64(p) +} + +func readAcctList(ctr *CgroupControl, name string) ([]uint64, error) { + p := filepath.Join(ctr.getCgroupv1Path(CPUAcct), name) + data, err := os.ReadFile(p) + if err != nil { + return nil, err + } + r := []uint64{} + for s := range strings.SplitSeq(string(data), " ") { + s = 
cleanString(s) + if s == "" { + break + } + v, err := strconv.ParseUint(s, 10, 64) + if err != nil { + return nil, fmt.Errorf("parsing %s: %w", s, err) + } + r = append(r, v) + } + return r, nil +} + func cpusetCopyFromParent(path string, cgroupv2 bool) error { for _, file := range []string{"cpuset.cpus", "cpuset.mems"} { if _, err := cpusetCopyFileFromParent(path, file, cgroupv2); err != nil { @@ -487,10 +739,15 @@ func cpusetCopyFileFromParent(dir, file string, cgroupv2 bool) ([]byte, error) { // SystemCPUUsage returns the system usage for all the cgroups. func SystemCPUUsage() (uint64, error) { - _, err := IsCgroup2UnifiedMode() + cgroupv2, err := IsCgroup2UnifiedMode() if err != nil { return 0, err } + if !cgroupv2 { + p := filepath.Join(cgroupRoot, CPUAcct, "cpuacct.usage") + return readFileAsUint64(p) + } + files, err := os.ReadDir(cgroupRoot) if err != nil { return 0, err @@ -543,7 +800,7 @@ func UserConnection(uid int) (*systemdDbus.Conn, error) { func UserOwnsCurrentSystemdCgroup() (bool, error) { uid := os.Geteuid() - _, err := IsCgroup2UnifiedMode() + cgroup2, err := IsCgroup2UnifiedMode() if err != nil { return false, err } @@ -565,11 +822,20 @@ func UserOwnsCurrentSystemdCgroup() (bool, error) { // If we are on a cgroup v2 system and there are cgroup v1 controllers // mounted, ignore them when the current process is at the root cgroup. 
- if parts[1] != "" && parts[2] == "/" { + if cgroup2 && parts[1] != "" && parts[2] == "/" { continue } - cgroupPath := filepath.Join(cgroupRoot, parts[2]) + var cgroupPath string + + if cgroup2 { + cgroupPath = filepath.Join(cgroupRoot, parts[2]) + } else { + if parts[1] != "name=systemd" { + continue + } + cgroupPath = filepath.Join(cgroupRoot, "systemd", parts[2]) + } st, err := os.Stat(cgroupPath) if err != nil { diff --git a/common/pkg/cgroups/cgroups_linux_test.go b/common/pkg/cgroups/cgroups_linux_test.go index aaf800940e..2a415acea4 100644 --- a/common/pkg/cgroups/cgroups_linux_test.go +++ b/common/pkg/cgroups/cgroups_linux_test.go @@ -89,7 +89,7 @@ func TestResources(t *testing.T) { } // test CPU Quota adjustment. - u, _, b, _, _, _ := resourcesToProps(&resources) + u, _, b, _, _, _ := resourcesToProps(&resources, true) val, ok := u["CPUQuotaPerSecUSec"] if !ok { diff --git a/common/pkg/cgroups/cpu_linux.go b/common/pkg/cgroups/cpu_linux.go index f89bac87c2..899a86d5d3 100644 --- a/common/pkg/cgroups/cpu_linux.go +++ b/common/pkg/cgroups/cpu_linux.go @@ -3,6 +3,8 @@ package cgroups import ( + "errors" + "os" "path/filepath" "strconv" @@ -21,33 +23,75 @@ func getCPUHandler() *linuxCPUHandler { // Apply set the specified constraints. func (c *linuxCPUHandler) Apply(ctr *CgroupControl, res *cgroups.Resources) error { - man, err := fs2.NewManager(ctr.config, filepath.Join(cgroupRoot, ctr.config.Path)) - if err != nil { - return err + if ctr.cgroup2 { + man, err := fs2.NewManager(ctr.config, filepath.Join(cgroupRoot, ctr.config.Path)) + if err != nil { + return err + } + return man.Set(res) } - return man.Set(res) + path := filepath.Join(cgroupRoot, CPU, ctr.config.Path) + return c.CPU.Set(path, res) +} + +// Create the cgroup. +func (c *linuxCPUHandler) Create(ctr *CgroupControl) (bool, error) { + if ctr.cgroup2 { + return false, nil + } + return ctr.createCgroupDirectory(CPU) +} + +// Destroy the cgroup. 
+func (c *linuxCPUHandler) Destroy(ctr *CgroupControl) error { + return rmDirRecursively(ctr.getCgroupv1Path(CPU)) } // Stat fills a metrics structure with usage stats for the controller. func (c *linuxCPUHandler) Stat(ctr *CgroupControl, m *cgroups.Stats) error { + var err error cpu := cgroups.CpuStats{} - values, err := readCgroup2MapFile(ctr, "cpu.stat") - if err != nil { - return err - } - if val, found := values["usage_usec"]; found { - cpu.CpuUsage.TotalUsage, err = strconv.ParseUint(cleanString(val[0]), 10, 64) + if ctr.cgroup2 { + values, err := readCgroup2MapFile(ctr, "cpu.stat") if err != nil { return err } - cpu.CpuUsage.TotalUsage *= 1000 - } - if val, found := values["system_usec"]; found { - cpu.CpuUsage.UsageInKernelmode, err = strconv.ParseUint(cleanString(val[0]), 10, 64) + if val, found := values["usage_usec"]; found { + cpu.CpuUsage.TotalUsage, err = strconv.ParseUint(cleanString(val[0]), 10, 64) + if err != nil { + return err + } + cpu.CpuUsage.TotalUsage *= 1000 + } + if val, found := values["system_usec"]; found { + cpu.CpuUsage.UsageInKernelmode, err = strconv.ParseUint(cleanString(val[0]), 10, 64) + if err != nil { + return err + } + cpu.CpuUsage.UsageInKernelmode *= 1000 + } + } else { + cpu.CpuUsage.TotalUsage, err = readAcct(ctr, "cpuacct.usage") if err != nil { - return err + if !errors.Is(err, os.ErrNotExist) { + return err + } + cpu.CpuUsage.TotalUsage = 0 + } + cpu.CpuUsage.UsageInKernelmode, err = readAcct(ctr, "cpuacct.usage_sys") + if err != nil { + if !errors.Is(err, os.ErrNotExist) { + return err + } + cpu.CpuUsage.UsageInKernelmode = 0 + } + cpu.CpuUsage.PercpuUsage, err = readAcctList(ctr, "cpuacct.usage_percpu") + if err != nil { + if !errors.Is(err, os.ErrNotExist) { + return err + } + cpu.CpuUsage.PercpuUsage = nil } - cpu.CpuUsage.UsageInKernelmode *= 1000 } m.CpuStats = cpu return nil diff --git a/common/pkg/cgroups/cpuset_linux.go b/common/pkg/cgroups/cpuset_linux.go index c03a73623c..10b2298e12 100644 --- 
a/common/pkg/cgroups/cpuset_linux.go +++ b/common/pkg/cgroups/cpuset_linux.go @@ -20,17 +20,33 @@ func getCpusetHandler() *linuxCpusetHandler { // Apply set the specified constraints. func (c *linuxCpusetHandler) Apply(ctr *CgroupControl, res *cgroups.Resources) error { - man, err := fs2.NewManager(ctr.config, filepath.Join(cgroupRoot, ctr.config.Path)) - if err != nil { - return err + if ctr.cgroup2 { + man, err := fs2.NewManager(ctr.config, filepath.Join(cgroupRoot, ctr.config.Path)) + if err != nil { + return err + } + return man.Set(res) } - return man.Set(res) + path := filepath.Join(cgroupRoot, CPUset, ctr.config.Path) + return c.CPUSet.Set(path, res) } // Create the cgroup. func (c *linuxCpusetHandler) Create(ctr *CgroupControl) (bool, error) { - path := filepath.Join(cgroupRoot, ctr.config.Path) - return true, cpusetCopyFromParent(path, true) + if ctr.cgroup2 { + path := filepath.Join(cgroupRoot, ctr.config.Path) + return true, cpusetCopyFromParent(path, true) + } + created, err := ctr.createCgroupDirectory(CPUset) + if !created || err != nil { + return created, err + } + return true, cpusetCopyFromParent(ctr.getCgroupv1Path(CPUset), false) +} + +// Destroy the cgroup. +func (c *linuxCpusetHandler) Destroy(ctr *CgroupControl) error { + return rmDirRecursively(ctr.getCgroupv1Path(CPUset)) } // Stat fills a metrics structure with usage stats for the controller. diff --git a/common/pkg/cgroups/memory_linux.go b/common/pkg/cgroups/memory_linux.go index 5a06d902ee..7f61900308 100644 --- a/common/pkg/cgroups/memory_linux.go +++ b/common/pkg/cgroups/memory_linux.go @@ -20,11 +20,28 @@ func getMemoryHandler() *linuxMemHandler { // Apply set the specified constraints. 
func (c *linuxMemHandler) Apply(ctr *CgroupControl, res *cgroups.Resources) error { - man, err := fs2.NewManager(ctr.config, filepath.Join(cgroupRoot, ctr.config.Path)) - if err != nil { - return err + if ctr.cgroup2 { + man, err := fs2.NewManager(ctr.config, filepath.Join(cgroupRoot, ctr.config.Path)) + if err != nil { + return err + } + return man.Set(res) + } + path := filepath.Join(cgroupRoot, Memory, ctr.config.Path) + return c.Mem.Set(path, res) +} + +// Create the cgroup. +func (c *linuxMemHandler) Create(ctr *CgroupControl) (bool, error) { + if ctr.cgroup2 { + return false, nil } - return man.Set(res) + return ctr.createCgroupDirectory(Memory) +} + +// Destroy the cgroup. +func (c *linuxMemHandler) Destroy(ctr *CgroupControl) error { + return rmDirRecursively(ctr.getCgroupv1Path(Memory)) } // Stat fills a metrics structure with usage stats for the controller. @@ -35,25 +52,48 @@ func (c *linuxMemHandler) Stat(ctr *CgroupControl, m *cgroups.Stats) error { var memoryRoot string var limitFilename string - memoryRoot = filepath.Join(cgroupRoot, ctr.config.Path) - limitFilename = "memory.max" + if ctr.cgroup2 { + memoryRoot = filepath.Join(cgroupRoot, ctr.config.Path) + limitFilename = "memory.max" - // Read memory.current - current, err := readFileAsUint64(filepath.Join(memoryRoot, "memory.current")) - if err != nil { - return err - } + // Read memory.current + current, err := readFileAsUint64(filepath.Join(memoryRoot, "memory.current")) + if err != nil { + return err + } - // Read inactive_file from memory.stat - inactiveFile, err := readFileByKeyAsUint64(filepath.Join(memoryRoot, "memory.stat"), "inactive_file") - if err != nil { - return err - } + // Read inactive_file from memory.stat + inactiveFile, err := readFileByKeyAsUint64(filepath.Join(memoryRoot, "memory.stat"), "inactive_file") + if err != nil { + return err + } + + // Docker calculation: memory.current - memory.stat['inactive_file'] + memUsage.Usage.Usage = 0 + if inactiveFile < current { + 
memUsage.Usage.Usage = current - inactiveFile + } + } else { + memoryRoot = ctr.getCgroupv1Path(Memory) + limitFilename = "memory.limit_in_bytes" + + // Read memory.usage_in_bytes + usageInBytes, err := readFileAsUint64(filepath.Join(memoryRoot, "memory.usage_in_bytes")) + if err != nil { + return err + } + + // Read total_inactive_file from memory.stat + totalInactiveFile, err := readFileByKeyAsUint64(filepath.Join(memoryRoot, "memory.stat"), "total_inactive_file") + if err != nil { + return err + } - // Docker calculation: memory.current - memory.stat['inactive_file'] - memUsage.Usage.Usage = 0 - if inactiveFile < current { - memUsage.Usage.Usage = current - inactiveFile + // Docker calculation: memory.usage_in_bytes - memory.stat['total_inactive_file'] + memUsage.Usage.Usage = 0 + if totalInactiveFile < usageInBytes { + memUsage.Usage.Usage = usageInBytes - totalInactiveFile + } } memUsage.Usage.Limit, err = readFileAsUint64(filepath.Join(memoryRoot, limitFilename)) diff --git a/common/pkg/cgroups/pids_linux.go b/common/pkg/cgroups/pids_linux.go index f74d80a9bf..82202830e0 100644 --- a/common/pkg/cgroups/pids_linux.go +++ b/common/pkg/cgroups/pids_linux.go @@ -20,11 +20,29 @@ func getPidsHandler() *linuxPidHandler { // Apply set the specified constraints. func (c *linuxPidHandler) Apply(ctr *CgroupControl, res *cgroups.Resources) error { - man, err := fs2.NewManager(ctr.config, filepath.Join(cgroupRoot, ctr.config.Path)) - if err != nil { - return err + if ctr.cgroup2 { + man, err := fs2.NewManager(ctr.config, filepath.Join(cgroupRoot, ctr.config.Path)) + if err != nil { + return err + } + return man.Set(res) } - return man.Set(res) + + path := filepath.Join(cgroupRoot, Pids, ctr.config.Path) + return c.Pid.Set(path, res) +} + +// Create the cgroup. +func (c *linuxPidHandler) Create(ctr *CgroupControl) (bool, error) { + if ctr.cgroup2 { + return false, nil + } + return ctr.createCgroupDirectory(Pids) +} + +// Destroy the cgroup. 
+func (c *linuxPidHandler) Destroy(ctr *CgroupControl) error { + return rmDirRecursively(ctr.getCgroupv1Path(Pids)) } // Stat fills a metrics structure with usage stats for the controller. @@ -34,7 +52,12 @@ func (c *linuxPidHandler) Stat(ctr *CgroupControl, m *cgroups.Stats) error { return nil } - PIDRoot := filepath.Join(cgroupRoot, ctr.config.Path) + var PIDRoot string + if ctr.cgroup2 { + PIDRoot = filepath.Join(cgroupRoot, ctr.config.Path) + } else { + PIDRoot = ctr.getCgroupv1Path(Pids) + } current, err := readFileAsUint64(filepath.Join(PIDRoot, "pids.current")) if err != nil { diff --git a/common/pkg/cgroups/systemd_linux.go b/common/pkg/cgroups/systemd_linux.go index e59a007615..c0bc6d9d38 100644 --- a/common/pkg/cgroups/systemd_linux.go +++ b/common/pkg/cgroups/systemd_linux.go @@ -32,11 +32,18 @@ func systemdCreate(resources *cgroups.Resources, path string, c *systemdDbus.Con systemdDbus.PropDescription("cgroup " + name), systemdDbus.PropWants(slice), } + var ioString string + v2, _ := IsCgroup2UnifiedMode() + if v2 { + ioString = "IOAccounting" + } else { + ioString = "BlockIOAccounting" + } pMap := map[string]bool{ "DefaultDependencies": false, "MemoryAccounting": true, "CPUAccounting": true, - "IOAccounting": true, + ioString: true, } if i == 0 { pMap["Delegate"] = true @@ -50,7 +57,7 @@ func systemdCreate(resources *cgroups.Resources, path string, c *systemdDbus.Con properties = append(properties, p) } - uMap, sMap, bMap, iMap, structMap, err := resourcesToProps(resources) + uMap, sMap, bMap, iMap, structMap, err := resourcesToProps(resources, v2) if err != nil { lastError = err continue @@ -143,7 +150,7 @@ func systemdDestroyConn(path string, c *systemdDbus.Conn) error { return nil } -func resourcesToProps(res *cgroups.Resources) (map[string]uint64, map[string]string, map[string][]byte, map[string]int64, map[string][]BlkioDev, error) { +func resourcesToProps(res *cgroups.Resources, v2 bool) (map[string]uint64, map[string]string, map[string][]byte, 
map[string]int64, map[string][]BlkioDev, error) { bMap := make(map[string][]byte) // this array is not used but will be once more resource limits are added sMap := make(map[string]string) @@ -169,8 +176,13 @@ func resourcesToProps(res *cgroups.Resources) (map[string]uint64, map[string]str if res.CpuShares != 0 { // convert from shares to weight. weight only supports 1-10000 - wt := (1 + ((res.CpuShares-2)*9999)/262142) - uMap["CPUWeight"] = wt + v2, _ := IsCgroup2UnifiedMode() + if v2 { + wt := (1 + ((res.CpuShares-2)*9999)/262142) + uMap["CPUWeight"] = wt + } else { + uMap["CPUShares"] = res.CpuShares + } } // CPUSet @@ -200,15 +212,21 @@ func resourcesToProps(res *cgroups.Resources) (map[string]uint64, map[string]str case res.Memory == -1 || res.MemorySwap == -1: swap := -1 uMap["MemorySwapMax"] = uint64(swap) - default: + case v2: // swap max = swap (limit + swap limit) - limit uMap["MemorySwapMax"] = uint64(res.MemorySwap - res.Memory) + default: + uMap["MemorySwapMax"] = uint64(res.MemorySwap) } } // Blkio if res.BlkioWeight > 0 { - uMap["IOWeight"] = uint64(res.BlkioWeight) + if v2 { + uMap["IOWeight"] = uint64(res.BlkioWeight) + } else { + uMap["BlockIOWeight"] = uint64(res.BlkioWeight) + } } // systemd requires the paths to be in the form /dev/{block, char}/major:minor @@ -220,7 +238,11 @@ func resourcesToProps(res *cgroups.Resources) (map[string]uint64, map[string]str Device: fmt.Sprintf("/dev/block/%d:%d", entry.Major, entry.Minor), Bytes: entry.Rate, } - structMap["IOReadBandwidthMax"] = append(structMap["IOReadBandwidthMax"], newThrottle) + if v2 { + structMap["IOReadBandwidthMax"] = append(structMap["IOReadBandwidthMax"], newThrottle) + } else { + structMap["BlockIOReadBandwidth"] = append(structMap["BlockIOReadBandwidth"], newThrottle) + } } } @@ -230,7 +252,11 @@ func resourcesToProps(res *cgroups.Resources) (map[string]uint64, map[string]str Device: fmt.Sprintf("/dev/block/%d:%d", entry.Major, entry.Minor), Bytes: entry.Rate, } - 
structMap["IOWriteBandwidthMax"] = append(structMap["IOWriteBandwidthMax"], newThrottle) + if v2 { + structMap["IOWriteBandwidthMax"] = append(structMap["IOWriteBandwidthMax"], newThrottle) + } else { + structMap["BlockIOWriteBandwidth"] = append(structMap["BlockIOWriteBandwidth"], newThrottle) + } } } @@ -240,7 +266,11 @@ func resourcesToProps(res *cgroups.Resources) (map[string]uint64, map[string]str Device: fmt.Sprintf("/dev/block/%d:%d", entry.Major, entry.Minor), Bytes: uint64(entry.Weight), } - structMap["IODeviceWeight"] = append(structMap["IODeviceWeight"], newWeight) + if v2 { + structMap["IODeviceWeight"] = append(structMap["IODeviceWeight"], newWeight) + } else { + structMap["BlockIODeviceWeight"] = append(structMap["BlockIODeviceWeight"], newWeight) + } } } diff --git a/common/pkg/cgroups/utils_linux.go b/common/pkg/cgroups/utils_linux.go index b1ee60a294..a1b18a9695 100644 --- a/common/pkg/cgroups/utils_linux.go +++ b/common/pkg/cgroups/utils_linux.go @@ -15,6 +15,7 @@ import ( "github.com/opencontainers/cgroups" "github.com/sirupsen/logrus" + "go.podman.io/storage/pkg/fileutils" "golang.org/x/sys/unix" ) @@ -206,7 +207,7 @@ func MoveUnderCgroup(cgroup, subtree string, processes []uint32) error { } defer f.Close() - _, err = IsCgroup2UnifiedMode() + unifiedMode, err := IsCgroup2UnifiedMode() if err != nil { return err } @@ -220,12 +221,24 @@ func MoveUnderCgroup(cgroup, subtree string, processes []uint32) error { } // root cgroup, skip it - if parts[2] == "/" && parts[1] != "" { + if parts[2] == "/" && (!unifiedMode || parts[1] != "") { continue } cgroupRoot := "/sys/fs/cgroup" - if parts[1] != "" { + // Special case the unified mount on hybrid cgroup and named hierarchies. + // This works on Fedora 31, but we should really parse the mounts to see + // where the cgroup hierarchy is mounted. 
+ if parts[1] == "" && !unifiedMode { + // If it is not using unified mode, the cgroup v2 hierarchy is + // usually mounted under /sys/fs/cgroup/unified + cgroupRoot = filepath.Join(cgroupRoot, "unified") + + // Ignore the unified mount if it doesn't exist + if err := fileutils.Exists(cgroupRoot); err != nil && os.IsNotExist(err) { + continue + } + } else if parts[1] != "" { // Assume the controller is mounted at /sys/fs/cgroup/$CONTROLLER. controller := strings.TrimPrefix(parts[1], "name=") cgroupRoot = filepath.Join(cgroupRoot, controller) @@ -279,11 +292,15 @@ var ( // it is running in the root cgroup on a system that uses cgroupv2. func MaybeMoveToSubCgroup() error { maybeMoveToSubCgroupSync.Do(func() { - _, err := IsCgroup2UnifiedMode() + unifiedMode, err := IsCgroup2UnifiedMode() if err != nil { maybeMoveToSubCgroupSyncErr = err return } + if !unifiedMode { + maybeMoveToSubCgroupSyncErr = nil + return + } cgroup, err := GetOwnCgroup() if err != nil { maybeMoveToSubCgroupSyncErr = err diff --git a/common/pkg/cgroupv2/cgroups_linux.go b/common/pkg/cgroupv2/cgroups_linux.go new file mode 100644 index 0000000000..b7e1e6aeac --- /dev/null +++ b/common/pkg/cgroupv2/cgroups_linux.go @@ -0,0 +1,27 @@ +package cgroupv2 + +import ( + "sync" + "syscall" + + "golang.org/x/sys/unix" +) + +var ( + isCgroupV2Once sync.Once + isCgroupV2 bool + isCgroupV2Err error +) + +// Enabled returns whether we are running on cgroup v2. 
+func Enabled() (bool, error) { + isCgroupV2Once.Do(func() { + var st syscall.Statfs_t + if err := syscall.Statfs("/sys/fs/cgroup", &st); err != nil { + isCgroupV2, isCgroupV2Err = false, err + } else { + isCgroupV2, isCgroupV2Err = st.Type == unix.CGROUP2_SUPER_MAGIC, nil + } + }) + return isCgroupV2, isCgroupV2Err +} diff --git a/common/pkg/cgroupv2/cgroups_unsupported.go b/common/pkg/cgroupv2/cgroups_unsupported.go new file mode 100644 index 0000000000..8de8e60d80 --- /dev/null +++ b/common/pkg/cgroupv2/cgroups_unsupported.go @@ -0,0 +1,8 @@ +//go:build !linux + +package cgroupv2 + +// Enabled returns whether we are running on cgroup v2. +func Enabled() (bool, error) { + return false, nil +} diff --git a/common/pkg/config/default.go b/common/pkg/config/default.go index ee4dd63791..54402d1712 100644 --- a/common/pkg/config/default.go +++ b/common/pkg/config/default.go @@ -15,6 +15,7 @@ import ( "go.podman.io/common/internal/attributedstring" nettypes "go.podman.io/common/libnetwork/types" "go.podman.io/common/pkg/apparmor" + "go.podman.io/common/pkg/cgroupv2" "go.podman.io/storage/pkg/fileutils" "go.podman.io/storage/pkg/homedir" "go.podman.io/storage/pkg/unshare" @@ -230,12 +231,17 @@ func defaultConfig() (*Config, error) { } } + cgroupNS := "host" + if cgroup2, _ := cgroupv2.Enabled(); cgroup2 { + cgroupNS = "private" + } + return &Config{ Containers: ContainersConfig{ Annotations: attributedstring.Slice{}, ApparmorProfile: DefaultApparmorProfile, BaseHostsFile: "", - CgroupNS: "private", + CgroupNS: cgroupNS, Cgroups: getDefaultCgroupsMode(), DNSOptions: attributedstring.Slice{}, DNSSearches: attributedstring.Slice{}, @@ -644,7 +650,12 @@ func (c *Config) PidsLimit() int64 { if c.Engine.CgroupManager != SystemdCgroupsManager { return 0 } + cgroup2, _ := cgroupv2.Enabled() + if !cgroup2 { + return 0 + } } + return c.Containers.PidsLimit } diff --git a/common/pkg/config/systemd.go b/common/pkg/config/systemd.go index f8e84acece..e7c15b5909 100644 --- 
a/common/pkg/config/systemd.go +++ b/common/pkg/config/systemd.go @@ -7,7 +7,9 @@ import ( "path/filepath" "sync" + "go.podman.io/common/pkg/cgroupv2" "go.podman.io/common/pkg/systemd" + "go.podman.io/storage/pkg/unshare" ) var ( @@ -24,6 +26,11 @@ func defaultCgroupManager() string { if !useSystemd() { return CgroupfsCgroupsManager } + enabled, err := cgroupv2.Enabled() + if err == nil && !enabled && unshare.IsRootless() { + return CgroupfsCgroupsManager + } + return SystemdCgroupsManager } diff --git a/common/pkg/sysinfo/sysinfo_linux.go b/common/pkg/sysinfo/sysinfo_linux.go index 752b1bc120..ea98d49481 100644 --- a/common/pkg/sysinfo/sysinfo_linux.go +++ b/common/pkg/sysinfo/sysinfo_linux.go @@ -9,7 +9,7 @@ import ( "github.com/opencontainers/cgroups" "github.com/sirupsen/logrus" - cgroupv2 "go.podman.io/common/pkg/cgroups" + "go.podman.io/common/pkg/cgroupv2" "go.podman.io/storage/pkg/fileutils" "golang.org/x/sys/unix" ) @@ -41,7 +41,7 @@ func New(quiet bool) *SysInfo { sysInfo.cgroupCPUInfo = checkCgroupCPU(cgMounts, quiet) sysInfo.cgroupBlkioInfo = checkCgroupBlkioInfo(cgMounts, quiet) sysInfo.cgroupCpusetInfo = checkCgroupCpusetInfo(cgMounts, quiet) - sysInfo.cgroupPids = checkCgroupPids() + sysInfo.cgroupPids = checkCgroupPids(cgMounts, quiet) } _, ok := cgMounts["devices"] @@ -228,12 +228,22 @@ func checkCgroupCpusetInfo(cgMounts map[string]string, quiet bool) cgroupCpusetI } // checkCgroupPids reads the pids information from the pids cgroup mount point. 
-func checkCgroupPids() cgroupPids { - _, err := cgroupv2.IsCgroup2UnifiedMode() +func checkCgroupPids(cgMounts map[string]string, quiet bool) cgroupPids { + cgroup2, err := cgroupv2.Enabled() if err != nil { logrus.Errorf("Failed to check cgroups version: %v", err) return cgroupPids{} } + if !cgroup2 { + _, ok := cgMounts["pids"] + if !ok { + if !quiet { + logrus.Warn("Unable to find pids cgroup in mounts") + } + return cgroupPids{} + } + } + return cgroupPids{ PidsLimit: true, } diff --git a/common/pkg/systemd/systemd_linux.go b/common/pkg/systemd/systemd_linux.go index 1d839636aa..a189cfbe05 100644 --- a/common/pkg/systemd/systemd_linux.go +++ b/common/pkg/systemd/systemd_linux.go @@ -88,12 +88,14 @@ func MovePauseProcessToScope(pausePidPath string) { } if err != nil { - _, err2 := cgroups.IsCgroup2UnifiedMode() + unified, err2 := cgroups.IsCgroup2UnifiedMode() if err2 != nil { logrus.Warnf("Failed to detect if running with cgroup unified: %v", err) } - if RunsOnSystemd() { + if RunsOnSystemd() && unified { logrus.Warnf("Failed to add pause process to systemd sandbox cgroup: %v", err) + } else { + logrus.Debugf("Failed to add pause process to systemd sandbox cgroup: %v", err) } } }