diff --git a/Godeps/_workspace/src/github.com/opencontainers/runtime-spec/specs-go/config.go b/Godeps/_workspace/src/github.com/opencontainers/runtime-spec/specs-go/config.go index 491b734c937..148ab6fc921 100644 --- a/Godeps/_workspace/src/github.com/opencontainers/runtime-spec/specs-go/config.go +++ b/Godeps/_workspace/src/github.com/opencontainers/runtime-spec/specs-go/config.go @@ -311,6 +311,13 @@ type Network struct { Priorities []InterfacePriority `json:"priorities,omitempty"` } +// IntelRdt for Linux Intel RDT/CAT resource management (Linux 4.10) +type IntelRdt struct { + // The schema for L3 cache id and capacity bitmask (CBM) + // Format: "L3:=;=;..." + L3CacheSchema *string `json:"l3CacheSchema,omitempty"` +} + // Resources has container runtime resource constraints type Resources struct { // Devices configures the device whitelist. @@ -331,6 +338,8 @@ type Resources struct { HugepageLimits []HugepageLimit `json:"hugepageLimits,omitempty"` // Network restriction configuration Network *Network `json:"network,omitempty"` + // IntelRdt restriction configuration + IntelRdt *IntelRdt `json:"intelRdt,omitempty"` } // Device represents the mknod information for a Linux special device file diff --git a/events.go b/events.go index 77cf5f540b1..c85064fa490 100644 --- a/events.go +++ b/events.go @@ -24,11 +24,12 @@ type event struct { // stats is the runc specific stats structure for stability when encoding and decoding stats. 
type stats struct { - Cpu cpu `json:"cpu"` - Memory memory `json:"memory"` - Pids pids `json:"pids"` - Blkio blkio `json:"blkio"` - Hugetlb map[string]hugetlb `json:"hugetlb"` + Cpu cpu `json:"cpu"` + Memory memory `json:"memory"` + Pids pids `json:"pids"` + Blkio blkio `json:"blkio"` + Hugetlb map[string]hugetlb `json:"hugetlb"` + IntelRdt intelRdt `json:"intelRdt"` } type hugetlb struct { @@ -95,6 +96,12 @@ type memory struct { Raw map[string]uint64 `json:"raw,omitempty"` } +type intelRdt struct { + // The read-only default "schemas" in root, for reference + L3CacheSchemaRoot string `json:"l3CacheSchemaRoot,omitempty"` + L3CacheSchema string `json:"l3CacheSchema,omitempty"` +} + var eventsCommand = cli.Command{ Name: "events", Usage: "display container events such as OOM notifications, cpu, memory, and IO usage statistics", @@ -223,6 +230,10 @@ func convertLibcontainerStats(ls *libcontainer.Stats) *stats { for k, v := range cg.HugetlbStats { s.Hugetlb[k] = convertHugtlb(v) } + + is := cg.IntelRdtStats + s.IntelRdt.L3CacheSchemaRoot = is.IntelRdtRootStats.L3CacheSchema + s.IntelRdt.L3CacheSchema = is.IntelRdtGroupStats.L3CacheSchema return &s } diff --git a/libcontainer/SPEC.md b/libcontainer/SPEC.md index e5894c6429d..265073c16e9 100644 --- a/libcontainer/SPEC.md +++ b/libcontainer/SPEC.md @@ -154,6 +154,93 @@ that no processes or threads escape the cgroups. This sync is done via a pipe ( specified in the runtime section below ) that the container's init process will block waiting for the parent to finish setup. +**intelRdt**: +Intel platforms with new Xeon CPU support Intel Resource Director Technology +(RDT). Cache Allocation Technology (CAT) is a sub-feature of RDT, which +currently supports L3 cache resource allocation. + +This feature provides a way for the software to restrict cache allocation to a +defined 'subset' of L3 cache which may be overlapping with other 'subsets'. 
+The different subsets are identified by class of service (CLOS) and each CLOS +has a capacity bitmask (CBM). + +It can be used to handle L3 cache resource allocation for containers if +hardware and kernel support Intel RDT/CAT. + +`intelRdt` is implemented as the `intel_rdt` cgroup subsystem in libcontainer +even though the Linux kernel interface is not a real cgroup. When intelRdt is +joined, the statistics can be collected from intel_rdt cgroup subsystem. + +In Linux kernel, it is exposed via "resource control" filesystem, which is a +"cgroup-like" interface. + +Compared with cgroups, it has similar process management lifecycle and +interfaces in a container. But unlike cgroups' hierarchy, it has single level +filesystem layout. + +Intel RDT "resource control" filesystem hierarchy: +``` +mount -t resctrl resctrl /sys/fs/resctrl +tree /sys/fs/resctrl +/sys/fs/resctrl/ +|-- info +| |-- L3 +| |-- cbm_mask +| |-- num_closids +|-- cpus +|-- schemata +|-- tasks +|-- <container_id> + |-- cpus + |-- schemata + |-- tasks + +``` + +For runc, we can make use of `tasks` and `schemata` configuration for L3 cache +resource constraints. + +The file `tasks` has a list of tasks that belong to this group (e.g., +"<container_id>" group). Tasks can be added to a group by writing the task ID +to the "tasks" file (which will automatically remove them from the previous +group to which they belonged). New tasks created by fork(2) and clone(2) are +added to the same group as their parent. If a pid is not in any sub group, it +is in root group. + +The file `schemata` has allocation masks/values for L3 cache on each socket, +which contains L3 cache id and capacity bitmask (CBM). +``` + Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..." +``` +For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0`, +which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0. + +The valid L3 cache CBM is a *contiguous bits set* and number of bits that can +be set is less than the max bit. 
The maximum number of bits in the CBM varies among +supported Intel Xeon platforms. In Intel RDT "resource control" filesystem +layout, the CBM in a group should be a subset of the CBM in root. The kernel will +check if it is valid when writing. For example, 0xfffff in root indicates the max bits +of CBM is 20 bits, which maps to the entire L3 cache capacity. Some valid CBM +values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00, etc. + +For more information about Intel RDT/CAT kernel interface: +https://git.kernel.org/cgit/linux/kernel/git/tip/tip.git/commit/?h=x86/cache&id=f20e57892806ad244eaec7a7ae365e78fee53377 + +An example for runc: +``` +There are two L3 caches in the two-socket machine, the default CBM is 0xfffff +and the max CBM length is 20 bits. This configuration assigns 4/5 of L3 cache +id 0 and the whole L3 cache id 1 for the container: + +"linux": { + "resources": { + "intelRdt": { + "l3CacheSchema": "L3:0=ffff0;1=fffff" + } + } +} +``` + ### Security The standard set of Linux capabilities that are set in a container diff --git a/libcontainer/cgroups/cgroups.go b/libcontainer/cgroups/cgroups.go index 35fc8eb961d..9335734aad8 100644 --- a/libcontainer/cgroups/cgroups.go +++ b/libcontainer/cgroups/cgroups.go @@ -39,6 +39,9 @@ type Manager interface { // Sets the cgroup as configured. 
Set(container *configs.Config) error + + // Get non-cgroup resource path + GetResourcePath() string } type NotFoundError struct { diff --git a/libcontainer/cgroups/fs/apply_raw.go b/libcontainer/cgroups/fs/apply_raw.go index 30b20632b54..19f62d0f3c5 100644 --- a/libcontainer/cgroups/fs/apply_raw.go +++ b/libcontainer/cgroups/fs/apply_raw.go @@ -31,6 +31,7 @@ var ( &PerfEventGroup{}, &FreezerGroup{}, &NameGroup{GroupName: "name=systemd", Join: true}, + // If Intel RDT is enabled, will append IntelRdtGroup later } HugePageSizes, _ = cgroups.GetHugePageSize() ) @@ -62,9 +63,11 @@ type subsystem interface { } type Manager struct { - mu sync.Mutex - Cgroups *configs.Cgroup - Paths map[string]string + mu sync.Mutex + Cgroups *configs.Cgroup + Paths map[string]string + ContainerId string + ResourcePath string } // The absolute path to the root of the cgroup hierarchies. @@ -94,10 +97,11 @@ func getCgroupRoot() (string, error) { } type cgroupData struct { - root string - innerPath string - config *configs.Cgroup - pid int + root string + innerPath string + config *configs.Cgroup + pid int + containerId string } func (m *Manager) Apply(pid int) (err error) { @@ -109,7 +113,7 @@ func (m *Manager) Apply(pid int) (err error) { var c = m.Cgroups - d, err := getCgroupData(m.Cgroups, pid) + d, err := getCgroupData(m.Cgroups, pid, m.ContainerId) if err != nil { return err } @@ -131,23 +135,38 @@ func (m *Manager) Apply(pid int) (err error) { } paths := make(map[string]string) + + // If Intel RDT is enabled, append IntelRdtGroup to subsystems + if IsIntelRdtEnabled() && m.Cgroups.Resources.IntelRdtL3CacheSchema != "" { + subsystems = append(subsystems, &IntelRdtGroup{}) + intelRdtPath, err := GetIntelRdtPath(m.ContainerId) + if err != nil { + return err + } + m.ResourcePath = intelRdtPath + } + for _, sys := range subsystems { if err := sys.Apply(d); err != nil { return err } - // TODO: Apply should, ideally, be reentrant or be broken up into a separate - // create and join phase 
so that the cgroup hierarchy for a container can be - // created then join consists of writing the process pids to cgroup.procs - p, err := d.path(sys.Name()) - if err != nil { - // The non-presence of the devices subsystem is - // considered fatal for security reasons. - if cgroups.IsNotFound(err) && sys.Name() != "devices" { - continue + + // Intel RDT "resource control" filesystem is not in cgroup path + if sys.Name() != "intel_rdt" { + // TODO: Apply should, ideally, be reentrant or be broken up into a separate + // create and join phase so that the cgroup hierarchy for a container can be + // created then join consists of writing the process pids to cgroup.procs + p, err := d.path(sys.Name()) + if err != nil { + // The non-presence of the devices subsystem is + // considered fatal for security reasons. + if cgroups.IsNotFound(err) && sys.Name() != "devices" { + continue + } + return err } - return err + paths[sys.Name()] = p } - paths[sys.Name()] = p } m.Paths = paths return nil @@ -163,6 +182,12 @@ func (m *Manager) Destroy() error { return err } m.Paths = make(map[string]string) + + // Intel RDT "resource control" filesystem + if m.ResourcePath != "" { + return os.RemoveAll(m.ResourcePath) + } + m.ResourcePath = "" return nil } @@ -173,6 +198,13 @@ func (m *Manager) GetPaths() map[string]string { return paths } +func (m *Manager) GetResourcePath() string { + m.mu.Lock() + path := m.ResourcePath + m.mu.Unlock() + return path +} + func (m *Manager) GetStats() (*cgroups.Stats, error) { m.mu.Lock() defer m.mu.Unlock() @@ -186,6 +218,24 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) { return nil, err } } + + // Intel RDT "resource control" filesystem stats + if IsIntelRdtEnabled() && m.Cgroups.Resources.IntelRdtL3CacheSchema != "" { + intelRdtPath, err := GetIntelRdtPath(m.ContainerId) + if err != nil || !cgroups.PathExists(intelRdtPath) { + return nil, err + } + sys, err := subsystems.Get("intel_rdt") + if err == errSubsystemDoesNotExist { + // In case 
IntelRdtGroup is not appended to subsystems + subsystems = append(subsystems, &IntelRdtGroup{}) + } + sys, _ = subsystems.Get("intel_rdt") + if err := sys.GetStats(intelRdtPath, stats); err != nil { + return nil, err + } + } + return stats, nil } @@ -199,6 +249,9 @@ func (m *Manager) Set(container *configs.Config) error { paths := m.GetPaths() for _, sys := range subsystems { path := paths[sys.Name()] + if sys.Name() == "intel_rdt" { + path = m.GetResourcePath() + } if err := sys.Set(path, container.Cgroups); err != nil { return err } @@ -241,7 +294,7 @@ func (m *Manager) GetAllPids() ([]int, error) { return cgroups.GetAllPids(paths["devices"]) } -func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) { +func getCgroupData(c *configs.Cgroup, pid int, containerId string) (*cgroupData, error) { root, err := getCgroupRoot() if err != nil { return nil, err @@ -262,10 +315,11 @@ func getCgroupData(c *configs.Cgroup, pid int) (*cgroupData, error) { } return &cgroupData{ - root: root, - innerPath: innerPath, - config: c, - pid: pid, + root: root, + innerPath: innerPath, + config: c, + pid: pid, + containerId: containerId, }, nil } diff --git a/libcontainer/cgroups/fs/apply_raw_test.go b/libcontainer/cgroups/fs/apply_raw_test.go index ba4e9e543c4..83cff3ba6c7 100644 --- a/libcontainer/cgroups/fs/apply_raw_test.go +++ b/libcontainer/cgroups/fs/apply_raw_test.go @@ -20,7 +20,7 @@ func TestInvalidCgroupPath(t *testing.T) { Path: "../../../../../../../../../../some/path", } - data, err := getCgroupData(config, 0) + data, err := getCgroupData(config, 0, "") if err != nil { t.Errorf("couldn't get cgroup data: %v", err) } @@ -51,7 +51,7 @@ func TestInvalidAbsoluteCgroupPath(t *testing.T) { Path: "/../../../../../../../../../../some/path", } - data, err := getCgroupData(config, 0) + data, err := getCgroupData(config, 0, "") if err != nil { t.Errorf("couldn't get cgroup data: %v", err) } @@ -84,7 +84,7 @@ func TestInvalidCgroupParent(t *testing.T) { Name: "name", } - 
data, err := getCgroupData(config, 0) + data, err := getCgroupData(config, 0, "") if err != nil { t.Errorf("couldn't get cgroup data: %v", err) } @@ -117,7 +117,7 @@ func TestInvalidAbsoluteCgroupParent(t *testing.T) { Name: "name", } - data, err := getCgroupData(config, 0) + data, err := getCgroupData(config, 0, "") if err != nil { t.Errorf("couldn't get cgroup data: %v", err) } @@ -150,7 +150,7 @@ func TestInvalidCgroupName(t *testing.T) { Name: "../../../../../../../../../../some/path", } - data, err := getCgroupData(config, 0) + data, err := getCgroupData(config, 0, "") if err != nil { t.Errorf("couldn't get cgroup data: %v", err) } @@ -184,7 +184,7 @@ func TestInvalidAbsoluteCgroupName(t *testing.T) { Name: "/../../../../../../../../../../some/path", } - data, err := getCgroupData(config, 0) + data, err := getCgroupData(config, 0, "") if err != nil { t.Errorf("couldn't get cgroup data: %v", err) } @@ -217,7 +217,7 @@ func TestInvalidCgroupNameAndParent(t *testing.T) { Name: "../../../../../../../../../../some/path", } - data, err := getCgroupData(config, 0) + data, err := getCgroupData(config, 0, "") if err != nil { t.Errorf("couldn't get cgroup data: %v", err) } @@ -250,7 +250,7 @@ func TestInvalidAbsoluteCgroupNameAndParent(t *testing.T) { Name: "/../../../../../../../../../../some/path", } - data, err := getCgroupData(config, 0) + data, err := getCgroupData(config, 0, "") if err != nil { t.Errorf("couldn't get cgroup data: %v", err) } diff --git a/libcontainer/cgroups/fs/intelrdt.go b/libcontainer/cgroups/fs/intelrdt.go new file mode 100644 index 00000000000..1a09fd9d808 --- /dev/null +++ b/libcontainer/cgroups/fs/intelrdt.go @@ -0,0 +1,395 @@ +// +build linux + +package fs + +import ( + "bufio" + "errors" + "fmt" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +/* + * About Intel RDT/CAT feature: + * Intel 
platforms with new Xeon CPU support Resource Director Technology (RDT). + * Intel Cache Allocation Technology (CAT) is a sub-feature of RDT. Currently L3 + * Cache is the only resource that is supported in RDT. + * + * This feature provides a way for the software to restrict cache allocation to a + * defined 'subset' of L3 cache which may be overlapping with other 'subsets'. + * The different subsets are identified by class of service (CLOS) and each CLOS + * has a capacity bitmask (CBM). + * + * More information about Intel RDT/CAT can be found in section 17.17 + * of the Intel Software Developer Manual. + * + * About Intel RDT/CAT kernel interface: + * In Linux kernel, the interface is defined and exposed via "resource control" + * filesystem, which is a "cgroup-like" interface. + * + * Compared with cgroups, it has similar process management lifecycle and + * interfaces in a container. But unlike cgroups' hierarchy, it has single level + * filesystem layout. + * + * Intel RDT "resource control" filesystem hierarchy: + * mount -t resctrl resctrl /sys/fs/resctrl + * tree /sys/fs/resctrl + * /sys/fs/resctrl/ + * |-- info + * | |-- L3 + * | |-- cbm_mask + * | |-- num_closids + * |-- cpus + * |-- schemata + * |-- tasks + * |-- <container_id> + * |-- cpus + * |-- schemata + * |-- tasks + * + * For runc, we can make use of `tasks` and `schemata` configuration for L3 cache + * resource constraints. + * + * The file `tasks` has a list of tasks that belong to this group (e.g., + * "<container_id>" group). Tasks can be added to a group by writing the task ID + * to the "tasks" file (which will automatically remove them from the previous + * group to which they belonged). New tasks created by fork(2) and clone(2) are + * added to the same group as their parent. If a pid is not in any sub group, it is + * in root group. + * + * The file `schemata` has allocation bitmasks/values for L3 cache on each socket, + * which contains L3 cache id and capacity bitmask (CBM). + * Format: "L3:<cache_id0>=<cbm0>;<cache_id1>=<cbm1>;..." 
+ * For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0` + * which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0. + * + * The valid L3 cache CBM is a *contiguous bits set* and number of bits that can + * be set is less than the max bit. The max bits in the CBM is varied among + * supported Intel Xeon platforms. In Intel RDT "resource control" filesystem + * layout, the CBM in a group should be a subset of the CBM in root. Kernel will + * check if it is valid when writing. e.g., 0xfffff in root indicates the max bits + * of CBM is 20 bits, which mapping to entire L3 cache capacity. Some valid CBM + * values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc. + * + * For more information about Intel RDT/CAT kernel interface: + * https://git.kernel.org/cgit/linux/kernel/git/tip/tip.git/commit/?h=x86/cache&id=f20e57892806ad244eaec7a7ae365e78fee53377 + * + * An example for runc: + * There are two L3 caches in the two-socket machine, the default CBM is 0xfffff + * and the max CBM length is 20 bits. This configuration assigns 4/5 of L3 cache + * id 0 and the whole L3 cache id 1 for the container: + * + * "linux": { + * "resources": { + * "intelRdt": { + * "l3CacheSchema": "L3:0=ffff0;1=fffff" + * } + * } + * } + */ + +type IntelRdtGroup struct { +} + +func (s *IntelRdtGroup) Name() string { + return "intel_rdt" +} + +func (s *IntelRdtGroup) Apply(d *cgroupData) error { + data, err := getIntelRdtData(d.config, d.pid, d.containerId) + if err != nil && !cgroups.IsNotFound(err) { + return err + } + + if _, err := data.join(data.containerId); err != nil { + return err + } + + return nil +} + +func (s *IntelRdtGroup) Set(path string, cgroup *configs.Cgroup) error { + // About L3 cache schemata file: + // The schema has allocation masks/values for L3 cache on each socket, + // which contains L3 cache id and capacity bitmask (CBM). + // Format: "L3:=;=;..." 
+ // For example, on a two-socket machine, L3's schema line could be: + // L3:0=ff;1=c0 + // Which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0. + // + // About L3 cache CBM validity: + // The valid L3 cache CBM is a *contiguous bits set* and number of + // bits that can be set is less than the max bit. The max bits in the + // CBM is varied among supported Intel Xeon platforms. In Intel RDT + // "resource control" filesystem layout, the CBM in a group should + // be a subset of the CBM in root. Kernel will check if it is valid + // when writing. + // e.g., 0xfffff in root indicates the max bits of CBM is 20 bits, + // which mapping to entire L3 cache capacity. Some valid CBM values + // to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc. + l3CacheSchema := cgroup.Resources.IntelRdtL3CacheSchema + if l3CacheSchema != "" { + if err := writeFile(path, "schemata", l3CacheSchema+"\n"); err != nil { + return err + } + } + return nil +} + +func (s *IntelRdtGroup) Remove(d *cgroupData) error { + path, err := GetIntelRdtPath(d.containerId) + if err != nil { + return err + } + if err := removePath(path, nil); err != nil { + return err + } + return nil +} + +func (s *IntelRdtGroup) GetStats(path string, stats *cgroups.Stats) error { + // The read-only default "schemata" in root + rootPath, err := getIntelRdtRoot() + if err != nil { + return err + } + schemaRoot, err := getCgroupParamString(rootPath, "schemata") + if err != nil { + return err + } + stats.IntelRdtStats.IntelRdtRootStats.L3CacheSchema = schemaRoot + + // The stats in "container_id" group + schema, err := getCgroupParamString(path, "schemata") + if err != nil { + return err + } + stats.IntelRdtStats.IntelRdtGroupStats.L3CacheSchema = schema + + return nil +} + +const ( + IntelRdtTasks = "tasks" +) + +var ( + ErrIntelRdtNotEnabled = errors.New("intelrdt: config provided but Intel RDT not supported") + + // The root path of the Intel RDT "resource control" filesystem + intelRdtRoot string +) 
+ +type intelRdtData struct { + root string + config *configs.Cgroup + pid int + containerId string +} + +// The read-only Intel RDT related system information in root +type IntelRdtInfo struct { + CbmMask uint64 `json:"cbm_mask,omitempty"` + NumClosid uint64 `json:"num_closid,omitempty"` +} + +// Return the mount point path of Intel RDT "resource control" filesysem +func findIntelRdtMountpointDir() (string, error) { + f, err := os.Open("/proc/self/mountinfo") + if err != nil { + return "", err + } + defer f.Close() + + s := bufio.NewScanner(f) + for s.Scan() { + text := s.Text() + fields := strings.Split(text, " ") + // Safe as mountinfo encodes mountpoints with spaces as \040. + index := strings.Index(text, " - ") + postSeparatorFields := strings.Fields(text[index+3:]) + numPostFields := len(postSeparatorFields) + + // This is an error as we can't detect if the mount is for "Intel RDT" + if numPostFields == 0 { + return "", fmt.Errorf("Found no fields post '-' in %q", text) + } + + if postSeparatorFields[0] == "resctrl" { + // Check that the mount is properly formated. 
+ if numPostFields < 3 { + return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text) + } + + return fields[4], nil + } + } + if err := s.Err(); err != nil { + return "", err + } + + return "", err +} + +// Gets the root path of Intel RDT "resource control" filesystem +func getIntelRdtRoot() (string, error) { + if intelRdtRoot != "" { + return intelRdtRoot, nil + } + + root, err := findIntelRdtMountpointDir() + if err != nil { + return "", err + } + + if _, err := os.Stat(root); err != nil { + return "", err + } + + intelRdtRoot = root + return intelRdtRoot, nil +} + +func getIntelRdtData(c *configs.Cgroup, pid int, containerId string) (*intelRdtData, error) { + rootPath, err := getIntelRdtRoot() + if err != nil { + return nil, err + } + return &intelRdtData{ + root: rootPath, + config: c, + pid: pid, + containerId: containerId, + }, nil +} + +// WriteIntelRdtTasks writes the specified pid into the "tasks" file +func WriteIntelRdtTasks(dir string, pid int) error { + if dir == "" { + return fmt.Errorf("no such directory for %s", IntelRdtTasks) + } + + // Dont attach any pid if -1 is specified as a pid + if pid != -1 { + if err := ioutil.WriteFile(filepath.Join(dir, IntelRdtTasks), []byte(strconv.Itoa(pid)), 0700); err != nil { + return fmt.Errorf("failed to write %v to %v: %v", pid, IntelRdtTasks, err) + } + } + return nil +} + +func (raw *intelRdtData) join(name string) (string, error) { + path := filepath.Join(raw.root, name) + if err := os.MkdirAll(path, 0755); err != nil { + return "", err + } + + if err := WriteIntelRdtTasks(path, raw.pid); err != nil { + return "", err + } + return path, nil +} + +func isIntelRdtMounted() bool { + _, err := getIntelRdtRoot() + if err != nil { + if !cgroups.IsNotFound(err) { + return false + } + + // If not mounted, we try to mount again: + // mount -t resctrl resctrl /sys/fs/resctrl + if err := os.MkdirAll("/sys/fs/resctrl", 0755); err != nil { + return false + } + if err := exec.Command("mount", "-t", 
"resctrl", "resctrl", "/sys/fs/resctrl").Run(); err != nil { + return false + } + } + + return true +} + +func parseCpuInfoFile(path string) (bool, error) { + f, err := os.Open(path) + if err != nil { + return false, err + } + defer f.Close() + + s := bufio.NewScanner(f) + for s.Scan() { + if err := s.Err(); err != nil { + return false, err + } + + text := s.Text() + flags := strings.Split(text, " ") + + for _, flag := range flags { + if flag == "rdt_a" { + return true, nil + } + } + } + return false, nil +} + +// Check if Intel RDT is enabled +func IsIntelRdtEnabled() bool { + // 1. check if hardware and kernel support Intel RDT feature + // "rdt" flag is set if supported + isFlagSet, err := parseCpuInfoFile("/proc/cpuinfo") + if err != nil { + return false + } + + // 2. check if Intel RDT "resource control" filesystem is mounted + isMounted := isIntelRdtMounted() + + return isFlagSet && isMounted +} + +// Get Intel RDT "resource control" filesystem path +func GetIntelRdtPath(id string) (string, error) { + rootPath, err := getIntelRdtRoot() + if err != nil { + return "", err + } + + path := filepath.Join(rootPath, id) + return path, nil +} + +// Get read-only Intel RDT related system information +func GetIntelRdtInfo() (*IntelRdtInfo, error) { + intelRdtInfo := &IntelRdtInfo{} + + rootPath, err := getIntelRdtRoot() + if err != nil { + return nil, err + } + + path := filepath.Join(rootPath, "info", "l3") + cbmMask, err := getCgroupParamUint(path, "cbm_mask") + if err != nil { + return nil, err + } + numClosid, err := getCgroupParamUint(path, "num_closid") + if err != nil { + return nil, err + } + + intelRdtInfo.CbmMask = cbmMask + intelRdtInfo.NumClosid = numClosid + + return intelRdtInfo, nil +} diff --git a/libcontainer/cgroups/fs/intelrdt_test.go b/libcontainer/cgroups/fs/intelrdt_test.go new file mode 100644 index 00000000000..9f597345ec4 --- /dev/null +++ b/libcontainer/cgroups/fs/intelrdt_test.go @@ -0,0 +1,70 @@ +// +build linux + +package fs + +import ( + 
"testing" + + "github.com/opencontainers/runc/libcontainer/cgroups" +) + +func TestIntelRdtSetL3CacheSchema(t *testing.T) { + if !IsIntelRdtEnabled() { + return + } + + helper := NewCgroupTestUtil("intel_rdt", t) + defer helper.cleanup() + + const ( + l3CacheSchemaBefore = "L3:0=f;1=f0" + l3CacheSchemeAfter = "L3:0=f0;1=f" + ) + + helper.writeFileContents(map[string]string{ + "schemata": l3CacheSchemaBefore + "\n", + }) + + helper.CgroupData.config.Resources.IntelRdtL3CacheSchema = l3CacheSchemeAfter + intelrdt := &IntelRdtGroup{} + if err := intelrdt.Set(helper.CgroupPath, helper.CgroupData.config); err != nil { + t.Fatal(err) + } + + value, err := getCgroupParamString(helper.CgroupPath, "schemata") + if err != nil { + t.Fatalf("Failed to parse file 'schemata' - %s", err) + } + + if value != l3CacheSchemeAfter { + t.Fatal("Got the wrong value, set 'schemata' failed.") + } +} + +func TestIntelRdtStats(t *testing.T) { + if !IsIntelRdtEnabled() { + return + } + + helper := NewCgroupTestUtil("intel_rdt", t) + defer helper.cleanup() + + const ( + l3CacheSchemaContent = "L3:0=ffff0;1=fff00" + ) + + helper.writeFileContents(map[string]string{ + "schemata": l3CacheSchemaContent + "\n", + }) + + intelrdt := &IntelRdtGroup{} + stats := *cgroups.NewStats() + if err := intelrdt.GetStats(helper.CgroupPath, &stats); err != nil { + t.Fatal(err) + } + + if stats.IntelRdtStats.IntelRdtGroupStats.L3CacheSchema != l3CacheSchemaContent { + t.Fatalf("Expected '%q', got '%q' for file 'schemata'", + l3CacheSchemaContent, stats.IntelRdtStats.IntelRdtGroupStats.L3CacheSchema) + } +} diff --git a/libcontainer/cgroups/stats.go b/libcontainer/cgroups/stats.go index b483f1bf983..3622f927099 100644 --- a/libcontainer/cgroups/stats.go +++ b/libcontainer/cgroups/stats.go @@ -90,13 +90,27 @@ type HugetlbStats struct { Failcnt uint64 `json:"failcnt"` } +type IntelRdtRootStats struct { + L3CacheSchema string `json:"l3_cache_schema,omitempty"` +} + +type IntelRdtGroupStats struct { + L3CacheSchema 
string `json:"l3_cache_schema,omitempty"` +} + +type IntelRdtStats struct { + IntelRdtRootStats IntelRdtRootStats `json:"intel_rdt_root_stats,omitempty"` + IntelRdtGroupStats IntelRdtGroupStats `json:"intel_rdt_group_stats,omitempty"` +} + type Stats struct { CpuStats CpuStats `json:"cpu_stats,omitempty"` MemoryStats MemoryStats `json:"memory_stats,omitempty"` PidsStats PidsStats `json:"pids_stats,omitempty"` BlkioStats BlkioStats `json:"blkio_stats,omitempty"` // the map is in the format "size of hugepage: stats of the hugepage" - HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"` + HugetlbStats map[string]HugetlbStats `json:"hugetlb_stats,omitempty"` + IntelRdtStats IntelRdtStats `json:"intel_rdt_stats,omitempty"` } func NewStats() *Stats { diff --git a/libcontainer/cgroups/systemd/apply_nosystemd.go b/libcontainer/cgroups/systemd/apply_nosystemd.go index 7de9ae6050b..a13cb53e40c 100644 --- a/libcontainer/cgroups/systemd/apply_nosystemd.go +++ b/libcontainer/cgroups/systemd/apply_nosystemd.go @@ -10,8 +10,10 @@ import ( ) type Manager struct { - Cgroups *configs.Cgroup - Paths map[string]string + Cgroups *configs.Cgroup + Paths map[string]string + ContainerId string + ResourcePath string } func UseSystemd() bool { diff --git a/libcontainer/cgroups/systemd/apply_systemd.go b/libcontainer/cgroups/systemd/apply_systemd.go index fd428f90cb9..7bb0add93e8 100644 --- a/libcontainer/cgroups/systemd/apply_systemd.go +++ b/libcontainer/cgroups/systemd/apply_systemd.go @@ -21,9 +21,11 @@ import ( ) type Manager struct { - mu sync.Mutex - Cgroups *configs.Cgroup - Paths map[string]string + mu sync.Mutex + Cgroups *configs.Cgroup + Paths map[string]string + ContainerId string + ResourcePath string } type subsystem interface { @@ -62,6 +64,7 @@ var subsystems = subsystemSet{ &fs.NetPrioGroup{}, &fs.NetClsGroup{}, &fs.NameGroup{GroupName: "name=systemd"}, + // If Intel RDT is enabled, will append IntelRdtGroup later } const ( @@ -286,21 +289,36 @@ func (m 
*Manager) Apply(pid int) error { return err } + // If Intel RDT is enabled, append IntelRdtGroup to subsystems + if fs.IsIntelRdtEnabled() && m.Cgroups.Resources.IntelRdtL3CacheSchema != "" { + subsystems = append(subsystems, &fs.IntelRdtGroup{}) + + // Intel RDT "resource control" is not real cgroup, it will not join cgroup path + intelRdtPath, err := joinIntelRdt(c, pid, m.ContainerId) + if err != nil { + return err + } + m.ResourcePath = intelRdtPath + } + if err := joinCgroups(c, pid); err != nil { return err } paths := make(map[string]string) for _, s := range subsystems { - subsystemPath, err := getSubsystemPath(m.Cgroups, s.Name()) - if err != nil { - // Don't fail if a cgroup hierarchy was not found, just skip this subsystem - if cgroups.IsNotFound(err) { - continue + // Intel RDT "resource control" filesystem is not in cgroup path + if s.Name() != "intel_rdt" { + subsystemPath, err := getSubsystemPath(m.Cgroups, s.Name()) + if err != nil { + // Don't fail if a cgroup hierarchy was not found, just skip this subsystem + if cgroups.IsNotFound(err) { + continue + } + return err } - return err + paths[s.Name()] = subsystemPath } - paths[s.Name()] = subsystemPath } m.Paths = paths return nil @@ -317,6 +335,12 @@ func (m *Manager) Destroy() error { return err } m.Paths = make(map[string]string) + + // Intel RDT "resource control" filesystem + if m.ResourcePath != "" { + return os.RemoveAll(m.ResourcePath) + } + m.ResourcePath = "" return nil } @@ -327,6 +351,13 @@ func (m *Manager) GetPaths() map[string]string { return paths } +func (m *Manager) GetResourcePath() string { + m.mu.Lock() + path := m.ResourcePath + m.mu.Unlock() + return path +} + func writeFile(dir, file, data string) error { // Normally dir should not be empty, one case is that cgroup subsystem // is not mounted, we will get empty dir, and we want it fail here. 
@@ -350,6 +381,20 @@ func join(c *configs.Cgroup, subsystem string, pid int) (string, error) { return path, nil } +func joinIntelRdt(c *configs.Cgroup, pid int, containerId string) (string, error) { + path, err := fs.GetIntelRdtPath(containerId) + if err != nil { + return "", err + } + if err := os.MkdirAll(path, 0755); err != nil { + return "", err + } + if err := fs.WriteIntelRdtTasks(path, pid); err != nil { + return "", err + } + return path, nil +} + func joinCgroups(c *configs.Cgroup, pid int) error { for _, sys := range subsystems { name := sys.Name() @@ -357,6 +402,10 @@ func joinCgroups(c *configs.Cgroup, pid int) error { case "name=systemd": // let systemd handle this break + case "intel_rdt": + // Intel RDT "resource control" is not real cgroup, + // it will not join cgroup path + break case "cpuset": path, err := getSubsystemPath(c, name) if err != nil && !cgroups.IsNotFound(err) { @@ -498,6 +547,23 @@ func (m *Manager) GetStats() (*cgroups.Stats, error) { } } + // Intel RDT "resource control" filesystem stats + if fs.IsIntelRdtEnabled() && m.Cgroups.Resources.IntelRdtL3CacheSchema != "" { + intelRdtPath, err := fs.GetIntelRdtPath(m.ContainerId) + if err != nil || !cgroups.PathExists(intelRdtPath) { + return nil, err + } + sys, err := subsystems.Get("intel_rdt") + if err == errSubsystemDoesNotExist { + // In case IntelRdtGroup is not appended to subsystems + subsystems = append(subsystems, &fs.IntelRdtGroup{}) + } + sys, _ = subsystems.Get("intel_rdt") + if err := sys.GetStats(intelRdtPath, stats); err != nil { + return nil, err + } + } + return stats, nil } @@ -514,6 +580,10 @@ func (m *Manager) Set(container *configs.Config) error { return err } + if sys.Name() == "intel_rdt" { + path = m.GetResourcePath() + } + if err := sys.Set(path, container.Cgroups); err != nil { return err } diff --git a/libcontainer/configs/cgroup_unix.go b/libcontainer/configs/cgroup_unix.go index 14d62898162..77a7ad5c56b 100644 --- a/libcontainer/configs/cgroup_unix.go +++ 
b/libcontainer/configs/cgroup_unix.go @@ -121,4 +121,8 @@ type Resources struct { // Set class identifier for container's network packets NetClsClassid uint32 `json:"net_cls_classid_u"` + + // Intel RDT: the schema for L3 cache id and capacity bitmask (CBM) + // Format: "L3:=;=;..." + IntelRdtL3CacheSchema string `json:"intel_rdt_l3_cache_schema"` } diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 82c6d8e4420..6b7244094c4 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -61,6 +61,9 @@ type State struct { // Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore ExternalDescriptors []string `json:"external_descriptors,omitempty"` + + // Intel RDT "resource control" filesystem path + IntelRdtPath string `json:"intel_rdt_path"` } // Container is a libcontainer container object. @@ -376,6 +379,7 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, return &setnsProcess{ cmd: cmd, cgroupPaths: c.cgroupManager.GetPaths(), + intelRdtPath: c.cgroupManager.GetResourcePath(), childPipe: childPipe, parentPipe: parentPipe, config: c.newInitConfig(p), @@ -1202,6 +1206,7 @@ func (c *linuxContainer) currentState() (*State, error) { Created: c.created, }, CgroupPaths: c.cgroupManager.GetPaths(), + IntelRdtPath: c.cgroupManager.GetResourcePath(), NamespacePaths: make(map[configs.NamespaceType]string), ExternalDescriptors: externalDescriptors, } diff --git a/libcontainer/container_linux_test.go b/libcontainer/container_linux_test.go index b7ce552ef02..f804369cc34 100644 --- a/libcontainer/container_linux_test.go +++ b/libcontainer/container_linux_test.go @@ -8,14 +8,16 @@ import ( "testing" "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs" "github.com/opencontainers/runc/libcontainer/configs" ) type mockCgroupManager struct { - pids []int - allPids []int - stats *cgroups.Stats - paths 
map[string]string + pids []int + allPids []int + stats *cgroups.Stats + paths map[string]string + intelRdtPath string } func (m *mockCgroupManager) GetPids() ([]int, error) { @@ -46,6 +48,10 @@ func (m *mockCgroupManager) GetPaths() map[string]string { return m.paths } +func (m *mockCgroupManager) GetResourcePath() string { + return m.intelRdtPath +} + func (m *mockCgroupManager) Freeze(state configs.FreezerState) error { return nil } @@ -132,9 +138,10 @@ func TestGetContainerStats(t *testing.T) { func TestGetContainerState(t *testing.T) { var ( - pid = os.Getpid() - expectedMemoryPath = "/sys/fs/cgroup/memory/myid" - expectedNetworkPath = "/networks/fd" + pid = os.Getpid() + expectedMemoryPath = "/sys/fs/cgroup/memory/myid" + expectedNetworkPath = "/networks/fd" + expectedIntelRdtPath = "sys/fs/resctrl/myid" ) container := &linuxContainer{ id: "myid", @@ -164,6 +171,7 @@ func TestGetContainerState(t *testing.T) { paths: map[string]string{ "memory": expectedMemoryPath, }, + intelRdtPath: expectedIntelRdtPath, }, } container.state = &createdState{c: container} @@ -184,6 +192,12 @@ func TestGetContainerState(t *testing.T) { if memPath := paths["memory"]; memPath != expectedMemoryPath { t.Fatalf("expected memory path %q but received %q", expectedMemoryPath, memPath) } + if fs.IsIntelRdtEnabled() { + intelRdtPath := state.IntelRdtPath + if intelRdtPath != expectedIntelRdtPath { + t.Fatalf("expected intelRdt path %q but received %q", expectedIntelRdtPath, intelRdtPath) + } + } for _, ns := range container.config.Namespaces { path := state.NamespacePaths[ns.Type] if path == "" { diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go index 6e2bf3ad49b..3d295c5149a 100644 --- a/libcontainer/factory_linux.go +++ b/libcontainer/factory_linux.go @@ -43,10 +43,11 @@ func InitArgs(args ...string) func(*LinuxFactory) error { // SystemdCgroups is an options func to configure a LinuxFactory to return // containers that use systemd to create and manage cgroups. 
func SystemdCgroups(l *LinuxFactory) error { - l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { + l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string, containerId string) cgroups.Manager { return &systemd.Manager{ - Cgroups: config, - Paths: paths, + Cgroups: config, + Paths: paths, + ContainerId: containerId, } } return nil @@ -56,10 +57,11 @@ func SystemdCgroups(l *LinuxFactory) error { // containers that use the native cgroups filesystem implementation to // create and manage cgroups. func Cgroupfs(l *LinuxFactory) error { - l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string) cgroups.Manager { + l.NewCgroupsManager = func(config *configs.Cgroup, paths map[string]string, containerId string) cgroups.Manager { return &fs.Manager{ - Cgroups: config, - Paths: paths, + Cgroups: config, + Paths: paths, + ContainerId: containerId, } } return nil @@ -128,7 +130,7 @@ type LinuxFactory struct { Validator validate.Validator // NewCgroupsManager returns an initialized cgroups manager for a single container. 
- NewCgroupsManager func(config *configs.Cgroup, paths map[string]string) cgroups.Manager + NewCgroupsManager func(config *configs.Cgroup, paths map[string]string, containerId string) cgroups.Manager } func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) { @@ -177,7 +179,7 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err config: config, initArgs: l.InitArgs, criuPath: l.CriuPath, - cgroupManager: l.NewCgroupsManager(config.Cgroups, nil), + cgroupManager: l.NewCgroupsManager(config.Cgroups, nil, id), } c.state = &stoppedState{c: c} return c, nil @@ -204,7 +206,7 @@ func (l *LinuxFactory) Load(id string) (Container, error) { config: &state.Config, initArgs: l.InitArgs, criuPath: l.CriuPath, - cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths), + cgroupManager: l.NewCgroupsManager(state.Config.Cgroups, state.CgroupPaths, id), root: containerRoot, created: state.Created, } diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 4b54e4b215c..bfde5181280 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -14,6 +14,7 @@ import ( "syscall" "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/fs" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/utils" @@ -47,6 +48,7 @@ type setnsProcess struct { parentPipe *os.File childPipe *os.File cgroupPaths map[string]string + intelRdtPath string config *initConfig fds []string process *Process @@ -87,6 +89,15 @@ func (p *setnsProcess) start() (err error) { return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid()) } } + if p.intelRdtPath != "" { + // if Intel RDT "resource control" filesystem path exists + _, err := os.Stat(p.intelRdtPath) + if err == nil { + if err := fs.WriteIntelRdtTasks(p.intelRdtPath, p.pid()); err 
!= nil { + return newSystemErrorWithCausef(err, "adding pid %d to Intel RDT resource control filesystem", p.pid()) + } + } + } // set oom_score_adj if err := setOomScoreAdj(p.config.Config.OomScoreAdj, p.pid()); err != nil { return newSystemErrorWithCause(err, "setting oom score") diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index fec19784ffb..7ebc96fc1ed 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -464,6 +464,11 @@ func createCgroupConfig(name string, useSystemdCgroup bool, spec *specs.Spec) (* }) } } + if r.IntelRdt != nil { + if r.IntelRdt.L3CacheSchema != nil { + c.Resources.IntelRdtL3CacheSchema = *r.IntelRdt.L3CacheSchema + } + } return c, nil }