diff --git a/events.go b/events.go index de2b5f8e440..94dd2b93dc2 100644 --- a/events.go +++ b/events.go @@ -11,6 +11,7 @@ import ( "github.com/opencontainers/runc/libcontainer" "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/sirupsen/logrus" "github.com/urfave/cli" @@ -25,11 +26,12 @@ type event struct { // stats is the runc specific stats structure for stability when encoding and decoding stats. type stats struct { - CPU cpu `json:"cpu"` - Memory memory `json:"memory"` - Pids pids `json:"pids"` - Blkio blkio `json:"blkio"` - Hugetlb map[string]hugetlb `json:"hugetlb"` + CPU cpu `json:"cpu"` + Memory memory `json:"memory"` + Pids pids `json:"pids"` + Blkio blkio `json:"blkio"` + Hugetlb map[string]hugetlb `json:"hugetlb"` + IntelRdt intelRdt `json:"intel_rdt"` } type hugetlb struct { @@ -96,6 +98,23 @@ type memory struct { Raw map[string]uint64 `json:"raw,omitempty"` } +type l3CacheInfo struct { + CbmMask string `json:"cbm_mask,omitempty"` + MinCbmBits uint64 `json:"min_cbm_bits,omitempty"` + NumClosids uint64 `json:"num_closids,omitempty"` +} + +type intelRdt struct { + // The read-only L3 cache information + L3CacheInfo *l3CacheInfo `json:"l3_cache_info,omitempty"` + + // The read-only L3 cache schema in root + L3CacheSchemaRoot string `json:"l3_cache_schema_root,omitempty"` + + // The L3 cache schema in 'container_id' group + L3CacheSchema string `json:"l3_cache_schema,omitempty"` +} + var eventsCommand = cli.Command{ Name: "events", Usage: "display container events such as OOM notifications, cpu, memory, and IO usage statistics", @@ -227,6 +246,13 @@ func convertLibcontainerStats(ls *libcontainer.Stats) *stats { for k, v := range cg.HugetlbStats { s.Hugetlb[k] = convertHugtlb(v) } + + if is := ls.IntelRdtStats; is != nil { + s.IntelRdt.L3CacheInfo = convertL3CacheInfo(is.L3CacheInfo) + s.IntelRdt.L3CacheSchemaRoot = is.L3CacheSchemaRoot + s.IntelRdt.L3CacheSchema = is.L3CacheSchema + } + return &s } @@ -259,3 +285,11 @@ func convertBlkioEntry(c []cgroups.BlkioStatEntry) []blkioEntry { } return out } + +func convertL3CacheInfo(i *intelrdt.L3CacheInfo) *l3CacheInfo { + return &l3CacheInfo{ + CbmMask: i.CbmMask, + MinCbmBits: i.MinCbmBits, + NumClosids: i.NumClosids, + } +} diff --git a/libcontainer/SPEC.md b/libcontainer/SPEC.md index e5894c6429d..0f2ce3518b6 100644 --- a/libcontainer/SPEC.md +++ b/libcontainer/SPEC.md @@ -154,6 +154,90 @@ that no processes or threads escape the cgroups. This sync is done via a pipe ( specified in the runtime section below ) that the container's init process will block waiting for the parent to finish setup. +### IntelRdt + +Intel platforms with new Xeon CPU support Intel Resource Director Technology +(RDT). Cache Allocation Technology (CAT) is a sub-feature of RDT, which +currently supports L3 cache resource allocation. + +This feature provides a way for the software to restrict cache allocation to a +defined 'subset' of L3 cache which may be overlapping with other 'subsets'. +The different subsets are identified by class of service (CLOS) and each CLOS +has a capacity bitmask (CBM). + +It can be used to handle L3 cache resource allocation for containers if +hardware and kernel support Intel RDT/CAT. + +In Linux 4.10 kernel or newer, the interface is defined and exposed via +"resource control" filesystem, which is a "cgroup-like" interface. + +Comparing with cgroups, it has similar process management lifecycle and +interfaces in a container. But unlike cgroups' hierarchy, it has single level +filesystem layout. + +Intel RDT "resource control" filesystem hierarchy: +``` +mount -t resctrl resctrl /sys/fs/resctrl +tree /sys/fs/resctrl +/sys/fs/resctrl/ +|-- info +| |-- L3 +| |-- cbm_mask +| |-- min_cbm_bits +| |-- num_closids +|-- cpus +|-- schemata +|-- tasks +|-- + |-- cpus + |-- schemata + |-- tasks + +``` + +For runc, we can make use of `tasks` and `schemata` configuration for L3 cache +resource constraints. + +The file `tasks` has a list of tasks that belongs to this group (e.g., +" group). Tasks can be added to a group by writing the task ID +to the "tasks" file (which will automatically remove them from the previous +group to which they belonged). New tasks created by fork(2) and clone(2) are +added to the same group as their parent. If a pid is not in any sub group, it +is in root group. + +The file `schemata` has allocation masks/values for L3 cache on each socket, +which contains L3 cache id and capacity bitmask (CBM). +``` + Format: "L3:=;=;..." +``` +For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0` +Which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0. + +The valid L3 cache CBM is a *contiguous bits set* and number of bits that can +be set is less than the max bit. The max bits in the CBM is varied among +supported Intel Xeon platforms. In Intel RDT "resource control" filesystem +layout, the CBM in a group should be a subset of the CBM in root. Kernel will +check if it is valid when writing. e.g., 0xfffff in root indicates the max bits +of CBM is 20 bits, which mapping to entire L3 cache capacity. Some valid CBM +values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc. + +For more information about Intel RDT/CAT kernel interface: +https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt + +An example for runc: +``` +Consider a two-socket machine with two L3 caches where the default CBM is +0xfffff and the max CBM length is 20 bits. With this configuration, tasks +inside the container only have access to the "upper" 80% of L3 cache id 0 and +the "lower" 50% L3 cache id 1: + +"linux": { + "intelRdt": { + "l3CacheSchema": "L3:0=ffff0;1=3ff" + } +} +``` + ### Security The standard set of Linux capabilities that are set in a container diff --git a/libcontainer/configs/config.go b/libcontainer/configs/config.go index 269fffff357..3cae4fd8d96 100644 --- a/libcontainer/configs/config.go +++ b/libcontainer/configs/config.go @@ -187,6 +187,10 @@ type Config struct { // Rootless specifies whether the container is a rootless container. Rootless bool `json:"rootless"` + + // IntelRdt specifies settings for Intel RDT/CAT group that the container is placed into + // to limit the resources (e.g., L3 cache) the container has available + IntelRdt *IntelRdt `json:"intel_rdt,omitempty"` } type Hooks struct { diff --git a/libcontainer/configs/intelrdt.go b/libcontainer/configs/intelrdt.go new file mode 100644 index 00000000000..36bd5f96a11 --- /dev/null +++ b/libcontainer/configs/intelrdt.go @@ -0,0 +1,7 @@ +package configs + +type IntelRdt struct { + // The schema for L3 cache id and capacity bitmask (CBM) + // Format: "L3:=;=;..." + L3CacheSchema string `json:"l3_cache_schema,omitempty"` +} diff --git a/libcontainer/configs/validate/validator.go b/libcontainer/configs/validate/validator.go index 8284345442c..ca2efe84d85 100644 --- a/libcontainer/configs/validate/validator.go +++ b/libcontainer/configs/validate/validator.go @@ -7,6 +7,7 @@ import ( "strings" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/intelrdt" selinux "github.com/opencontainers/selinux/go-selinux" ) @@ -40,6 +41,9 @@ func (v *ConfigValidator) Validate(config *configs.Config) error { if err := v.sysctl(config); err != nil { return err } + if err := v.intelrdt(config); err != nil { + return err + } if config.Rootless { if err := v.rootless(config); err != nil { return err @@ -153,6 +157,19 @@ func (v *ConfigValidator) sysctl(config *configs.Config) error { return nil } +func (v *ConfigValidator) intelrdt(config *configs.Config) error { + if config.IntelRdt != nil { + if !intelrdt.IsIntelRdtEnabled() { + return fmt.Errorf("intelRdt is specified in config, but Intel RDT feature is not supported or enabled") + } + if config.IntelRdt.L3CacheSchema == "" { + return fmt.Errorf("intelRdt is specified in config, but intelRdt.l3CacheSchema is empty") + } + } + + return nil +} + func isSymbolicLink(path string) (bool, error) { fi, err := os.Lstat(path) if err != nil { diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 9e1b74d77af..ebb46358831 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -21,6 +21,7 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/criurpc" + "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/utils" @@ -38,6 +39,7 @@ type linuxContainer struct { root string config *configs.Config cgroupManager cgroups.Manager + intelRdtManager intelrdt.Manager initArgs []string initProcess parentProcess initProcessStartTime uint64 @@ -67,6 +69,9 @@ type State struct { // Container's standard descriptors (std{in,out,err}), needed for checkpoint and restore ExternalDescriptors []string `json:"external_descriptors,omitempty"` + + // Intel RDT "resource control" filesystem path + IntelRdtPath string `json:"intel_rdt_path"` } // Container is a libcontainer container object. @@ -163,6 +168,11 @@ func (c *linuxContainer) Stats() (*Stats, error) { if stats.CgroupStats, err = c.cgroupManager.GetStats(); err != nil { return stats, newSystemErrorWithCause(err, "getting container stats from cgroups") } + if c.intelRdtManager != nil { + if stats.IntelRdtStats, err = c.intelRdtManager.GetStats(); err != nil { + return stats, newSystemErrorWithCause(err, "getting container's Intel RDT stats") + } + } for _, iface := range c.config.Networks { switch iface.Type { case "veth": @@ -193,6 +203,15 @@ func (c *linuxContainer) Set(config configs.Config) error { } return err } + if c.intelRdtManager != nil { + if err := c.intelRdtManager.Set(&config); err != nil { + // Set configs back + if err2 := c.intelRdtManager.Set(c.config); err2 != nil { + logrus.Warnf("Setting back intelrdt configs failed due to error: %v, your state.json and actual configs might be inconsistent.", err2) + } + return err + } + } // After config setting succeed, update config and states c.config = &config _, err = c.updateState(nil) @@ -434,15 +453,16 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c return nil, err } return &initProcess{ - cmd: cmd, - childPipe: childPipe, - parentPipe: parentPipe, - manager: c.cgroupManager, - config: c.newInitConfig(p), - container: c, - process: p, - bootstrapData: data, - sharePidns: sharePidns, + cmd: cmd, + childPipe: childPipe, + parentPipe: parentPipe, + manager: c.cgroupManager, + intelRdtManager: c.intelRdtManager, + config: c.newInitConfig(p), + container: c, + process: p, + bootstrapData: data, + sharePidns: sharePidns, }, nil } @@ -461,6 +481,7 @@ func (c *linuxContainer) newSetnsProcess(p *Process, cmd *exec.Cmd, parentPipe, return &setnsProcess{ cmd: cmd, cgroupPaths: c.cgroupManager.GetPaths(), + intelRdtPath: state.IntelRdtPath, childPipe: childPipe, parentPipe: parentPipe, config: c.newInitConfig(p), @@ -1519,6 +1540,10 @@ func (c *linuxContainer) currentState() (*State, error) { startTime, _ = c.initProcess.startTime() externalDescriptors = c.initProcess.externalDescriptors() } + intelRdtPath, err := intelrdt.GetIntelRdtPath(c.ID()) + if err != nil { + intelRdtPath = "" + } state := &State{ BaseState: BaseState{ ID: c.ID(), @@ -1529,6 +1554,7 @@ func (c *linuxContainer) currentState() (*State, error) { }, Rootless: c.config.Rootless, CgroupPaths: c.cgroupManager.GetPaths(), + IntelRdtPath: intelRdtPath, NamespacePaths: make(map[configs.NamespaceType]string), ExternalDescriptors: externalDescriptors, } diff --git a/libcontainer/container_linux_test.go b/libcontainer/container_linux_test.go index 75c2898529b..263db00feb8 100644 --- a/libcontainer/container_linux_test.go +++ b/libcontainer/container_linux_test.go @@ -9,6 +9,7 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/system" ) @@ -19,6 +20,11 @@ type mockCgroupManager struct { paths map[string]string } +type mockIntelRdtManager struct { + stats *intelrdt.Stats + path string +} + func (m *mockCgroupManager) GetPids() ([]int, error) { return m.pids, nil } @@ -51,6 +57,26 @@ func (m *mockCgroupManager) Freeze(state configs.FreezerState) error { return nil } +func (m *mockIntelRdtManager) Apply(pid int) error { + return nil +} + +func (m *mockIntelRdtManager) GetStats() (*intelrdt.Stats, error) { + return m.stats, nil +} + +func (m *mockIntelRdtManager) Destroy() error { + return nil +} + +func (m *mockIntelRdtManager) GetPath() string { + return m.path +} + +func (m *mockIntelRdtManager) Set(container *configs.Config) error { + return nil +} + type mockProcess struct { _pid int started uint64 @@ -118,6 +144,11 @@ func TestGetContainerStats(t *testing.T) { }, }, }, + intelRdtManager: &mockIntelRdtManager{ + stats: &intelrdt.Stats{ + L3CacheSchema: "L3:0=f;1=f0", + }, + }, } stats, err := container.Stats() if err != nil { @@ -129,13 +160,22 @@ func TestGetContainerStats(t *testing.T) { if stats.CgroupStats.MemoryStats.Usage.Usage != 1024 { t.Fatalf("expected memory usage 1024 but recevied %d", stats.CgroupStats.MemoryStats.Usage.Usage) } + if intelrdt.IsIntelRdtEnabled() { + if stats.IntelRdtStats == nil { + t.Fatal("intel rdt stats are nil") + } + if stats.IntelRdtStats.L3CacheSchema != "L3:0=f;1=f0" { + t.Fatalf("expected L3CacheSchema L3:0=f;1=f0 but recevied %s", stats.IntelRdtStats.L3CacheSchema) + } + } } func TestGetContainerState(t *testing.T) { var ( - pid = os.Getpid() - expectedMemoryPath = "/sys/fs/cgroup/memory/myid" - expectedNetworkPath = fmt.Sprintf("/proc/%d/ns/net", pid) + pid = os.Getpid() + expectedMemoryPath = "/sys/fs/cgroup/memory/myid" + expectedNetworkPath = fmt.Sprintf("/proc/%d/ns/net", pid) + expectedIntelRdtPath = "/sys/fs/resctrl/myid" ) container := &linuxContainer{ id: "myid", @@ -166,6 +206,12 @@ func TestGetContainerState(t *testing.T) { "memory": expectedMemoryPath, }, }, + intelRdtManager: &mockIntelRdtManager{ + stats: &intelrdt.Stats{ + L3CacheSchema: "L3:0=f0;1=f", + }, + path: expectedIntelRdtPath, + }, } container.state = &createdState{c: container} state, err := container.State() @@ -185,6 +231,15 @@ func TestGetContainerState(t *testing.T) { if memPath := paths["memory"]; memPath != expectedMemoryPath { t.Fatalf("expected memory path %q but received %q", expectedMemoryPath, memPath) } + if intelrdt.IsIntelRdtEnabled() { + intelRdtPath := state.IntelRdtPath + if intelRdtPath == "" { + t.Fatal("intel rdt path should not be empty") + } + if intelRdtPath != expectedIntelRdtPath { + t.Fatalf("expected intel rdt path %q but received %q", expectedIntelRdtPath, intelRdtPath) + } + } for _, ns := range container.config.Namespaces { path := state.NamespacePaths[ns.Type] if path == "" { diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go index 947bdea1ceb..7ac6e1dee3a 100644 --- a/libcontainer/factory_linux.go +++ b/libcontainer/factory_linux.go @@ -18,6 +18,7 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups/systemd" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/configs/validate" + "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/utils" "golang.org/x/sys/unix" @@ -86,6 +87,20 @@ func RootlessCgroups(l *LinuxFactory) error { return nil } +// IntelRdtfs is an options func to configure a LinuxFactory to return +// containers that use the Intel RDT "resource control" filesystem to +// create and manage Intel Xeon platform shared resources (e.g., L3 cache). +func IntelRdtFs(l *LinuxFactory) error { + l.NewIntelRdtManager = func(config *configs.Config, id string, path string) intelrdt.Manager { + return &intelrdt.IntelRdtManager{ + Config: config, + Id: id, + Path: path, + } + } + return nil +} + // TmpfsRoot is an option func to mount LinuxFactory.Root to tmpfs. func TmpfsRoot(l *LinuxFactory) error { mounted, err := mount.Mounted(l.Root) @@ -150,6 +165,9 @@ type LinuxFactory struct { // NewCgroupsManager returns an initialized cgroups manager for a single container. NewCgroupsManager func(config *configs.Cgroup, paths map[string]string) cgroups.Manager + + // NewIntelRdtManager returns an initialized Intel RDT manager for a single container. + NewIntelRdtManager func(config *configs.Config, id string, path string) intelrdt.Manager } func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, error) { @@ -185,6 +203,10 @@ func (l *LinuxFactory) Create(id string, config *configs.Config) (Container, err criuPath: l.CriuPath, cgroupManager: l.NewCgroupsManager(config.Cgroups, nil), } + c.intelRdtManager = nil + if intelrdt.IsIntelRdtEnabled() && c.config.IntelRdt != nil { + c.intelRdtManager = l.NewIntelRdtManager(config, id, "") + } c.state = &stoppedState{c: c} return c, nil } @@ -222,6 +244,10 @@ func (l *LinuxFactory) Load(id string) (Container, error) { if err := c.refreshState(); err != nil { return nil, err } + c.intelRdtManager = nil + if intelrdt.IsIntelRdtEnabled() && c.config.IntelRdt != nil { + c.intelRdtManager = l.NewIntelRdtManager(&state.Config, id, state.IntelRdtPath) + } return c, nil } diff --git a/libcontainer/factory_linux_test.go b/libcontainer/factory_linux_test.go index 055e1bfb0bb..57c157fe089 100644 --- a/libcontainer/factory_linux_test.go +++ b/libcontainer/factory_linux_test.go @@ -50,6 +50,32 @@ func TestFactoryNew(t *testing.T) { } } +func TestFactoryNewIntelRdt(t *testing.T) { + root, rerr := newTestRoot() + if rerr != nil { + t.Fatal(rerr) + } + defer os.RemoveAll(root) + factory, err := New(root, Cgroupfs, IntelRdtFs) + if err != nil { + t.Fatal(err) + } + if factory == nil { + t.Fatal("factory should not be nil") + } + lfactory, ok := factory.(*LinuxFactory) + if !ok { + t.Fatal("expected linux factory returned on linux based systems") + } + if lfactory.Root != root { + t.Fatalf("expected factory root to be %q but received %q", root, lfactory.Root) + } + + if factory.Type() != "libcontainer" { + t.Fatalf("unexpected factory type: %q, expected %q", factory.Type(), "libcontainer") + } +} + func TestFactoryNewTmpfs(t *testing.T) { root, rerr := newTestRoot() if rerr != nil { @@ -164,7 +190,7 @@ func TestFactoryLoadContainer(t *testing.T) { if err := marshal(filepath.Join(root, id, stateFilename), expectedState); err != nil { t.Fatal(err) } - factory, err := New(root, Cgroupfs) + factory, err := New(root, Cgroupfs, IntelRdtFs) if err != nil { t.Fatal(err) } diff --git a/libcontainer/intelrdt/intelrdt.go b/libcontainer/intelrdt/intelrdt.go new file mode 100644 index 00000000000..01db772a30a --- /dev/null +++ b/libcontainer/intelrdt/intelrdt.go @@ -0,0 +1,545 @@ +// +build linux + +package intelrdt + +import ( + "bufio" + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strconv" + "strings" + "sync" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +/* + * About Intel RDT/CAT feature: + * Intel platforms with new Xeon CPU support Resource Director Technology (RDT). + * Intel Cache Allocation Technology (CAT) is a sub-feature of RDT. Currently L3 + * Cache is the only resource that is supported in RDT. + * + * This feature provides a way for the software to restrict cache allocation to a + * defined 'subset' of L3 cache which may be overlapping with other 'subsets'. + * The different subsets are identified by class of service (CLOS) and each CLOS + * has a capacity bitmask (CBM). + * + * For more information about Intel RDT/CAT can be found in the section 17.17 + * of Intel Software Developer Manual. + * + * About Intel RDT/CAT kernel interface: + * In Linux 4.10 kernel or newer, the interface is defined and exposed via + * "resource control" filesystem, which is a "cgroup-like" interface. + * + * Comparing with cgroups, it has similar process management lifecycle and + * interfaces in a container. But unlike cgroups' hierarchy, it has single level + * filesystem layout. + * + * Intel RDT "resource control" filesystem hierarchy: + * mount -t resctrl resctrl /sys/fs/resctrl + * tree /sys/fs/resctrl + * /sys/fs/resctrl/ + * |-- info + * | |-- L3 + * | |-- cbm_mask + * | |-- min_cbm_bits + * | |-- num_closids + * |-- cpus + * |-- schemata + * |-- tasks + * |-- + * |-- cpus + * |-- schemata + * |-- tasks + * + * For runc, we can make use of `tasks` and `schemata` configuration for L3 cache + * resource constraints. + * + * The file `tasks` has a list of tasks that belongs to this group (e.g., + * " group). Tasks can be added to a group by writing the task ID + * to the "tasks" file (which will automatically remove them from the previous + * group to which they belonged). New tasks created by fork(2) and clone(2) are + * added to the same group as their parent. If a pid is not in any sub group, it is + * in root group. + * + * The file `schemata` has allocation bitmasks/values for L3 cache on each socket, + * which contains L3 cache id and capacity bitmask (CBM). + * Format: "L3:=;=;..." + * For example, on a two-socket machine, L3's schema line could be `L3:0=ff;1=c0` + * which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0. + * + * The valid L3 cache CBM is a *contiguous bits set* and number of bits that can + * be set is less than the max bit. The max bits in the CBM is varied among + * supported Intel Xeon platforms. In Intel RDT "resource control" filesystem + * layout, the CBM in a group should be a subset of the CBM in root. Kernel will + * check if it is valid when writing. e.g., 0xfffff in root indicates the max bits + * of CBM is 20 bits, which mapping to entire L3 cache capacity. Some valid CBM + * values to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc. + * + * For more information about Intel RDT/CAT kernel interface: + * https://www.kernel.org/doc/Documentation/x86/intel_rdt_ui.txt + * + * An example for runc: + * Consider a two-socket machine with two L3 caches where the default CBM is + * 0xfffff and the max CBM length is 20 bits. With this configuration, tasks + * inside the container only have access to the "upper" 80% of L3 cache id 0 and + * the "lower" 50% L3 cache id 1: + * + * "linux": { + * "intelRdt": { + * "l3CacheSchema": "L3:0=ffff0;1=3ff" + * } + * } + */ + +type Manager interface { + // Applies Intel RDT configuration to the process with the specified pid + Apply(pid int) error + + // Returns statistics for Intel RDT + GetStats() (*Stats, error) + + // Destroys the Intel RDT 'container_id' group + Destroy() error + + // Returns Intel RDT path to save in a state file and to be able to + // restore the object later + GetPath() string + + // Set Intel RDT "resource control" filesystem as configured. + Set(container *configs.Config) error +} + +// This implements interface Manager +type IntelRdtManager struct { + mu sync.Mutex + Config *configs.Config + Id string + Path string +} + +const ( + IntelRdtTasks = "tasks" +) + +var ( + // The absolute root path of the Intel RDT "resource control" filesystem + intelRdtRoot string + intelRdtRootLock sync.Mutex + + // The flag to indicate if Intel RDT is supported + isIntelRdtEnabled bool +) + +type intelRdtData struct { + root string + config *configs.Config + pid int +} + +// Return the mount point path of Intel RDT "resource control" filesysem +func findIntelRdtMountpointDir() (string, error) { + f, err := os.Open("/proc/self/mountinfo") + if err != nil { + return "", err + } + defer f.Close() + + s := bufio.NewScanner(f) + for s.Scan() { + text := s.Text() + fields := strings.Split(text, " ") + // Safe as mountinfo encodes mountpoints with spaces as \040. + index := strings.Index(text, " - ") + postSeparatorFields := strings.Fields(text[index+3:]) + numPostFields := len(postSeparatorFields) + + // This is an error as we can't detect if the mount is for "Intel RDT" + if numPostFields == 0 { + return "", fmt.Errorf("Found no fields post '-' in %q", text) + } + + if postSeparatorFields[0] == "resctrl" { + // Check that the mount is properly formated. + if numPostFields < 3 { + return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text) + } + + return fields[4], nil + } + } + if err := s.Err(); err != nil { + return "", err + } + + return "", NewNotFoundError("Intel RDT") +} + +// Gets the root path of Intel RDT "resource control" filesystem +func getIntelRdtRoot() (string, error) { + intelRdtRootLock.Lock() + defer intelRdtRootLock.Unlock() + + if intelRdtRoot != "" { + return intelRdtRoot, nil + } + + root, err := findIntelRdtMountpointDir() + if err != nil { + return "", err + } + + if _, err := os.Stat(root); err != nil { + return "", err + } + + intelRdtRoot = root + return intelRdtRoot, nil +} + +func isIntelRdtMounted() bool { + _, err := getIntelRdtRoot() + if err != nil { + return false + } + + return true +} + +func parseCpuInfoFile(path string) (bool, error) { + f, err := os.Open(path) + if err != nil { + return false, err + } + defer f.Close() + + s := bufio.NewScanner(f) + for s.Scan() { + if err := s.Err(); err != nil { + return false, err + } + + text := s.Text() + flags := strings.Split(text, " ") + + // "cat_l3" flag is set if Intel RDT/CAT is supported + for _, flag := range flags { + if flag == "cat_l3" { + return true, nil + } + } + } + return false, nil +} + +func parseUint(s string, base, bitSize int) (uint64, error) { + value, err := strconv.ParseUint(s, base, bitSize) + if err != nil { + intValue, intErr := strconv.ParseInt(s, base, bitSize) + // 1. Handle negative values greater than MinInt64 (and) + // 2. Handle negative values lesser than MinInt64 + if intErr == nil && intValue < 0 { + return 0, nil + } else if intErr != nil && intErr.(*strconv.NumError).Err == strconv.ErrRange && intValue < 0 { + return 0, nil + } + + return value, err + } + + return value, nil +} + +// Gets a single uint64 value from the specified file. +func getIntelRdtParamUint(path, file string) (uint64, error) { + fileName := filepath.Join(path, file) + contents, err := ioutil.ReadFile(fileName) + if err != nil { + return 0, err + } + + res, err := parseUint(strings.TrimSpace(string(contents)), 10, 64) + if err != nil { + return res, fmt.Errorf("unable to parse %q as a uint from file %q", string(contents), fileName) + } + return res, nil +} + +// Gets a string value from the specified file +func getIntelRdtParamString(path, file string) (string, error) { + contents, err := ioutil.ReadFile(filepath.Join(path, file)) + if err != nil { + return "", err + } + + return strings.TrimSpace(string(contents)), nil +} + +func readTasksFile(dir string) ([]int, error) { + f, err := os.Open(filepath.Join(dir, IntelRdtTasks)) + if err != nil { + return nil, err + } + defer f.Close() + + var ( + s = bufio.NewScanner(f) + out = []int{} + ) + + for s.Scan() { + if t := s.Text(); t != "" { + pid, err := strconv.Atoi(t) + if err != nil { + return nil, err + } + out = append(out, pid) + } + } + return out, nil +} + +func writeFile(dir, file, data string) error { + if dir == "" { + return fmt.Errorf("no such directory for %s", file) + } + if err := ioutil.WriteFile(filepath.Join(dir, file), []byte(data+"\n"), 0700); err != nil { + return fmt.Errorf("failed to write %v to %v: %v", data, file, err) + } + return nil +} + +func getIntelRdtData(c *configs.Config, pid int) (*intelRdtData, error) { + rootPath, err := getIntelRdtRoot() + if err != nil { + return nil, err + } + return &intelRdtData{ + root: rootPath, + config: c, + pid: pid, + }, nil +} + +// Get the read-only L3 cache information +func getL3CacheInfo() (*L3CacheInfo, error) { + l3CacheInfo := &L3CacheInfo{} + + rootPath, err := getIntelRdtRoot() + if err != nil { + return l3CacheInfo, err + } + + path := filepath.Join(rootPath, "info", "L3") + cbmMask, err := getIntelRdtParamString(path, "cbm_mask") + if err != nil { + return l3CacheInfo, err + } + minCbmBits, err := getIntelRdtParamUint(path, "min_cbm_bits") + if err != nil { + return l3CacheInfo, err + } + numClosids, err := getIntelRdtParamUint(path, "num_closids") + if err != nil { + return l3CacheInfo, err + } + + l3CacheInfo.CbmMask = cbmMask + l3CacheInfo.MinCbmBits = minCbmBits + l3CacheInfo.NumClosids = numClosids + + return l3CacheInfo, nil +} + +// WriteIntelRdtTasks writes the specified pid into the "tasks" file +func WriteIntelRdtTasks(dir string, pid int) error { + if dir == "" { + return fmt.Errorf("no such directory for %s", IntelRdtTasks) + } + + // Dont attach any pid if -1 is specified as a pid + if pid != -1 { + if err := ioutil.WriteFile(filepath.Join(dir, IntelRdtTasks), []byte(strconv.Itoa(pid)), 0700); err != nil { + return fmt.Errorf("failed to write %v to %v: %v", pid, IntelRdtTasks, err) + } + } + return nil +} + +// Check if Intel RDT is enabled +func IsIntelRdtEnabled() bool { + // We have checked the flag before + if isIntelRdtEnabled { + return true + } + + // 1. Check if hardware and kernel support Intel RDT/CAT feature + // "cat_l3" flag is set if supported + isFlagSet, err := parseCpuInfoFile("/proc/cpuinfo") + if !isFlagSet || err != nil { + isIntelRdtEnabled = false + return false + } + + // 2. Check if Intel RDT "resource control" filesystem is mounted + // The user guarantees to mount the filesystem + isIntelRdtEnabled = isIntelRdtMounted() + return isIntelRdtEnabled +} + +// Get the 'container_id' path in Intel RDT "resource control" filesystem +func GetIntelRdtPath(id string) (string, error) { + rootPath, err := getIntelRdtRoot() + if err != nil { + return "", err + } + + path := filepath.Join(rootPath, id) + return path, nil +} + +// Applies Intel RDT configuration to the process with the specified pid +func (m *IntelRdtManager) Apply(pid int) (err error) { + d, err := getIntelRdtData(m.Config, pid) + if err != nil && !IsNotFound(err) { + return err + } + + m.mu.Lock() + defer m.mu.Unlock() + path, err := d.join(m.Id) + if err != nil { + return err + } + + m.Path = path + return nil +} + +// Destroys the Intel RDT 'container_id' group +func (m *IntelRdtManager) Destroy() error { + m.mu.Lock() + defer m.mu.Unlock() + if err := os.RemoveAll(m.Path); err != nil { + return err + } + m.Path = "" + return nil +} + +// Returns Intel RDT path to save in a state file and to be able to +// restore the object later +func (m *IntelRdtManager) GetPath() string { + if m.Path == "" { + m.Path, _ = GetIntelRdtPath(m.Id) + } + return m.Path +} + +// Returns statistics for Intel RDT +func (m *IntelRdtManager) GetStats() (*Stats, error) { + m.mu.Lock() + defer m.mu.Unlock() + stats := NewStats() + + // The read-only L3 cache information + l3CacheInfo, err := getL3CacheInfo() + if err != nil { + return nil, err + } + stats.L3CacheInfo = l3CacheInfo + + // The read-only L3 cache schema in root + rootPath, err := getIntelRdtRoot() + if err != nil { + return nil, err + } + tmpRootStrings, err := getIntelRdtParamString(rootPath, "schemata") + if err != nil { + return nil, err + } + // L3 cache schema is in the first line + schemaRootStrings := strings.Split(tmpRootStrings, "\n") + stats.L3CacheSchemaRoot = schemaRootStrings[0] + + // The L3 cache schema in 'container_id' group + tmpStrings, err := getIntelRdtParamString(m.GetPath(), "schemata") + if err != nil { + return nil, err + } + // L3 cache schema is in the first line + schemaStrings := strings.Split(tmpStrings, "\n") + stats.L3CacheSchema = schemaStrings[0] + + return stats, nil +} + +// Set Intel RDT "resource control" filesystem as configured. +func (m *IntelRdtManager) Set(container *configs.Config) error { + path := m.GetPath() + + // About L3 cache schema file: + // The schema has allocation masks/values for L3 cache on each socket, + // which contains L3 cache id and capacity bitmask (CBM). + // Format: "L3:=;=;..." + // For example, on a two-socket machine, L3's schema line could be: + // L3:0=ff;1=c0 + // Which means L3 cache id 0's CBM is 0xff, and L3 cache id 1's CBM is 0xc0. + // + // About L3 cache CBM validity: + // The valid L3 cache CBM is a *contiguous bits set* and number of + // bits that can be set is less than the max bit. The max bits in the + // CBM is varied among supported Intel Xeon platforms. In Intel RDT + // "resource control" filesystem layout, the CBM in a group should + // be a subset of the CBM in root. Kernel will check if it is valid + // when writing. + // e.g., 0xfffff in root indicates the max bits of CBM is 20 bits, + // which mapping to entire L3 cache capacity. Some valid CBM values + // to set in a group: 0xf, 0xf0, 0x3ff, 0x1f00 and etc. + if container.IntelRdt != nil { + l3CacheSchema := container.IntelRdt.L3CacheSchema + if l3CacheSchema != "" { + if err := writeFile(path, "schemata", l3CacheSchema); err != nil { + return err + } + } + } + + return nil +} + +func (raw *intelRdtData) join(id string) (string, error) { + path := filepath.Join(raw.root, id) + if err := os.MkdirAll(path, 0755); err != nil { + return "", err + } + + if err := WriteIntelRdtTasks(path, raw.pid); err != nil { + return "", err + } + return path, nil +} + +type NotFoundError struct { + ResourceControl string +} + +func (e *NotFoundError) Error() string { + return fmt.Sprintf("mountpoint for %s not found", e.ResourceControl) +} + +func NewNotFoundError(res string) error { + return &NotFoundError{ + ResourceControl: res, + } +} + +func IsNotFound(err error) bool { + if err == nil { + return false + } + _, ok := err.(*NotFoundError) + return ok +} diff --git a/libcontainer/intelrdt/intelrdt_test.go b/libcontainer/intelrdt/intelrdt_test.go new file mode 100644 index 00000000000..8e4081640d9 --- /dev/null +++ b/libcontainer/intelrdt/intelrdt_test.go @@ -0,0 +1,46 @@ +// +build linux + +package intelrdt + +import ( + "strings" + "testing" +) + +func TestIntelRdtSetL3CacheSchema(t *testing.T) { + if !IsIntelRdtEnabled() { + return + } + + helper := NewIntelRdtTestUtil(t) + defer helper.cleanup() + + const ( + l3CacheSchemaBefore = "L3:0=f;1=f0" + l3CacheSchemeAfter = "L3:0=f0;1=f" + ) + + helper.writeFileContents(map[string]string{ + "schemata": l3CacheSchemaBefore + "\n", + }) + + helper.IntelRdtData.config.IntelRdt.L3CacheSchema = l3CacheSchemeAfter + intelrdt := &IntelRdtManager{ + Config: helper.IntelRdtData.config, + Path: helper.IntelRdtPath, + } + if err := intelrdt.Set(helper.IntelRdtData.config); err != nil { + t.Fatal(err) + } + + tmpStrings, err := getIntelRdtParamString(helper.IntelRdtPath, "schemata") + if err != nil { + t.Fatalf("Failed to parse file 'schemata' - %s", err) + } + values := strings.Split(tmpStrings, "\n") + value := values[0] + + if value != l3CacheSchemeAfter { + t.Fatal("Got the wrong value, set 'schemata' failed.") + } +} diff --git a/libcontainer/intelrdt/stats.go b/libcontainer/intelrdt/stats.go new file mode 100644 index 00000000000..095c0a380cd --- /dev/null +++ b/libcontainer/intelrdt/stats.go @@ -0,0 +1,24 @@ +// +build linux + +package intelrdt + +type L3CacheInfo struct { + CbmMask string `json:"cbm_mask,omitempty"` + MinCbmBits uint64 `json:"min_cbm_bits,omitempty"` + NumClosids uint64 `json:"num_closids,omitempty"` +} + +type Stats struct { + // The read-only L3 cache information + L3CacheInfo *L3CacheInfo `json:"l3_cache_info,omitempty"` + + // The read-only L3 cache schema in root + L3CacheSchemaRoot string `json:"l3_cache_schema_root,omitempty"` + + // The L3 cache schema in 'container_id' group + L3CacheSchema string `json:"l3_cache_schema,omitempty"` +} + +func NewStats() *Stats { + return &Stats{} +} diff --git a/libcontainer/intelrdt/util_test.go b/libcontainer/intelrdt/util_test.go new file mode 100644 index 00000000000..970b6ce360e --- /dev/null +++ b/libcontainer/intelrdt/util_test.go @@ -0,0 +1,67 @@ +// +build linux + +/* + * Utility for testing Intel RDT operations. + * Creates a mock of the Intel RDT "resource control" filesystem for the duration of the test. + */ +package intelrdt + +import ( + "io/ioutil" + "os" + "path/filepath" + "testing" + + "github.com/opencontainers/runc/libcontainer/configs" +) + +type intelRdtTestUtil struct { + // intelRdt data to use in tests + IntelRdtData *intelRdtData + + // Path to the mock Intel RDT "resource control" filesystem directory + IntelRdtPath string + + // Temporary directory to store mock Intel RDT "resource control" filesystem + tempDir string + t *testing.T +} + +// Creates a new test util +func NewIntelRdtTestUtil(t *testing.T) *intelRdtTestUtil { + d := &intelRdtData{ + config: &configs.Config{ + IntelRdt: &configs.IntelRdt{}, + }, + } + tempDir, err := ioutil.TempDir("", "intelrdt_test") + if err != nil { + t.Fatal(err) + } + d.root = tempDir + testIntelRdtPath := filepath.Join(d.root, "resctrl") + if err != nil { + t.Fatal(err) + } + + // Ensure the full mock Intel RDT "resource control" filesystem path exists + err = os.MkdirAll(testIntelRdtPath, 0755) + if err != nil { + t.Fatal(err) + } + return &intelRdtTestUtil{IntelRdtData: d, IntelRdtPath: testIntelRdtPath, tempDir: tempDir, t: t} +} + +func (c *intelRdtTestUtil) cleanup() { + os.RemoveAll(c.tempDir) +} + +// Write the specified contents on the mock of the specified Intel RDT "resource control" files +func (c *intelRdtTestUtil) writeFileContents(fileContents map[string]string) { + for file, contents := range fileContents { + err := writeFile(c.IntelRdtPath, file, contents) + if err != nil { + c.t.Fatal(err) + } + } +} diff --git a/libcontainer/process_linux.go b/libcontainer/process_linux.go index 50f9af574c4..a526c92f737 100644 --- a/libcontainer/process_linux.go +++ b/libcontainer/process_linux.go @@ -15,6 +15,7 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/system" "github.com/opencontainers/runc/libcontainer/utils" @@ -49,6 +50,7 @@ type setnsProcess struct { parentPipe *os.File childPipe *os.File cgroupPaths map[string]string + intelRdtPath string config *initConfig fds []string process *Process @@ -89,6 +91,15 @@ func (p *setnsProcess) start() (err error) { return newSystemErrorWithCausef(err, "adding pid %d to cgroups", p.pid()) } } + if p.intelRdtPath != "" { + // if Intel RDT "resource control" filesystem path exists + _, err := os.Stat(p.intelRdtPath) + if err == nil { + if err := intelrdt.WriteIntelRdtTasks(p.intelRdtPath, p.pid()); err != nil { + return newSystemErrorWithCausef(err, "adding pid %d to Intel RDT resource control filesystem", p.pid()) + } + } + } // set rlimits, this has to be done here because we lose permissions // to raise the limits once we enter a user-namespace if err := setupRlimits(p.config.Rlimits, p.pid()); err != nil { @@ -193,16 +204,17 @@ func (p *setnsProcess) setExternalDescriptors(newFds []string) { } type initProcess struct { - cmd *exec.Cmd - parentPipe *os.File - childPipe *os.File - config *initConfig - manager cgroups.Manager - container *linuxContainer - fds []string - process *Process - bootstrapData io.Reader - sharePidns bool + cmd *exec.Cmd + parentPipe *os.File + childPipe *os.File + config *initConfig + manager cgroups.Manager + intelRdtManager intelrdt.Manager + container *linuxContainer + fds []string + process *Process + bootstrapData io.Reader + sharePidns bool } func (p *initProcess) pid() int { @@ -281,10 +293,18 @@ func (p *initProcess) start() error { if err := p.manager.Apply(p.pid()); err != nil { return newSystemErrorWithCause(err, "applying cgroup configuration for process") } + if p.intelRdtManager != nil { + if err := p.intelRdtManager.Apply(p.pid()); err != nil { + return newSystemErrorWithCause(err, "applying Intel RDT configuration for process") + } + } defer func() { if err != nil { // TODO: should not be the responsibility to call here p.manager.Destroy() + if p.intelRdtManager != nil { + p.intelRdtManager.Destroy() + } } }() if err := p.createNetworkInterfaces(); err != nil { @@ -312,6 +332,11 @@ func (p *initProcess) start() error { if err := p.manager.Set(p.config.Config); err != nil { return newSystemErrorWithCause(err, "setting cgroup config for ready process") } + if p.intelRdtManager != nil { + if err := p.intelRdtManager.Set(p.config.Config); err != nil { + return newSystemErrorWithCause(err, "setting Intel RDT config for ready process") + } + } if p.config.Config.Hooks != nil { s := configs.HookState{ @@ -337,6 +362,11 @@ func (p *initProcess) start() error { if err := p.manager.Set(p.config.Config); err != nil { return newSystemErrorWithCause(err, "setting cgroup config for procHooks process") } + if p.intelRdtManager != nil { + if err := p.intelRdtManager.Set(p.config.Config); err != nil { + return newSystemErrorWithCause(err, "setting Intel RDT config for procHooks process") + } + } if p.config.Config.Hooks != nil { s := configs.HookState{ Version: p.container.config.Version, diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index 8c3ba00f075..6d3990890ff 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -250,6 +250,12 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) { } createHooks(spec, config) config.Version = specs.Version + if spec.Linux.IntelRdt != nil { + config.IntelRdt = &configs.IntelRdt{} + if spec.Linux.IntelRdt.L3CacheSchema != "" { + config.IntelRdt.L3CacheSchema = spec.Linux.IntelRdt.L3CacheSchema + } + } return config, nil } diff --git a/libcontainer/state_linux.go b/libcontainer/state_linux.go index 44fa6b43a8d..1f8c5e71e41 100644 --- a/libcontainer/state_linux.go +++ b/libcontainer/state_linux.go @@ -45,6 +45,11 @@ func destroy(c *linuxContainer) error { } } err := c.cgroupManager.Destroy() + if c.intelRdtManager != nil { + if ierr := c.intelRdtManager.Destroy(); err == nil { + err = ierr + } + } if rerr := os.RemoveAll(c.root); err == nil { err = rerr } diff --git a/libcontainer/stats_linux.go b/libcontainer/stats_linux.go index c629dc67de9..29fd641e9dd 100644 --- a/libcontainer/stats_linux.go +++ b/libcontainer/stats_linux.go @@ -1,8 +1,10 @@ package libcontainer import "github.com/opencontainers/runc/libcontainer/cgroups" +import "github.com/opencontainers/runc/libcontainer/intelrdt" type Stats struct { - Interfaces []*NetworkInterface - CgroupStats *cgroups.Stats + Interfaces []*NetworkInterface + CgroupStats *cgroups.Stats + IntelRdtStats *intelrdt.Stats } diff --git a/utils_linux.go b/utils_linux.go index 6f6ff58e1cb..6da9171c02e 100644 --- a/utils_linux.go +++ b/utils_linux.go @@ -13,6 +13,7 @@ import ( "github.com/opencontainers/runc/libcontainer" "github.com/opencontainers/runc/libcontainer/cgroups/systemd" "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/intelrdt" "github.com/opencontainers/runc/libcontainer/specconv" "github.com/opencontainers/runc/libcontainer/utils" "github.com/opencontainers/runtime-spec/specs-go" @@ -42,6 +43,10 @@ func loadFactory(context *cli.Context) (libcontainer.Factory, error) { return nil, fmt.Errorf("systemd cgroup flag passed, but systemd support for managing cgroups is not available") } } + if intelrdt.IsIntelRdtEnabled() { + intelRdtManager := libcontainer.IntelRdtFs + return libcontainer.New(abs, cgroupManager, intelRdtManager, libcontainer.CriuPath(context.GlobalString("criu"))) + } return libcontainer.New(abs, cgroupManager, libcontainer.CriuPath(context.GlobalString("criu"))) }