
Commit 888ff40

Merge pull request kubernetes#73651 from RobertKrawitz/node_pids_limit
Support total process ID limiting for nodes
2 parents 508a4f7 + 2597a1d commit 888ff40

19 files changed: +211 -41 lines
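
Taken together, the change discovers the node's total process-ID capacity at kubelet startup, lets --system-reserved and --kube-reserved carve out a pid= share of it behind the new SupportNodePidsLimit alpha gate, and applies what is left as the pids limit on the node allocatable cgroup when "pods" enforcement is enabled. A standalone sketch of that arithmetic follows; reading /proc/sys/kernel/pid_max and the reservation values are assumptions for illustration (the kubelet gets the capacity from the pkg/kubelet/stats/pidlimit helper, which is not shown in this excerpt):

package main

import (
	"fmt"
	"os"
	"strconv"
	"strings"
)

func main() {
	// Node PID capacity. Reading /proc/sys/kernel/pid_max is an assumption made
	// for this sketch; the kubelet uses pkg/kubelet/stats/pidlimit instead.
	raw, err := os.ReadFile("/proc/sys/kernel/pid_max")
	if err != nil {
		fmt.Fprintln(os.Stderr, "reading pid_max:", err)
		os.Exit(1)
	}
	maxPID, err := strconv.ParseInt(strings.TrimSpace(string(raw)), 10, 64)
	if err != nil {
		fmt.Fprintln(os.Stderr, "parsing pid_max:", err)
		os.Exit(1)
	}

	// Hypothetical reservations, e.g. --system-reserved=pid=1000 --kube-reserved=pid=1000.
	systemReservedPIDs := int64(1000)
	kubeReservedPIDs := int64(1000)

	// Mirrors getNodeAllocatableAbsoluteImpl below: allocatable = capacity - reservations.
	// This is roughly what would be enforced on the node allocatable cgroup when
	// --enforce-node-allocatable=pods is set and the gate is enabled.
	allocatablePIDs := maxPID - systemReservedPIDs - kubeReservedPIDs
	fmt.Printf("pid capacity=%d, node allocatable pids limit=%d\n", maxPID, allocatablePIDs)
}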

cmd/kubelet/app/BUILD

Lines changed: 1 addition & 0 deletions
@@ -68,6 +68,7 @@ go_library(
         "//pkg/kubelet/kubeletconfig/configfiles:go_default_library",
         "//pkg/kubelet/server:go_default_library",
         "//pkg/kubelet/server/streaming:go_default_library",
+        "//pkg/kubelet/stats/pidlimit:go_default_library",
         "//pkg/kubelet/types:go_default_library",
         "//pkg/util/configz:go_default_library",
         "//pkg/util/filesystem:go_default_library",

cmd/kubelet/app/options/options.go

Lines changed: 2 additions & 2 deletions
@@ -590,8 +590,8 @@ func AddKubeletConfigFlags(mainfs *pflag.FlagSet, c *kubeletconfig.KubeletConfig
 	fs.BoolVar(&c.ProtectKernelDefaults, "protect-kernel-defaults", c.ProtectKernelDefaults, "Default kubelet behaviour for kernel tuning. If set, kubelet errors if any of kernel tunables is different than kubelet defaults.")
 
 	// Node Allocatable Flags
-	fs.Var(flag.NewMapStringString(&c.SystemReserved), "system-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=500Mi,ephemeral-storage=1Gi) pairs that describe resources reserved for non-kubernetes components. Currently only cpu and memory are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]")
-	fs.Var(flag.NewMapStringString(&c.KubeReserved), "kube-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=500Mi,ephemeral-storage=1Gi) pairs that describe resources reserved for kubernetes system components. Currently cpu, memory and local ephemeral storage for root file system are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]")
+	fs.Var(flag.NewMapStringString(&c.SystemReserved), "system-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=500Mi,ephemeral-storage=1Gi,pid=100) pairs that describe resources reserved for non-kubernetes components. Currently only cpu, memory, and pid (process IDs) are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]")
+	fs.Var(flag.NewMapStringString(&c.KubeReserved), "kube-reserved", "A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=500Mi,ephemeral-storage=1Gi,pid=100) pairs that describe resources reserved for kubernetes system components. Currently cpu, memory, local ephemeral storage for root file system, and pid (process IDs) are supported. See http://kubernetes.io/docs/user-guide/compute-resources for more detail. [default=none]")
 	fs.StringSliceVar(&c.EnforceNodeAllocatable, "enforce-node-allocatable", c.EnforceNodeAllocatable, "A comma separated list of levels of node allocatable enforcement to be enforced by kubelet. Acceptable options are 'none', 'pods', 'system-reserved', and 'kube-reserved'. If the latter two options are specified, '--system-reserved-cgroup' and '--kube-reserved-cgroup' must also be set, respectively. If 'none' is specified, no additional options should be set. See https://kubernetes.io/docs/tasks/administer-cluster/reserve-compute-resources/ for more details.")
 	fs.StringVar(&c.SystemReservedCgroup, "system-reserved-cgroup", c.SystemReservedCgroup, "Absolute name of the top level cgroup that is used to manage non-kubernetes components for which compute resources were reserved via '--system-reserved' flag. Ex. '/system-reserved'. [default='']")
 	fs.StringVar(&c.KubeReservedCgroup, "kube-reserved-cgroup", c.KubeReservedCgroup, "Absolute name of the top level cgroup that is used to manage kubernetes components for which compute resources were reserved via '--kube-reserved' flag. Ex. '/kube-reserved'. [default='']")

cmd/kubelet/app/server.go

Lines changed: 12 additions & 9 deletions
@@ -82,6 +82,7 @@ import (
 	"k8s.io/kubernetes/pkg/kubelet/kubeletconfig/configfiles"
 	"k8s.io/kubernetes/pkg/kubelet/server"
 	"k8s.io/kubernetes/pkg/kubelet/server/streaming"
+	"k8s.io/kubernetes/pkg/kubelet/stats/pidlimit"
 	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
 	"k8s.io/kubernetes/pkg/util/configz"
 	utilfs "k8s.io/kubernetes/pkg/util/filesystem"
@@ -1152,16 +1153,18 @@ func parseResourceList(m map[string]string) (v1.ResourceList, error) {
 	rl := make(v1.ResourceList)
 	for k, v := range m {
 		switch v1.ResourceName(k) {
-		// CPU, memory and local storage resources are supported.
-		case v1.ResourceCPU, v1.ResourceMemory, v1.ResourceEphemeralStorage:
-			q, err := resource.ParseQuantity(v)
-			if err != nil {
-				return nil, err
-			}
-			if q.Sign() == -1 {
-				return nil, fmt.Errorf("resource quantity for %q cannot be negative: %v", k, v)
+		// CPU, memory, local storage, and PID resources are supported.
+		case v1.ResourceCPU, v1.ResourceMemory, v1.ResourceEphemeralStorage, pidlimit.PIDs:
+			if v1.ResourceName(k) != pidlimit.PIDs || utilfeature.DefaultFeatureGate.Enabled(features.SupportNodePidsLimit) {
+				q, err := resource.ParseQuantity(v)
+				if err != nil {
+					return nil, err
+				}
+				if q.Sign() == -1 {
+					return nil, fmt.Errorf("resource quantity for %q cannot be negative: %v", k, v)
+				}
+				rl[v1.ResourceName(k)] = q
 			}
-			rl[v1.ResourceName(k)] = q
 		default:
 			return nil, fmt.Errorf("cannot reserve %q resource", k)
 		}
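
The parseResourceList hunk above accepts a process-ID entry in the reserved-resource maps only while the SupportNodePidsLimit gate is enabled; with the gate off the entry is skipped rather than rejected. A rough standalone sketch of that behaviour, with the resource key written as the literal "pid" and the gate modelled as a plain bool (both are assumptions standing in for pidlimit.PIDs and the real feature-gate machinery; the closed set of allowed resource names is omitted):

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

// parseReserved mimics the gate check above: a "pid" entry is honoured only
// when nodePidsLimitEnabled is true, and is silently dropped otherwise.
func parseReserved(m map[string]string, nodePidsLimitEnabled bool) (map[string]resource.Quantity, error) {
	rl := map[string]resource.Quantity{}
	for k, v := range m {
		if k == "pid" && !nodePidsLimitEnabled {
			continue // gate off: the reservation is ignored, not rejected
		}
		q, err := resource.ParseQuantity(v)
		if err != nil {
			return nil, err
		}
		if q.Sign() == -1 {
			return nil, fmt.Errorf("resource quantity for %q cannot be negative: %v", k, v)
		}
		rl[k] = q
	}
	return rl, nil
}

func main() {
	// e.g. --kube-reserved=cpu=200m,memory=500Mi,pid=100 after flag splitting
	reserved := map[string]string{"cpu": "200m", "memory": "500Mi", "pid": "100"}
	fmt.Println(parseReserved(reserved, true))  // pid kept
	fmt.Println(parseReserved(reserved, false)) // pid ignored
}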

pkg/features/kube_features.go

Lines changed: 7 additions & 0 deletions
@@ -405,6 +405,12 @@ const (
 	//
 	// Enables the AWS EBS in-tree driver to AWS EBS CSI Driver migration feature.
 	CSIMigrationAWS utilfeature.Feature = "CSIMigrationAWS"
+
+	// owner: @RobertKrawitz
+	// alpha: v1.14
+	//
+	// Implement support for limiting pids in nodes
+	SupportNodePidsLimit utilfeature.Feature = "SupportNodePidsLimit"
 )
 
 func init() {
@@ -450,6 +456,7 @@ var defaultKubernetesFeatureGates = map[utilfeature.Feature]utilfeature.FeatureS
 	ResourceLimitsPriorityFunction: {Default: false, PreRelease: utilfeature.Alpha},
 	SupportIPVSProxyMode: {Default: true, PreRelease: utilfeature.GA},
 	SupportPodPidsLimit: {Default: true, PreRelease: utilfeature.Beta},
+	SupportNodePidsLimit: {Default: false, PreRelease: utilfeature.Alpha},
 	HyperVContainer: {Default: false, PreRelease: utilfeature.Alpha},
 	ScheduleDaemonSetPods: {Default: true, PreRelease: utilfeature.Beta},
 	TokenRequest: {Default: true, PreRelease: utilfeature.Beta},

pkg/kubelet/apis/config/types.go

Lines changed: 2 additions & 2 deletions
@@ -291,12 +291,12 @@ type KubeletConfiguration struct {
 
 	/* the following fields are meant for Node Allocatable */
 
-	// A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G) pairs
+	// A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G,pids=100) pairs
 	// that describe resources reserved for non-kubernetes components.
 	// Currently only cpu and memory are supported.
 	// See http://kubernetes.io/docs/user-guide/compute-resources for more detail.
 	SystemReserved map[string]string
-	// A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G) pairs
+	// A set of ResourceName=ResourceQuantity (e.g. cpu=200m,memory=150G,pids=100) pairs
 	// that describe resources reserved for kubernetes system components.
 	// Currently cpu, memory and local ephemeral storage for root file system are supported.
 	// See http://kubernetes.io/docs/user-guide/compute-resources for more detail.

pkg/kubelet/cm/BUILD

Lines changed: 1 addition & 0 deletions
@@ -73,6 +73,7 @@ go_library(
         "//pkg/kubelet/events:go_default_library",
        "//pkg/kubelet/metrics:go_default_library",
         "//pkg/kubelet/qos:go_default_library",
+        "//pkg/kubelet/stats/pidlimit:go_default_library",
         "//pkg/kubelet/types:go_default_library",
         "//pkg/util/mount:go_default_library",
         "//pkg/util/oom:go_default_library",

pkg/kubelet/cm/cgroup_manager_linux.go

Lines changed: 10 additions & 9 deletions
@@ -257,7 +257,7 @@ func (m *cgroupManagerImpl) Exists(name CgroupName) bool {
 	// in https://github.com/opencontainers/runc/issues/1440
 	// once resolved, we can remove this code.
 	whitelistControllers := sets.NewString("cpu", "cpuacct", "cpuset", "memory", "systemd")
-	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) {
+	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) || utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportNodePidsLimit) {
 		whitelistControllers.Insert("pids")
 	}
 	var missingPaths []string
@@ -325,10 +325,11 @@ func getSupportedSubsystems() map[subsystem]bool {
 	supportedSubsystems := map[subsystem]bool{
 		&cgroupfs.MemoryGroup{}: true,
 		&cgroupfs.CpuGroup{}: true,
+		&cgroupfs.PidsGroup{}: true,
 	}
 	// not all hosts support hugetlb cgroup, and in the absent of hugetlb, we will fail silently by reporting no capacity.
 	supportedSubsystems[&cgroupfs.HugetlbGroup{}] = false
-	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) {
+	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) || utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportNodePidsLimit) {
 		supportedSubsystems[&cgroupfs.PidsGroup{}] = true
 	}
 	return supportedSubsystems
@@ -377,9 +378,9 @@ func (m *cgroupManagerImpl) toResources(resourceConfig *ResourceConfig) *libcont
 	if resourceConfig.CpuPeriod != nil {
 		resources.CpuPeriod = *resourceConfig.CpuPeriod
 	}
-	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) {
-		if resourceConfig.PodPidsLimit != nil {
-			resources.PidsLimit = *resourceConfig.PodPidsLimit
+	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) || utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportNodePidsLimit) {
+		if resourceConfig.PidsLimit != nil {
+			resources.PidsLimit = *resourceConfig.PidsLimit
 		}
 	}
 	// if huge pages are enabled, we set them in libcontainer
@@ -431,8 +432,8 @@ func (m *cgroupManagerImpl) Update(cgroupConfig *CgroupConfig) error {
 		libcontainerCgroupConfig.Path = cgroupConfig.Name.ToCgroupfs()
 	}
 
-	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) && cgroupConfig.ResourceParameters != nil && cgroupConfig.ResourceParameters.PodPidsLimit != nil {
-		libcontainerCgroupConfig.PidsLimit = *cgroupConfig.ResourceParameters.PodPidsLimit
+	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) && cgroupConfig.ResourceParameters != nil && cgroupConfig.ResourceParameters.PidsLimit != nil {
+		libcontainerCgroupConfig.PidsLimit = *cgroupConfig.ResourceParameters.PidsLimit
 	}
 
 	if err := setSupportedSubsystems(libcontainerCgroupConfig); err != nil {
@@ -461,8 +462,8 @@ func (m *cgroupManagerImpl) Create(cgroupConfig *CgroupConfig) error {
 		libcontainerCgroupConfig.Path = cgroupConfig.Name.ToCgroupfs()
 	}
 
-	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) && cgroupConfig.ResourceParameters != nil && cgroupConfig.ResourceParameters.PodPidsLimit != nil {
-		libcontainerCgroupConfig.PidsLimit = *cgroupConfig.ResourceParameters.PodPidsLimit
+	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) && cgroupConfig.ResourceParameters != nil && cgroupConfig.ResourceParameters.PidsLimit != nil {
+		libcontainerCgroupConfig.PidsLimit = *cgroupConfig.ResourceParameters.PidsLimit
 	}
 
 	// get the manager with the specified cgroup configuration

pkg/kubelet/cm/container_manager_linux.go

Lines changed: 14 additions & 0 deletions
@@ -53,6 +53,7 @@ import (
 	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
 	"k8s.io/kubernetes/pkg/kubelet/lifecycle"
 	"k8s.io/kubernetes/pkg/kubelet/qos"
+	"k8s.io/kubernetes/pkg/kubelet/stats/pidlimit"
 	"k8s.io/kubernetes/pkg/kubelet/status"
 	"k8s.io/kubernetes/pkg/kubelet/util/pluginwatcher"
 	schedulernodeinfo "k8s.io/kubernetes/pkg/scheduler/nodeinfo"
@@ -123,6 +124,8 @@ type containerManagerImpl struct {
 	cgroupManager CgroupManager
 	// Capacity of this node.
 	capacity v1.ResourceList
+	// Capacity of this node, including internal resources.
+	internalCapacity v1.ResourceList
 	// Absolute cgroupfs path to a cgroup that Kubelet needs to place all pods under.
 	// This path include a top level container for enforcing Node Allocatable.
 	cgroupRoot CgroupName
@@ -219,6 +222,7 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I
 	}
 
 	var capacity = v1.ResourceList{}
+	var internalCapacity = v1.ResourceList{}
 	// It is safe to invoke `MachineInfo` on cAdvisor before logically initializing cAdvisor here because
 	// machine info is computed and cached once as part of cAdvisor object creation.
 	// But `RootFsInfo` and `ImagesFsInfo` are not available at this moment so they will be called later during manager starts
@@ -227,6 +231,15 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I
 		return nil, err
 	}
 	capacity = cadvisor.CapacityFromMachineInfo(machineInfo)
+	for k, v := range capacity {
+		internalCapacity[k] = v
+	}
+	pidlimits, err := pidlimit.Stats()
+	if err == nil && pidlimits != nil && pidlimits.MaxPID != nil {
+		internalCapacity[pidlimit.PIDs] = *resource.NewQuantity(
+			int64(*pidlimits.MaxPID),
+			resource.DecimalSI)
+	}
 
 	// Turn CgroupRoot from a string (in cgroupfs path format) to internal CgroupName
 	cgroupRoot := ParseCgroupfsToCgroupName(nodeConfig.CgroupRoot)
@@ -264,6 +277,7 @@ func NewContainerManager(mountUtil mount.Interface, cadvisorInterface cadvisor.I
 		subsystems: subsystems,
 		cgroupManager: cgroupManager,
 		capacity: capacity,
+		internalCapacity: internalCapacity,
 		cgroupRoot: cgroupRoot,
 		recorder: recorder,
 		qosContainerManager: qosContainerManager,

pkg/kubelet/cm/node_container_manager_linux.go

Lines changed: 20 additions & 5 deletions
@@ -28,6 +28,7 @@ import (
 	"k8s.io/apimachinery/pkg/types"
 	"k8s.io/klog"
 	"k8s.io/kubernetes/pkg/kubelet/events"
+	"k8s.io/kubernetes/pkg/kubelet/stats/pidlimit"
 	kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
 )
 
@@ -40,7 +41,7 @@ func (cm *containerManagerImpl) createNodeAllocatableCgroups() error {
 	cgroupConfig := &CgroupConfig{
 		Name: cm.cgroupRoot,
 		// The default limits for cpu shares can be very low which can lead to CPU starvation for pods.
-		ResourceParameters: getCgroupConfig(cm.capacity),
+		ResourceParameters: getCgroupConfig(cm.internalCapacity),
 	}
 	if cm.cgroupManager.Exists(cgroupConfig.Name) {
 		return nil
@@ -58,10 +59,10 @@ func (cm *containerManagerImpl) enforceNodeAllocatableCgroups() error {
 
 	// We need to update limits on node allocatable cgroup no matter what because
 	// default cpu shares on cgroups are low and can cause cpu starvation.
-	nodeAllocatable := cm.capacity
+	nodeAllocatable := cm.internalCapacity
 	// Use Node Allocatable limits instead of capacity if the user requested enforcing node allocatable.
 	if cm.CgroupsPerQOS && nc.EnforceNodeAllocatable.Has(kubetypes.NodeAllocatableEnforcementKey) {
-		nodeAllocatable = cm.getNodeAllocatableAbsolute()
+		nodeAllocatable = cm.getNodeAllocatableInternalAbsolute()
 	}
 
 	klog.V(4).Infof("Attempting to enforce Node Allocatable with config: %+v", nc)
@@ -130,7 +131,7 @@ func enforceExistingCgroup(cgroupManager CgroupManager, cName CgroupName, rl v1.
 	if cgroupConfig.ResourceParameters == nil {
 		return fmt.Errorf("%q cgroup is not config properly", cgroupConfig.Name)
 	}
-	klog.V(4).Infof("Enforcing limits on cgroup %q with %d cpu shares and %d bytes of memory", cName, cgroupConfig.ResourceParameters.CpuShares, cgroupConfig.ResourceParameters.Memory)
+	klog.V(4).Infof("Enforcing limits on cgroup %q with %d cpu shares, %d bytes of memory, and %d processes", cName, cgroupConfig.ResourceParameters.CpuShares, cgroupConfig.ResourceParameters.Memory, cgroupConfig.ResourceParameters.PidsLimit)
 	if !cgroupManager.Exists(cgroupConfig.Name) {
 		return fmt.Errorf("%q cgroup does not exist", cgroupConfig.Name)
 	}
@@ -157,6 +158,10 @@ func getCgroupConfig(rl v1.ResourceList) *ResourceConfig {
 		val := MilliCPUToShares(q.MilliValue())
 		rc.CpuShares = &val
 	}
+	if q, exists := rl[pidlimit.PIDs]; exists {
+		val := q.Value()
+		rc.PidsLimit = &val
+	}
 	rc.HugePageLimit = HugePageLimits(rl)
 
 	return &rc
@@ -166,8 +171,12 @@ func getCgroupConfig(rl v1.ResourceList) *ResourceConfig {
 // Note that not all resources that are available on the node are included in the returned list of resources.
 // Returns a ResourceList.
 func (cm *containerManagerImpl) getNodeAllocatableAbsolute() v1.ResourceList {
+	return cm.getNodeAllocatableAbsoluteImpl(cm.capacity)
+}
+
+func (cm *containerManagerImpl) getNodeAllocatableAbsoluteImpl(capacity v1.ResourceList) v1.ResourceList {
 	result := make(v1.ResourceList)
-	for k, v := range cm.capacity {
+	for k, v := range capacity {
 		value := *(v.Copy())
 		if cm.NodeConfig.SystemReserved != nil {
 			value.Sub(cm.NodeConfig.SystemReserved[k])
@@ -182,7 +191,13 @@ func (cm *containerManagerImpl) getNodeAllocatableAbsolute() v1.ResourceList {
 		result[k] = value
 	}
 	return result
+}
 
+// getNodeAllocatableInternalAbsolute is similar to getNodeAllocatableAbsolute except that
+// it also includes internal resources (currently process IDs). It is intended for setting
+// up top level cgroups only.
+func (cm *containerManagerImpl) getNodeAllocatableInternalAbsolute() v1.ResourceList {
+	return cm.getNodeAllocatableAbsoluteImpl(cm.internalCapacity)
 }
 
 // GetNodeAllocatableReservation returns amount of compute or storage resource that have to be reserved on this node from scheduling.
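
The PidsLimit computed in getCgroupConfig above reaches the node allocatable cgroup through the cgroup manager and libcontainer's pids controller; on cgroup v1 that ultimately means a value written to pids.max. A rough illustration of that end state only; the mount point, cgroup name, and limit value are assumptions, and the kubelet itself goes through the cgroup manager rather than writing the file directly:

package main

import (
	"fmt"
	"os"
	"path/filepath"
	"strconv"
)

// setPidsMax writes a pids.max value in the form the cgroup v1 pids controller
// expects: a positive integer, or the literal string "max" for no limit.
func setPidsMax(cgroupDir string, limit int64) error {
	val := "max"
	if limit > 0 {
		val = strconv.FormatInt(limit, 10)
	}
	return os.WriteFile(filepath.Join(cgroupDir, "pids.max"), []byte(val), 0644)
}

func main() {
	// "/sys/fs/cgroup/pids/kubepods" is an assumed mount point and cgroup name
	// for the node allocatable cgroup; adjust for the host's actual layout.
	if err := setPidsMax("/sys/fs/cgroup/pids/kubepods", 28762); err != nil {
		fmt.Fprintln(os.Stderr, "setting pids.max:", err)
	}
}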

pkg/kubelet/cm/pod_container_manager_linux.go

Lines changed: 1 addition & 1 deletion
@@ -87,7 +87,7 @@ func (m *podContainerManagerImpl) EnsureExists(pod *v1.Pod) error {
 		ResourceParameters: ResourceConfigForPod(pod, m.enforceCPULimits, m.cpuCFSQuotaPeriod),
 	}
 	if utilfeature.DefaultFeatureGate.Enabled(kubefeatures.SupportPodPidsLimit) && m.podPidsLimit > 0 {
-		containerConfig.ResourceParameters.PodPidsLimit = &m.podPidsLimit
+		containerConfig.ResourceParameters.PidsLimit = &m.podPidsLimit
 	}
 	if err := m.cgroupManager.Create(containerConfig); err != nil {
 		return fmt.Errorf("failed to create container for %v : %v", podContainerName, err)

0 commit comments
