Skip to content

Commit e6b5194

Browse files
authored
Merge pull request kubernetes#84300 from klueska/upstream-cpu-manager-reconcile-on-container-state
Update logic in `CPUManager` `reconcileState()`
2 parents 37ee642 + 7be9b0f commit e6b5194

File tree

2 files changed

+68
-24
lines changed

2 files changed

+68
-24
lines changed

pkg/kubelet/cm/cpumanager/cpu_manager.go

Lines changed: 53 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,8 @@ func (m *manager) Start(activePods ActivePodsFunc, sourcesReady config.SourcesRe
200200
if m.policy.Name() == string(PolicyNone) {
201201
return nil
202202
}
203+
// Periodically call m.reconcileState() to continue to keep the CPU sets of
204+
// all pods in sync with the guaranteed CPUs handed out among them.
203205
go wait.Until(func() { m.reconcileState() }, m.reconcilePeriod, wait.NeverStop)
204206
return nil
205207
}
@@ -217,19 +219,24 @@ func (m *manager) AddContainer(p *v1.Pod, c *v1.Container, containerID string) e
217219
}
218220
}
219221
}
222+
223+
// Call down into the policy to assign this container CPUs if required.
220224
err := m.policyAddContainer(p, c, containerID)
221225
if err != nil {
222226
klog.Errorf("[cpumanager] AddContainer error: %v", err)
223227
m.Unlock()
224228
return err
225229
}
230+
231+
// Get the CPUs just assigned to the container (or fall back to the default
232+
// CPUSet if none were assigned).
226233
cpus := m.state.GetCPUSetOrDefault(string(p.UID), c.Name)
227234
m.Unlock()
228235

229236
if !cpus.IsEmpty() {
230237
err = m.updateContainerCPUSet(containerID, cpus)
231238
if err != nil {
232-
klog.Errorf("[cpumanager] AddContainer error: %v", err)
239+
klog.Errorf("[cpumanager] AddContainer error: error updating CPUSet for container (pod: %s, container: %s, container id: %s, err: %v)", p.Name, c.Name, containerID, err)
233240
m.Lock()
234241
err := m.policyRemoveContainerByID(containerID)
235242
if err != nil {
@@ -353,43 +360,59 @@ func (m *manager) reconcileState() (success []reconciledContainer, failure []rec
353360

354361
m.removeStaleState()
355362
for _, pod := range m.activePods() {
363+
pstatus, ok := m.podStatusProvider.GetPodStatus(pod.UID)
364+
if !ok {
365+
klog.Warningf("[cpumanager] reconcileState: skipping pod; status not found (pod: %s)", pod.Name)
366+
failure = append(failure, reconciledContainer{pod.Name, "", ""})
367+
continue
368+
}
369+
356370
allContainers := pod.Spec.InitContainers
357371
allContainers = append(allContainers, pod.Spec.Containers...)
358-
status, ok := m.podStatusProvider.GetPodStatus(pod.UID)
359372
for _, container := range allContainers {
360-
if !ok {
361-
klog.Warningf("[cpumanager] reconcileState: skipping pod; status not found (pod: %s)", pod.Name)
373+
containerID, err := findContainerIDByName(&pstatus, container.Name)
374+
if err != nil {
375+
klog.Warningf("[cpumanager] reconcileState: skipping container; ID not found in pod status (pod: %s, container: %s, error: %v)", pod.Name, container.Name, err)
362376
failure = append(failure, reconciledContainer{pod.Name, container.Name, ""})
363-
break
377+
continue
364378
}
365379

366-
containerID, err := findContainerIDByName(&status, container.Name)
380+
cstatus, err := findContainerStatusByName(&pstatus, container.Name)
367381
if err != nil {
368-
klog.Warningf("[cpumanager] reconcileState: skipping container; ID not found in status (pod: %s, container: %s, error: %v)", pod.Name, container.Name, err)
382+
klog.Warningf("[cpumanager] reconcileState: skipping container; container status not found in pod status (pod: %s, container: %s, error: %v)", pod.Name, container.Name, err)
369383
failure = append(failure, reconciledContainer{pod.Name, container.Name, ""})
370384
continue
371385
}
372386

373-
// Check whether container is present in state, there may be 3 reasons why it's not present:
374-
// - policy does not want to track the container
375-
// - kubelet has just been restarted - and there is no previous state file
376-
// - container has been removed from state by RemoveContainer call (DeletionTimestamp is set)
377-
if _, ok := m.state.GetCPUSet(string(pod.UID), container.Name); !ok {
378-
if status.Phase == v1.PodRunning && pod.DeletionTimestamp == nil {
379-
klog.V(4).Infof("[cpumanager] reconcileState: container is not present in state - trying to add (pod: %s, container: %s, container id: %s)", pod.Name, container.Name, containerID)
380-
err := m.AddContainer(pod, &container, containerID)
387+
if cstatus.State.Waiting != nil ||
388+
(cstatus.State.Waiting == nil && cstatus.State.Running == nil && cstatus.State.Terminated == nil) {
389+
klog.Warningf("[cpumanager] reconcileState: skipping container; container still in the waiting state (pod: %s, container: %s, error: %v)", pod.Name, container.Name, err)
390+
failure = append(failure, reconciledContainer{pod.Name, container.Name, ""})
391+
continue
392+
}
393+
394+
if cstatus.State.Terminated != nil {
395+
// Since the container is terminated, we know it is safe to
396+
// remove it without any reconciliation. Removing the container
397+
// will also remove it from the `containerMap` so that this
398+
// container will be skipped next time around the loop.
399+
_, _, err := m.containerMap.GetContainerRef(containerID)
400+
if err == nil {
401+
klog.Warningf("[cpumanager] reconcileState: skipping container; already terminated (pod: %s, container id: %s)", pod.Name, containerID)
402+
err := m.RemoveContainer(containerID)
381403
if err != nil {
382-
klog.Errorf("[cpumanager] reconcileState: failed to add container (pod: %s, container: %s, container id: %s, error: %v)", pod.Name, container.Name, containerID, err)
404+
klog.Errorf("[cpumanager] reconcileState: failed to remove container (pod: %s, container id: %s, error: %v)", pod.Name, containerID, err)
383405
failure = append(failure, reconciledContainer{pod.Name, container.Name, containerID})
384-
continue
385406
}
386-
} else {
387-
// if DeletionTimestamp is set, pod has already been removed from state
388-
// skip the pod/container since it's not running and will be deleted soon
389-
continue
390407
}
408+
continue
391409
}
392410

411+
// Once we make it here we know we have a running container.
412+
// Idempotently add it to the containerMap in case it is missing.
413+
// This can happen after a kubelet restart, for example.
414+
m.containerMap.Add(string(pod.UID), container.Name, containerID)
415+
393416
cset := m.state.GetCPUSetOrDefault(string(pod.UID), container.Name)
394417
if cset.IsEmpty() {
395418
// NOTE: This should not happen outside of tests.
@@ -427,6 +450,15 @@ func findContainerIDByName(status *v1.PodStatus, name string) (string, error) {
427450
return "", fmt.Errorf("unable to find ID for container with name %v in pod status (it may not be running)", name)
428451
}
429452

453+
func findContainerStatusByName(status *v1.PodStatus, name string) (*v1.ContainerStatus, error) {
454+
for _, status := range append(status.InitContainerStatuses, status.ContainerStatuses...) {
455+
if status.Name == name {
456+
return &status, nil
457+
}
458+
}
459+
return nil, fmt.Errorf("unable to find status for container with name %v in pod status (it may not be running)", name)
460+
}
461+
430462
func (m *manager) updateContainerCPUSet(containerID string, cpus cpuset.CPUSet) error {
431463
// TODO: Consider adding a `ResourceConfigForContainer` helper in
432464
// helpers_linux.go similar to what exists for pods.

pkg/kubelet/cm/cpumanager/cpu_manager_test.go

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -702,6 +702,9 @@ func TestReconcileState(t *testing.T) {
702702
{
703703
Name: "fakeContainerName",
704704
ContainerID: "docker://fakeContainerID",
705+
State: v1.ContainerState{
706+
Running: &v1.ContainerStateRunning{},
707+
},
705708
},
706709
},
707710
},
@@ -738,6 +741,9 @@ func TestReconcileState(t *testing.T) {
738741
{
739742
Name: "fakeContainerName",
740743
ContainerID: "docker://fakeContainerID",
744+
State: v1.ContainerState{
745+
Running: &v1.ContainerStateRunning{},
746+
},
741747
},
742748
},
743749
},
@@ -753,7 +759,7 @@ func TestReconcileState(t *testing.T) {
753759
expectFailedContainerName: "",
754760
},
755761
{
756-
description: "cpu manager reconclie - pod status not found",
762+
description: "cpu manager reconcile - pod status not found",
757763
activePods: []*v1.Pod{
758764
{
759765
ObjectMeta: metav1.ObjectMeta{
@@ -775,10 +781,10 @@ func TestReconcileState(t *testing.T) {
775781
stDefaultCPUSet: cpuset.NewCPUSet(),
776782
updateErr: nil,
777783
expectSucceededContainerName: "",
778-
expectFailedContainerName: "fakeContainerName",
784+
expectFailedContainerName: "",
779785
},
780786
{
781-
description: "cpu manager reconclie - container id not found",
787+
description: "cpu manager reconcile - container state not found",
782788
activePods: []*v1.Pod{
783789
{
784790
ObjectMeta: metav1.ObjectMeta{
@@ -831,6 +837,9 @@ func TestReconcileState(t *testing.T) {
831837
{
832838
Name: "fakeContainerName",
833839
ContainerID: "docker://fakeContainerID",
840+
State: v1.ContainerState{
841+
Running: &v1.ContainerStateRunning{},
842+
},
834843
},
835844
},
836845
},
@@ -867,6 +876,9 @@ func TestReconcileState(t *testing.T) {
867876
{
868877
Name: "fakeContainerName",
869878
ContainerID: "docker://fakeContainerID",
879+
State: v1.ContainerState{
880+
Running: &v1.ContainerStateRunning{},
881+
},
870882
},
871883
},
872884
},

0 commit comments

Comments
 (0)