Skip to content

Commit f2acbf6

Browse files
committed
Base CPUManager state reconciliation on container state, not pod state
1 parent f6cf9b8 commit f2acbf6

File tree

2 files changed

+50
-16
lines changed

2 files changed

+50
-16
lines changed

pkg/kubelet/cm/cpumanager/cpu_manager.go

Lines changed: 36 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -367,26 +367,39 @@ func (m *manager) reconcileState() (success []reconciledContainer, failure []rec
367367
continue
368368
}
369369

370-
// Check whether container is present in state, there may be 3 reasons why it's not present:
371-
// - policy does not want to track the container
372-
// - kubelet has just been restarted - and there is no previous state file
373-
// - container has been removed from state by RemoveContainer call (DeletionTimestamp is set)
374-
if _, ok := m.state.GetCPUSet(string(pod.UID), container.Name); !ok {
375-
if pstatus.Phase == v1.PodRunning && pod.DeletionTimestamp == nil {
376-
klog.V(4).Infof("[cpumanager] reconcileState: container is not present in state - trying to add (pod: %s, container: %s, container id: %s)", pod.Name, container.Name, containerID)
377-
err := m.AddContainer(pod, &container, containerID)
370+
cstatus, err := findContainerStatusByName(&pstatus, container.Name)
371+
if err != nil {
372+
klog.Warningf("[cpumanager] reconcileState: skipping container; container status not found in pod status (pod: %s, container: %s, error: %v)", pod.Name, container.Name, err)
373+
failure = append(failure, reconciledContainer{pod.Name, container.Name, ""})
374+
continue
375+
}
376+
377+
if cstatus.State.Waiting != nil ||
378+
(cstatus.State.Waiting == nil && cstatus.State.Running == nil && cstatus.State.Terminated == nil) {
379+
klog.Warningf("[cpumanager] reconcileState: skipping container; container still in the waiting state (pod: %s, container: %s)", pod.Name, container.Name)
380+
failure = append(failure, reconciledContainer{pod.Name, container.Name, ""})
381+
continue
382+
}
383+
384+
if cstatus.State.Terminated != nil {
385+
// Since the container is terminated, we know it is safe to
386+
// remove it without any reconciliation. Removing the container
387+
// will also remove it from the `containerMap` so that this
388+
// container will be skipped next time around the loop.
389+
_, _, err := m.containerMap.GetContainerRef(containerID)
390+
if err == nil {
391+
klog.Warningf("[cpumanager] reconcileState: skipping container; already terminated (pod: %s, container id: %s)", pod.Name, containerID)
392+
err := m.RemoveContainer(containerID)
378393
if err != nil {
379-
klog.Errorf("[cpumanager] reconcileState: failed to add container (pod: %s, container: %s, container id: %s, error: %v)", pod.Name, container.Name, containerID, err)
394+
klog.Errorf("[cpumanager] reconcileState: failed to remove container (pod: %s, container id: %s, error: %v)", pod.Name, containerID, err)
380395
failure = append(failure, reconciledContainer{pod.Name, container.Name, containerID})
381-
continue
382396
}
383-
} else {
384-
// if DeletionTimestamp is set, pod has already been removed from state
385-
// skip the pod/container since it's not running and will be deleted soon
386-
continue
387397
}
398+
continue
388399
}
389400

401+
m.containerMap.Add(string(pod.UID), container.Name, containerID)
402+
390403
cset := m.state.GetCPUSetOrDefault(string(pod.UID), container.Name)
391404
if cset.IsEmpty() {
392405
// NOTE: This should not happen outside of tests.
@@ -424,6 +437,15 @@ func findContainerIDByName(status *v1.PodStatus, name string) (string, error) {
424437
return "", fmt.Errorf("unable to find ID for container with name %v in pod status (it may not be running)", name)
425438
}
426439

440+
func findContainerStatusByName(status *v1.PodStatus, name string) (*v1.ContainerStatus, error) {
441+
for _, status := range append(status.InitContainerStatuses, status.ContainerStatuses...) {
442+
if status.Name == name {
443+
return &status, nil
444+
}
445+
}
446+
return nil, fmt.Errorf("unable to find status for container with name %v in pod status (it may not be running)", name)
447+
}
448+
427449
func (m *manager) updateContainerCPUSet(containerID string, cpus cpuset.CPUSet) error {
428450
// TODO: Consider adding a `ResourceConfigForContainer` helper in
429451
// helpers_linux.go similar to what exists for pods.

pkg/kubelet/cm/cpumanager/cpu_manager_test.go

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -701,6 +701,9 @@ func TestReconcileState(t *testing.T) {
701701
{
702702
Name: "fakeContainerName",
703703
ContainerID: "docker://fakeContainerID",
704+
State: v1.ContainerState{
705+
Running: &v1.ContainerStateRunning{},
706+
},
704707
},
705708
},
706709
},
@@ -737,6 +740,9 @@ func TestReconcileState(t *testing.T) {
737740
{
738741
Name: "fakeContainerName",
739742
ContainerID: "docker://fakeContainerID",
743+
State: v1.ContainerState{
744+
Running: &v1.ContainerStateRunning{},
745+
},
740746
},
741747
},
742748
},
@@ -752,7 +758,7 @@ func TestReconcileState(t *testing.T) {
752758
expectFailedContainerName: "",
753759
},
754760
{
755-
description: "cpu manager reconclie - pod status not found",
761+
description: "cpu manager reconcile - pod status not found",
756762
activePods: []*v1.Pod{
757763
{
758764
ObjectMeta: metav1.ObjectMeta{
@@ -777,7 +783,7 @@ func TestReconcileState(t *testing.T) {
777783
expectFailedContainerName: "",
778784
},
779785
{
780-
description: "cpu manager reconclie - container id not found",
786+
description: "cpu manager reconcile - container state not found",
781787
activePods: []*v1.Pod{
782788
{
783789
ObjectMeta: metav1.ObjectMeta{
@@ -830,6 +836,9 @@ func TestReconcileState(t *testing.T) {
830836
{
831837
Name: "fakeContainerName",
832838
ContainerID: "docker://fakeContainerID",
839+
State: v1.ContainerState{
840+
Running: &v1.ContainerStateRunning{},
841+
},
833842
},
834843
},
835844
},
@@ -866,6 +875,9 @@ func TestReconcileState(t *testing.T) {
866875
{
867876
Name: "fakeContainerName",
868877
ContainerID: "docker://fakeContainerID",
878+
State: v1.ContainerState{
879+
Running: &v1.ContainerStateRunning{},
880+
},
869881
},
870882
},
871883
},

0 commit comments

Comments
 (0)