Skip to content

Commit ff7d144

Browse files
committed
kubelet container status calculation doesn't handle suddenly missing data properly
1 parent 6d01c5a commit ff7d144

File tree

2 files changed

+60
-6
lines changed

2 files changed

+60
-6
lines changed

pkg/kubelet/kubelet_pods.go

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1616,7 +1616,7 @@ func (kl *Kubelet) convertStatusToAPIStatus(pod *v1.Pod, podStatus *kubecontaine
16161616
// convertToAPIContainerStatuses converts the given internal container
16171617
// statuses into API container statuses.
16181618
func (kl *Kubelet) convertToAPIContainerStatuses(pod *v1.Pod, podStatus *kubecontainer.PodStatus, previousStatus []v1.ContainerStatus, containers []v1.Container, hasInitContainers, isInitContainer bool) []v1.ContainerStatus {
1619-
convertContainerStatus := func(cs *kubecontainer.Status) *v1.ContainerStatus {
1619+
convertContainerStatus := func(cs *kubecontainer.Status, oldStatus *v1.ContainerStatus) *v1.ContainerStatus {
16201620
cid := cs.ID.String()
16211621
status := &v1.ContainerStatus{
16221622
Name: cs.Name,
@@ -1625,17 +1625,17 @@ func (kl *Kubelet) convertToAPIContainerStatuses(pod *v1.Pod, podStatus *kubecon
16251625
ImageID: cs.ImageID,
16261626
ContainerID: cid,
16271627
}
1628-
switch cs.State {
1629-
case kubecontainer.ContainerStateRunning:
1628+
switch {
1629+
case cs.State == kubecontainer.ContainerStateRunning:
16301630
status.State.Running = &v1.ContainerStateRunning{StartedAt: metav1.NewTime(cs.StartedAt)}
1631-
case kubecontainer.ContainerStateCreated:
1631+
case cs.State == kubecontainer.ContainerStateCreated:
16321632
// Treat containers in the "created" state as if they are exited.
16331633
// The pod workers are supposed start all containers it creates in
16341634
// one sync (syncPod) iteration. There should not be any normal
16351635
// "created" containers when the pod worker generates the status at
16361636
// the beginning of a sync iteration.
16371637
fallthrough
1638-
case kubecontainer.ContainerStateExited:
1638+
case cs.State == kubecontainer.ContainerStateExited:
16391639
status.State.Terminated = &v1.ContainerStateTerminated{
16401640
ExitCode: int32(cs.ExitCode),
16411641
Reason: cs.Reason,
@@ -1644,6 +1644,24 @@ func (kl *Kubelet) convertToAPIContainerStatuses(pod *v1.Pod, podStatus *kubecon
16441644
FinishedAt: metav1.NewTime(cs.FinishedAt),
16451645
ContainerID: cid,
16461646
}
1647+
1648+
case cs.State == kubecontainer.ContainerStateUnknown &&
1649+
oldStatus != nil && // we have an old status
1650+
oldStatus.State.Running != nil: // our previous status was running
1651+
// if this happens, then we know that this container was previously running and isn't anymore (assuming the CRI isn't failing to return running containers).
1652+
// you can imagine this happening in cases where a container failed and the kubelet didn't ask about it in time to see the result.
1653+
// in this case, the container should not to into waiting state immediately because that can make cases like runonce pods actually run
1654+
// twice. "container never ran" is different than "container ran and failed". This is handled differently in the kubelet
1655+
// and it is handled differently in higher order logic like crashloop detection and handling
1656+
status.State.Terminated = &v1.ContainerStateTerminated{
1657+
Reason: "ContainerStatusUnknown",
1658+
Message: "The container could not be located when the pod was terminated",
1659+
ExitCode: 137, // this code indicates an error
1660+
}
1661+
// the restart count normally comes from the CRI (see near the top of this method), but since this is being added explicitly
1662+
// for the case where the CRI did not return a status, we need to manually increment the restart count to be accurate.
1663+
status.RestartCount = oldStatus.RestartCount + 1
1664+
16471665
default:
16481666
// this collapses any unknown state to container waiting. If any container is waiting, then the pod status moves to pending even if it is running.
16491667
// if I'm reading this correctly, then any failure to read status on any container results in the entire pod going pending even if the containers
@@ -1767,7 +1785,11 @@ func (kl *Kubelet) convertToAPIContainerStatuses(pod *v1.Pod, podStatus *kubecon
17671785
if containerSeen[cName] >= 2 {
17681786
continue
17691787
}
1770-
status := convertContainerStatus(cStatus)
1788+
var oldStatusPtr *v1.ContainerStatus
1789+
if oldStatus, ok := oldStatuses[cName]; ok {
1790+
oldStatusPtr = &oldStatus
1791+
}
1792+
status := convertContainerStatus(cStatus, oldStatusPtr)
17711793
if containerSeen[cName] == 0 {
17721794
statuses[cName] = status
17731795
} else {

pkg/kubelet/kubelet_test.go

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1607,6 +1607,38 @@ func TestGenerateAPIPodStatusWithReasonCache(t *testing.T) {
16071607
}},
16081608
},
16091609
},
1610+
// For Unknown Container Status:
1611+
// * In certain situations a container can be running and fail to retrieve the status which results in
1612+
// * a transition to the Unknown state. Prior to this fix, a container would make an invalid transition
1613+
// * from Running->Waiting. This test validates the correct behavior of transitioning from Running->Terminated.
1614+
{
1615+
containers: []v1.Container{{Name: "unknown"}},
1616+
statuses: []*kubecontainer.Status{
1617+
{
1618+
Name: "unknown",
1619+
State: kubecontainer.ContainerStateUnknown,
1620+
},
1621+
{
1622+
Name: "unknown",
1623+
State: kubecontainer.ContainerStateRunning,
1624+
},
1625+
},
1626+
reasons: map[string]error{},
1627+
oldStatuses: []v1.ContainerStatus{{
1628+
Name: "unknown",
1629+
State: v1.ContainerState{Running: &v1.ContainerStateRunning{}},
1630+
}},
1631+
expectedState: map[string]v1.ContainerState{
1632+
"unknown": {Terminated: &v1.ContainerStateTerminated{
1633+
ExitCode: 137,
1634+
Message: "The container could not be located when the pod was terminated",
1635+
Reason: "ContainerStatusUnknown",
1636+
}},
1637+
},
1638+
expectedLastTerminationState: map[string]v1.ContainerState{
1639+
"unknown": {Running: &v1.ContainerStateRunning{}},
1640+
},
1641+
},
16101642
}
16111643

16121644
for i, test := range tests {

0 commit comments

Comments
 (0)