@@ -23,22 +23,20 @@ package resources
2323import (
2424 "context"
2525 "fmt"
26+ "strings"
2627 "time"
2728
28- "github.com/arangodb/kube-arangodb/pkg/deployment/patch"
29-
30- "github.com/arangodb/kube-arangodb/pkg/util/errors"
31- inspectorInterface "github.com/arangodb/kube-arangodb/pkg/util/k8sutil/inspector"
32-
29+ core "k8s.io/api/core/v1"
3330 v1 "k8s.io/api/core/v1"
3431 meta "k8s.io/apimachinery/pkg/apis/meta/v1"
3532
36- "strings"
37-
3833 api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
34+ "github.com/arangodb/kube-arangodb/pkg/deployment/patch"
3935 "github.com/arangodb/kube-arangodb/pkg/metrics"
4036 "github.com/arangodb/kube-arangodb/pkg/util"
37+ "github.com/arangodb/kube-arangodb/pkg/util/errors"
4138 "github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
39+ inspectorInterface "github.com/arangodb/kube-arangodb/pkg/util/k8sutil/inspector"
4240 podv1 "github.com/arangodb/kube-arangodb/pkg/util/k8sutil/inspector/pod/v1"
4341)
4442
@@ -48,11 +46,36 @@ var (
4846)
4947
5048const (
51- podScheduleTimeout = time .Minute // How long we allow the schedule to take scheduling a pod.
49+ podScheduleTimeout = time .Minute // How long we allow the scheduler to take scheduling a pod.
50+ terminationRestartPeriod = time .Second * - 30 // Negative offset (-30s) added to time.Now() in handleRestartedPod: if the previous
51+ // termination finished within that window, the restarting pod is marked as terminated.
5252 recheckSoonPodInspectorInterval = util .Interval (time .Second ) // Time between Pod inspection if we think something will change soon
5353 maxPodInspectorInterval = util .Interval (time .Hour ) // Maximum time between Pod inspection (if nothing else happens)
5454)
5555
56+ func (r * Resources ) handleRestartedPod (pod * core.Pod , memberStatus * api.MemberStatus , wasTerminated , markAsTerminated * bool ) {
57+ containerStatus , exist := k8sutil .GetContainerStatusByName (pod , api .ServerGroupReservedContainerNameServer )
58+ if exist && containerStatus .State .Terminated != nil {
59+ // do not record termination time again in the code below
60+ * wasTerminated = true
61+
62+ termination := containerStatus .State .Terminated .FinishedAt
63+ if memberStatus .RecentTerminationsSince (termination .Time ) == 0 {
64+ memberStatus .RecentTerminations = append (memberStatus .RecentTerminations , termination )
65+ }
66+
67+ previousTermination := containerStatus .LastTerminationState .Terminated
68+ allowedRestartPeriod := time .Now ().Add (terminationRestartPeriod )
69+ if previousTermination != nil && ! previousTermination .FinishedAt .Time .Before (allowedRestartPeriod ) {
70+ r .log .Debug ().Str ("pod-name" , pod .GetName ()).Msg ("pod is continuously restarting - we will terminate it" )
71+ * markAsTerminated = true
72+ } else {
73+ * markAsTerminated = false
74+ r .log .Debug ().Str ("pod-name" , pod .GetName ()).Msg ("pod is restarting - we are not marking it as terminated yet.." )
75+ }
76+ }
77+ }
78+
5679// InspectPods lists all pods that belong to the given deployment and updates
5780// the member status of the deployment accordingly.
5881// Returns: Interval_till_next_inspection, error
@@ -102,10 +125,17 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
102125 if k8sutil .IsPodSucceeded (pod , coreContainers ) {
103126 // Pod has terminated with exit code 0.
104127 wasTerminated := memberStatus .Conditions .IsTrue (api .ConditionTypeTerminated )
105- if memberStatus .Conditions .Update (api .ConditionTypeTerminated , true , "Pod Succeeded" , "" ) {
128+ markAsTerminated := true
129+
130+ if pod .Spec .RestartPolicy == core .RestartPolicyAlways && ! wasTerminated {
131+ r .handleRestartedPod (pod , & memberStatus , & wasTerminated , & markAsTerminated )
132+ }
133+
134+ if markAsTerminated && memberStatus .Conditions .Update (api .ConditionTypeTerminated , true , "Pod Succeeded" , "" ) {
106135 log .Debug ().Str ("pod-name" , pod .GetName ()).Msg ("Updating member condition Terminated to true: Pod Succeeded" )
107136 updateMemberStatusNeeded = true
108137 nextInterval = nextInterval .ReduceTo (recheckSoonPodInspectorInterval )
138+
109139 if ! wasTerminated {
110140 // Record termination time
111141 now := meta .Now ()
@@ -115,7 +145,13 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
115145 } else if k8sutil .IsPodFailed (pod , coreContainers ) {
116146 // Pod has terminated with at least 1 container with a non-zero exit code.
117147 wasTerminated := memberStatus .Conditions .IsTrue (api .ConditionTypeTerminated )
118- if memberStatus .Conditions .Update (api .ConditionTypeTerminated , true , "Pod Failed" , "" ) {
148+ markAsTerminated := true
149+
150+ if pod .Spec .RestartPolicy == core .RestartPolicyAlways && ! wasTerminated {
151+ r .handleRestartedPod (pod , & memberStatus , & wasTerminated , & markAsTerminated )
152+ }
153+
154+ if markAsTerminated && memberStatus .Conditions .Update (api .ConditionTypeTerminated , true , "Pod Failed" , "" ) {
119155 if containers := k8sutil .GetFailedContainerNames (pod .Status .InitContainerStatuses ); len (containers ) > 0 {
120156 for _ , container := range containers {
121157 switch container {
@@ -171,6 +207,7 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
171207 log .Debug ().Str ("pod-name" , pod .GetName ()).Msg ("Updating member condition Terminated to true: Pod Failed" )
172208 updateMemberStatusNeeded = true
173209 nextInterval = nextInterval .ReduceTo (recheckSoonPodInspectorInterval )
210+
174211 if ! wasTerminated {
175212 // Record termination time
176213 now := meta .Now ()
0 commit comments