Skip to content

Commit 4ff879f

Browse files
authored
[Feature] GT-26 Add RestartPolicyAlways to ArangoDeployment in order to restart ArangoDB on failure (#989)
1 parent 9843a56 commit 4ff879f

File tree

13 files changed

+138
-30
lines changed

13 files changed

+138
-30
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
- (Feature) Add `ArangoBackup` CRD auto-installer
99
- (Feature) Add `ArangoBackupPolicy` CRD auto-installer
1010
- (Feature) Add `ArangoJob` CRD auto-installer
11+
- (Feature) Add RestartPolicyAlways to ArangoDeployment in order to restart ArangoDB on failure
1112

1213
## [1.2.13](https://github.com/arangodb/kube-arangodb/tree/1.2.13) (2022-06-07)
1314
- (Bugfix) Fix arangosync members state inspection

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ Feature-wise production readiness table:
9797
| Operator Internal Metrics Exporter | 1.2.0 | >= 3.7.0 | Community, Enterprise | Production | True | --deployment.feature.metrics-exporter | N/A |
9898
| Operator Internal Metrics Exporter | 1.2.3 | >= 3.7.0 | Community, Enterprise | Production | True | --deployment.feature.metrics-exporter | It is always enabled |
9999
| Operator Ephemeral Volumes | 1.2.2 | >= 3.7.0 | Community, Enterprise | Alpha | False | --deployment.feature.ephemeral-volumes | N/A |
100+
| Pod RestartPolicyAlways | 1.2.13 | >= 3.7.0 | Community, Enterprise | Alpha | False | --deployment.feature.restart-policy-always | N/A |
100101

101102
## Release notes for 0.3.16
102103

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
//
2+
// DISCLAIMER
3+
//
4+
// Copyright 2016-2022 ArangoDB GmbH, Cologne, Germany
5+
//
6+
// Licensed under the Apache License, Version 2.0 (the "License");
7+
// you may not use this file except in compliance with the License.
8+
// You may obtain a copy of the License at
9+
//
10+
// http://www.apache.org/licenses/LICENSE-2.0
11+
//
12+
// Unless required by applicable law or agreed to in writing, software
13+
// distributed under the License is distributed on an "AS IS" BASIS,
14+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
// See the License for the specific language governing permissions and
16+
// limitations under the License.
17+
//
18+
// Copyright holder is ArangoDB GmbH, Cologne, Germany
19+
//
20+
21+
package features
22+
23+
func init() {
24+
registerFeature(restartPolicyAlways)
25+
}
26+
27+
var restartPolicyAlways = &feature{
28+
name: "restart-policy-always",
29+
description: "Allow to restart containers with always restart policy",
30+
version: "3.6.0",
31+
enterpriseRequired: false,
32+
enabledByDefault: false,
33+
}
34+
35+
func RestartPolicyAlways() Feature {
36+
return restartPolicyAlways
37+
}

pkg/deployment/images.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,10 @@ func (i *ImageUpdatePod) GetContainerCreator() interfaces.ContainerCreator {
279279
return i.containerCreator
280280
}
281281

282+
func (i *ImageUpdatePod) GetRestartPolicy() core.RestartPolicy {
283+
return core.RestartPolicyNever
284+
}
285+
282286
func (i *ImageUpdatePod) GetAffinityRole() string {
283287
return ""
284288
}

pkg/deployment/reconcile/action_runtime_container_image_update.go

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,14 +22,16 @@ package reconcile
2222

2323
import (
2424
"context"
25+
"time"
2526

27+
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
2628
"github.com/arangodb/kube-arangodb/pkg/deployment/rotation"
29+
"github.com/arangodb/kube-arangodb/pkg/util/errors"
2730
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
31+
2832
"github.com/rs/zerolog"
33+
core "k8s.io/api/core/v1"
2934
v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
30-
31-
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
32-
"github.com/arangodb/kube-arangodb/pkg/util/errors"
3335
)
3436

3537
func init() {
@@ -257,9 +259,21 @@ func (a actionRuntimeContainerImageUpdate) CheckProgress(ctx context.Context) (b
257259
return false, false, nil
258260
}
259261

260-
// Pod wont get up and running
262+
// Pod won't get up and running
261263
return true, false, errors.Newf("Container %s failed during image replacement: (%d) %s: %s", name, s.ExitCode, s.Reason, s.Message)
262264
} else if s := cstatus.State.Waiting; s != nil {
265+
if pod.Spec.RestartPolicy == core.RestartPolicyAlways {
266+
lastTermination := cstatus.LastTerminationState.Terminated
267+
if lastTermination != nil {
268+
allowedRestartPeriod := time.Now().Add(time.Second * -20)
269+
if lastTermination.FinishedAt.Time.Before(allowedRestartPeriod) {
270+
return true, false, errors.Newf("Container %s continuously failing during image replacement: (%d) %s: %s", name, lastTermination.ExitCode, lastTermination.Reason, lastTermination.Message)
271+
} else {
272+
a.log.Debug().Str("pod-name", pod.GetName()).Msg("pod is restarting - we are not marking it as terminated yet..")
273+
}
274+
}
275+
}
276+
263277
// Pod is still pulling image or pending for pod start
264278
return false, false, nil
265279
} else if s := cstatus.State.Running; s != nil {

pkg/deployment/resilience/member_failure.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,8 @@ func (r *Resilience) isMemberFailureAcceptable(ctx context.Context, group api.Se
158158
case api.ServerGroupSyncMasters, api.ServerGroupSyncWorkers:
159159
// Sync masters & workers can be replaced at will
160160
return true, "", nil
161+
case api.ServerGroupSingle:
162+
return false, "ServerGroupSingle can not marked as a failed", nil
161163
default:
162164
// TODO
163165
return false, "TODO", nil

pkg/deployment/resources/pod_creator_arangod.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -493,6 +493,13 @@ func (m *MemberArangoDPod) GetContainerCreator() interfaces.ContainerCreator {
493493
}
494494
}
495495

496+
func (m *MemberArangoDPod) GetRestartPolicy() core.RestartPolicy {
497+
if features.RestartPolicyAlways().Enabled() {
498+
return core.RestartPolicyAlways
499+
}
500+
return core.RestartPolicyNever
501+
}
502+
496503
func (m *MemberArangoDPod) createMetricsExporterSidecarInternalExporter() (*core.Container, error) {
497504
image := m.GetContainerCreator().GetImage()
498505

pkg/deployment/resources/pod_creator_sync.go

Lines changed: 16 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -24,24 +24,19 @@ import (
2424
"context"
2525
"math"
2626

27-
"github.com/arangodb/kube-arangodb/pkg/util/globals"
28-
29-
"github.com/arangodb/kube-arangodb/pkg/util/errors"
30-
31-
meta "k8s.io/apimachinery/pkg/apis/meta/v1"
32-
33-
"github.com/arangodb/kube-arangodb/pkg/util/collection"
34-
35-
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil/interfaces"
36-
37-
"github.com/arangodb/kube-arangodb/pkg/deployment/pod"
38-
39-
"github.com/arangodb/kube-arangodb/pkg/util/constants"
40-
4127
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
4228
"github.com/arangodb/kube-arangodb/pkg/apis/shared"
29+
"github.com/arangodb/kube-arangodb/pkg/deployment/features"
30+
"github.com/arangodb/kube-arangodb/pkg/deployment/pod"
31+
"github.com/arangodb/kube-arangodb/pkg/util/collection"
32+
"github.com/arangodb/kube-arangodb/pkg/util/constants"
33+
"github.com/arangodb/kube-arangodb/pkg/util/errors"
34+
"github.com/arangodb/kube-arangodb/pkg/util/globals"
4335
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
36+
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil/interfaces"
37+
4438
core "k8s.io/api/core/v1"
39+
meta "k8s.io/apimachinery/pkg/apis/meta/v1"
4540
)
4641

4742
const (
@@ -312,6 +307,13 @@ func (m *MemberSyncPod) GetContainerCreator() interfaces.ContainerCreator {
312307
}
313308
}
314309

310+
func (m *MemberSyncPod) GetRestartPolicy() core.RestartPolicy {
311+
if features.RestartPolicyAlways().Enabled() {
312+
return core.RestartPolicyAlways
313+
}
314+
return core.RestartPolicyNever
315+
}
316+
315317
// Init initializes the arangosync pod.
316318
func (m *MemberSyncPod) Init(ctx context.Context, cachedStatus interfaces.Inspector, pod *core.Pod) error {
317319
terminationGracePeriodSeconds := int64(math.Ceil(m.groupSpec.GetTerminationGracePeriod(m.group).Seconds()))

pkg/deployment/resources/pod_inspector.go

Lines changed: 47 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -23,22 +23,20 @@ package resources
2323
import (
2424
"context"
2525
"fmt"
26+
"strings"
2627
"time"
2728

28-
"github.com/arangodb/kube-arangodb/pkg/deployment/patch"
29-
30-
"github.com/arangodb/kube-arangodb/pkg/util/errors"
31-
inspectorInterface "github.com/arangodb/kube-arangodb/pkg/util/k8sutil/inspector"
32-
29+
core "k8s.io/api/core/v1"
3330
v1 "k8s.io/api/core/v1"
3431
meta "k8s.io/apimachinery/pkg/apis/meta/v1"
3532

36-
"strings"
37-
3833
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
34+
"github.com/arangodb/kube-arangodb/pkg/deployment/patch"
3935
"github.com/arangodb/kube-arangodb/pkg/metrics"
4036
"github.com/arangodb/kube-arangodb/pkg/util"
37+
"github.com/arangodb/kube-arangodb/pkg/util/errors"
4138
"github.com/arangodb/kube-arangodb/pkg/util/k8sutil"
39+
inspectorInterface "github.com/arangodb/kube-arangodb/pkg/util/k8sutil/inspector"
4240
podv1 "github.com/arangodb/kube-arangodb/pkg/util/k8sutil/inspector/pod/v1"
4341
)
4442

@@ -48,11 +46,36 @@ var (
4846
)
4947

5048
const (
51-
podScheduleTimeout = time.Minute // How long we allow the scheduler to take scheduling a pod.
49+
podScheduleTimeout = time.Minute // How long we allow the scheduler to take scheduling a pod.
50+
terminationRestartPeriod = time.Second * -30 // If previous pod termination happened less than this time ago,
51+
// we will mark the pod as terminated (the value is negative because it is added to time.Now())
5252
recheckSoonPodInspectorInterval = util.Interval(time.Second) // Time between Pod inspection if we think something will change soon
5353
maxPodInspectorInterval = util.Interval(time.Hour) // Maximum time between Pod inspection (if nothing else happens)
5454
)
5555

56+
func (r *Resources) handleRestartedPod(pod *core.Pod, memberStatus *api.MemberStatus, wasTerminated, markAsTerminated *bool) {
57+
containerStatus, exist := k8sutil.GetContainerStatusByName(pod, api.ServerGroupReservedContainerNameServer)
58+
if exist && containerStatus.State.Terminated != nil {
59+
// do not record termination time again in the code below
60+
*wasTerminated = true
61+
62+
termination := containerStatus.State.Terminated.FinishedAt
63+
if memberStatus.RecentTerminationsSince(termination.Time) == 0 {
64+
memberStatus.RecentTerminations = append(memberStatus.RecentTerminations, termination)
65+
}
66+
67+
previousTermination := containerStatus.LastTerminationState.Terminated
68+
allowedRestartPeriod := time.Now().Add(terminationRestartPeriod)
69+
if previousTermination != nil && !previousTermination.FinishedAt.Time.Before(allowedRestartPeriod) {
70+
r.log.Debug().Str("pod-name", pod.GetName()).Msg("pod is continuously restarting - we will terminate it")
71+
*markAsTerminated = true
72+
} else {
73+
*markAsTerminated = false
74+
r.log.Debug().Str("pod-name", pod.GetName()).Msg("pod is restarting - we are not marking it as terminated yet..")
75+
}
76+
}
77+
}
78+
5679
// InspectPods lists all pods that belong to the given deployment and updates
5780
// the member status of the deployment accordingly.
5881
// Returns: Interval_till_next_inspection, error
@@ -102,10 +125,17 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
102125
if k8sutil.IsPodSucceeded(pod, coreContainers) {
103126
// Pod has terminated with exit code 0.
104127
wasTerminated := memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated)
105-
if memberStatus.Conditions.Update(api.ConditionTypeTerminated, true, "Pod Succeeded", "") {
128+
markAsTerminated := true
129+
130+
if pod.Spec.RestartPolicy == core.RestartPolicyAlways && !wasTerminated {
131+
r.handleRestartedPod(pod, &memberStatus, &wasTerminated, &markAsTerminated)
132+
}
133+
134+
if markAsTerminated && memberStatus.Conditions.Update(api.ConditionTypeTerminated, true, "Pod Succeeded", "") {
106135
log.Debug().Str("pod-name", pod.GetName()).Msg("Updating member condition Terminated to true: Pod Succeeded")
107136
updateMemberStatusNeeded = true
108137
nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval)
138+
109139
if !wasTerminated {
110140
// Record termination time
111141
now := meta.Now()
@@ -115,7 +145,13 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
115145
} else if k8sutil.IsPodFailed(pod, coreContainers) {
116146
// Pod has terminated with at least 1 container with a non-zero exit code.
117147
wasTerminated := memberStatus.Conditions.IsTrue(api.ConditionTypeTerminated)
118-
if memberStatus.Conditions.Update(api.ConditionTypeTerminated, true, "Pod Failed", "") {
148+
markAsTerminated := true
149+
150+
if pod.Spec.RestartPolicy == core.RestartPolicyAlways && !wasTerminated {
151+
r.handleRestartedPod(pod, &memberStatus, &wasTerminated, &markAsTerminated)
152+
}
153+
154+
if markAsTerminated && memberStatus.Conditions.Update(api.ConditionTypeTerminated, true, "Pod Failed", "") {
119155
if containers := k8sutil.GetFailedContainerNames(pod.Status.InitContainerStatuses); len(containers) > 0 {
120156
for _, container := range containers {
121157
switch container {
@@ -171,6 +207,7 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
171207
log.Debug().Str("pod-name", pod.GetName()).Msg("Updating member condition Terminated to true: Pod Failed")
172208
updateMemberStatusNeeded = true
173209
nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval)
210+
174211
if !wasTerminated {
175212
// Record termination time
176213
now := meta.Now()

pkg/deployment/rotation/check.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,8 @@ func IsRotationRequired(log zerolog.Logger, acs sutil.ACS, spec api.DeploymentSp
126126

127127
if mode, plan, err := compare(log, spec, member, group, specTemplate, statusTemplate); err != nil {
128128
return SkippedRotation, nil, "", err
129+
} else if mode == SkippedRotation {
130+
return mode, plan, "No rotation needed", nil
129131
} else {
130132
return mode, plan, "Pod needs rotation", nil
131133
}

0 commit comments

Comments
 (0)