Skip to content

Commit 124d95d

Browse files
authored
[Bugfix] Fix status propagation (#889)
1 parent 6d3050c commit 124d95d

File tree

12 files changed

+138
-45
lines changed

12 files changed

+138
-45
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
- Add ArangoClusterSynchronization Operator
1717
- Update licenses
1818
- Fix restart procedure in case of failing members
19+
- Fix status propagation race condition
1920

2021
## [1.2.6](https://github.com/arangodb/kube-arangodb/tree/1.2.6) (2021-12-15)
2122
- Add ArangoBackup backoff functionality

pkg/apis/deployment/v1/conditions.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,10 @@ func (c ConditionType) String() string {
3636
const (
3737
// ConditionTypeReady indicates that the member or entire deployment is ready and running normally.
3838
ConditionTypeReady ConditionType = "Ready"
39+
// ConditionTypeStarted indicates that the member was ready at least once.
40+
ConditionTypeStarted ConditionType = "Started"
41+
// ConditionTypeServing indicates that the member core services are running.
42+
ConditionTypeServing ConditionType = "Serving"
3943
// ConditionTypeTerminated indicates that the member has terminated and will not restart.
4044
ConditionTypeTerminated ConditionType = "Terminated"
4145
// ConditionTypeAutoUpgrade indicates that the member has to be started with `--database.auto-upgrade` once.

pkg/apis/deployment/v1/member_status_list.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,22 @@ func (l MemberStatusList) MembersReady() int {
205205
return readyCount
206206
}
207207

208+
// MembersServing returns the number of members that are in the Serving state.
209+
func (l MemberStatusList) MembersServing() int {
210+
servingCount := 0
211+
for _, x := range l {
212+
if x.Conditions.IsTrue(ConditionTypeServing) {
213+
servingCount++
214+
}
215+
}
216+
return servingCount
217+
}
218+
219+
// AllMembersServing returns the true if all members are in the Serving state.
220+
func (l MemberStatusList) AllMembersServing() bool {
221+
return len(l) == l.MembersServing()
222+
}
223+
208224
// AllMembersReady returns the true if all members are in the Ready state.
209225
func (l MemberStatusList) AllMembersReady() bool {
210226
return len(l) == l.MembersReady()

pkg/apis/deployment/v2alpha1/conditions.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,10 @@ func (c ConditionType) String() string {
3636
const (
3737
// ConditionTypeReady indicates that the member or entire deployment is ready and running normally.
3838
ConditionTypeReady ConditionType = "Ready"
39+
// ConditionTypeStarted indicates that the member was ready at least once.
40+
ConditionTypeStarted ConditionType = "Started"
41+
// ConditionTypeServing indicates that the member core services are running.
42+
ConditionTypeServing ConditionType = "Serving"
3943
// ConditionTypeTerminated indicates that the member has terminated and will not restart.
4044
ConditionTypeTerminated ConditionType = "Terminated"
4145
// ConditionTypeAutoUpgrade indicates that the member has to be started with `--database.auto-upgrade` once.

pkg/apis/deployment/v2alpha1/member_status_list.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,22 @@ func (l MemberStatusList) MembersReady() int {
205205
return readyCount
206206
}
207207

208+
// MembersServing returns the number of members that are in the Serving state.
209+
func (l MemberStatusList) MembersServing() int {
210+
servingCount := 0
211+
for _, x := range l {
212+
if x.Conditions.IsTrue(ConditionTypeServing) {
213+
servingCount++
214+
}
215+
}
216+
return servingCount
217+
}
218+
219+
// AllMembersServing returns the true if all members are in the Serving state.
220+
func (l MemberStatusList) AllMembersServing() bool {
221+
return len(l) == l.MembersServing()
222+
}
223+
208224
// AllMembersReady returns the true if all members are in the Ready state.
209225
func (l MemberStatusList) AllMembersReady() bool {
210226
return len(l) == l.MembersReady()

pkg/deployment/deployment.go

Lines changed: 19 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ import (
5252
"github.com/arangodb/arangosync-client/client"
5353
"github.com/rs/zerolog"
5454
apiextensionsclient "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset"
55-
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
55+
meta "k8s.io/apimachinery/pkg/apis/meta/v1"
5656
"k8s.io/client-go/kubernetes"
5757
"k8s.io/client-go/tools/record"
5858

@@ -381,7 +381,7 @@ func (d *Deployment) handleArangoDeploymentUpdatedEvent(ctx context.Context) err
381381
// Get the most recent version of the deployment from the API server
382382
ctxChild, cancel := globals.GetGlobalTimeouts().Kubernetes().WithTimeout(ctx)
383383
defer cancel()
384-
current, err := d.deps.DatabaseCRCli.DatabaseV1().ArangoDeployments(d.apiObject.GetNamespace()).Get(ctxChild, d.apiObject.GetName(), metav1.GetOptions{})
384+
current, err := d.deps.DatabaseCRCli.DatabaseV1().ArangoDeployments(d.apiObject.GetNamespace()).Get(ctxChild, d.apiObject.GetName(), meta.GetOptions{})
385385
if err != nil {
386386
log.Debug().Err(err).Msg("Failed to get current version of deployment from API server")
387387
if k8sutil.IsNotFound(err) {
@@ -465,21 +465,22 @@ func (d *Deployment) updateCRStatus(ctx context.Context, force ...bool) error {
465465
}
466466

467467
// Send update to API server
468-
ns := d.apiObject.GetNamespace()
469-
depls := d.deps.DatabaseCRCli.DatabaseV1().ArangoDeployments(ns)
470-
update := d.apiObject.DeepCopy()
468+
depls := d.deps.DatabaseCRCli.DatabaseV1().ArangoDeployments(d.GetNamespace())
471469
attempt := 0
472470
for {
473471
attempt++
474-
update.Status = d.status.last
475-
if update.GetDeletionTimestamp() == nil {
476-
ensureFinalizers(update)
472+
if d.apiObject.GetDeletionTimestamp() == nil {
473+
ensureFinalizers(d.apiObject)
477474
}
478475

479476
var newAPIObject *api.ArangoDeployment
480477
err := globals.GetGlobalTimeouts().Kubernetes().RunWithTimeout(ctx, func(ctxChild context.Context) error {
481-
var err error
482-
newAPIObject, err = depls.Update(ctxChild, update, metav1.UpdateOptions{})
478+
p, err := patch.NewPatch(patch.ItemReplace(patch.NewPath("status"), d.status.last)).Marshal()
479+
if err != nil {
480+
return err
481+
}
482+
483+
newAPIObject, err = depls.Patch(ctxChild, d.GetName(), types.JSONPatchType, p, meta.PatchOptions{})
483484

484485
return err
485486
})
@@ -488,21 +489,8 @@ func (d *Deployment) updateCRStatus(ctx context.Context, force ...bool) error {
488489
d.apiObject = newAPIObject
489490
return nil
490491
}
491-
if attempt < 10 && k8sutil.IsConflict(err) {
492-
// API object may have been changed already,
493-
// Reload api object and try again
494-
var current *api.ArangoDeployment
495-
496-
err = globals.GetGlobalTimeouts().Kubernetes().RunWithTimeout(ctx, func(ctxChild context.Context) error {
497-
var err error
498-
current, err = depls.Get(ctxChild, update.GetName(), metav1.GetOptions{})
499-
500-
return err
501-
})
502-
if err == nil {
503-
update = current.DeepCopy()
504-
continue
505-
}
492+
if attempt < 10 {
493+
continue
506494
}
507495
if err != nil {
508496
d.deps.Log.Debug().Err(err).Msg("failed to patch ArangoDeployment status")
@@ -535,7 +523,7 @@ func (d *Deployment) updateCRSpec(ctx context.Context, newSpec api.DeploymentSpe
535523
var newAPIObject *api.ArangoDeployment
536524
err := globals.GetGlobalTimeouts().Kubernetes().RunWithTimeout(ctx, func(ctxChild context.Context) error {
537525
var err error
538-
newAPIObject, err = d.deps.DatabaseCRCli.DatabaseV1().ArangoDeployments(ns).Update(ctxChild, update, metav1.UpdateOptions{})
526+
newAPIObject, err = d.deps.DatabaseCRCli.DatabaseV1().ArangoDeployments(ns).Update(ctxChild, update, meta.UpdateOptions{})
539527

540528
return err
541529
})
@@ -551,7 +539,7 @@ func (d *Deployment) updateCRSpec(ctx context.Context, newSpec api.DeploymentSpe
551539

552540
err = globals.GetGlobalTimeouts().Kubernetes().RunWithTimeout(ctx, func(ctxChild context.Context) error {
553541
var err error
554-
current, err = d.deps.DatabaseCRCli.DatabaseV1().ArangoDeployments(ns).Get(ctxChild, update.GetName(), metav1.GetOptions{})
542+
current, err = d.deps.DatabaseCRCli.DatabaseV1().ArangoDeployments(ns).Get(ctxChild, update.GetName(), meta.GetOptions{})
555543

556544
return err
557545
})
@@ -568,7 +556,7 @@ func (d *Deployment) updateCRSpec(ctx context.Context, newSpec api.DeploymentSpe
568556
}
569557

570558
// isOwnerOf returns true if the given object belong to this deployment.
571-
func (d *Deployment) isOwnerOf(obj metav1.Object) bool {
559+
func (d *Deployment) isOwnerOf(obj meta.Object) bool {
572560
ownerRefs := obj.GetOwnerReferences()
573561
if len(ownerRefs) < 1 {
574562
return false
@@ -583,9 +571,9 @@ func (d *Deployment) isOwnerOf(obj metav1.Object) bool {
583571
func (d *Deployment) lookForServiceMonitorCRD() {
584572
var err error
585573
if d.GetScope().IsNamespaced() {
586-
_, err = d.deps.KubeMonitoringCli.ServiceMonitors(d.GetNamespace()).List(context.Background(), metav1.ListOptions{})
574+
_, err = d.deps.KubeMonitoringCli.ServiceMonitors(d.GetNamespace()).List(context.Background(), meta.ListOptions{})
587575
} else {
588-
_, err = d.deps.KubeExtCli.ApiextensionsV1().CustomResourceDefinitions().Get(context.Background(), "servicemonitors.monitoring.coreos.com", metav1.GetOptions{})
576+
_, err = d.deps.KubeExtCli.ApiextensionsV1().CustomResourceDefinitions().Get(context.Background(), "servicemonitors.monitoring.coreos.com", meta.GetOptions{})
589577
}
590578
log := d.deps.Log
591579
log.Debug().Msgf("Looking for ServiceMonitor CRD...")
@@ -637,7 +625,7 @@ func (d *Deployment) ApplyPatch(ctx context.Context, p ...patch.Item) error {
637625

638626
ctxChild, cancel := globals.GetGlobalTimeouts().Kubernetes().WithTimeout(ctx)
639627
defer cancel()
640-
depl, err := c.Patch(ctxChild, d.apiObject.GetName(), types.JSONPatchType, data, metav1.PatchOptions{})
628+
depl, err := c.Patch(ctxChild, d.apiObject.GetName(), types.JSONPatchType, data, meta.PatchOptions{})
641629
if err != nil {
642630
return err
643631
}

pkg/deployment/member/phase_updates.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ var phase = phaseMap{
6767
func removeMemberConditionsMapFunc(m *api.MemberStatus) {
6868
// Clean conditions
6969
m.Conditions.Remove(api.ConditionTypeReady)
70+
m.Conditions.Remove(api.ConditionTypeStarted)
7071
m.Conditions.Remove(api.ConditionTypeTerminated)
7172
m.Conditions.Remove(api.ConditionTypeTerminating)
7273
m.Conditions.Remove(api.ConditionTypeAgentRecoveryNeeded)

pkg/deployment/reconcile/plan_builder_rotate_upgrade.go

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -384,18 +384,23 @@ func groupReadyForRestart(context PlanBuilderContext, spec api.DeploymentSpec, s
384384
return true
385385
}
386386

387-
// If current member is not ready, kill anyway
388-
if !member.Conditions.IsTrue(api.ConditionTypeReady) {
387+
// If current member did not become ready even once. Kill it
388+
if !member.Conditions.IsTrue(api.ConditionTypeStarted) {
389+
return true
390+
}
391+
392+
// If current core containers are dead kill it.
393+
if !member.Conditions.IsTrue(api.ConditionTypeServing) {
389394
return true
390395
}
391396

392397
switch group {
393398
case api.ServerGroupDBServers:
394399
// TODO: Improve shard placement discovery and keep WriteConcern
395-
return context.GetShardSyncStatus() && status.Members.MembersOfGroup(group).AllMembersReady()
400+
return context.GetShardSyncStatus() && status.Members.MembersOfGroup(group).AllMembersServing()
396401
default:
397402
// In case of agents we can kill only one agent at same time
398-
return status.Members.MembersOfGroup(group).AllMembersReady()
403+
return status.Members.MembersOfGroup(group).AllMembersServing()
399404
}
400405
}
401406

pkg/deployment/reconcile/plan_builder_utils.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,8 @@ func emptyPlanBuilder(ctx context.Context,
6060

6161
func removeConditionActionV2(actionReason string, conditionType api.ConditionType) api.Action {
6262
return api.NewAction(api.ActionTypeSetConditionV2, api.ServerGroupUnknown, "", actionReason).
63-
AddParam(setConditionActionV2KeyAction, setConditionActionV2KeyTypeRemove).
64-
AddParam(setConditionActionV2KeyType, string(conditionType))
63+
AddParam(setConditionActionV2KeyAction, string(conditionType)).
64+
AddParam(setConditionActionV2KeyType, setConditionActionV2KeyTypeRemove)
6565
}
6666

6767
func updateConditionActionV2(actionReason string, conditionType api.ConditionType, status bool, reason, message, hash string) api.Action {

pkg/deployment/resources/pod_inspector.go

Lines changed: 28 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ import (
3535
v1 "k8s.io/api/core/v1"
3636
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
3737

38+
"strings"
39+
3840
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
3941
"github.com/arangodb/kube-arangodb/pkg/metrics"
4042
"github.com/arangodb/kube-arangodb/pkg/util"
@@ -216,10 +218,12 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
216218
}
217219
// End of Topology labels
218220

219-
if k8sutil.AreContainersReady(pod, coreContainers) {
221+
if k8sutil.IsPodReady(pod) && k8sutil.AreContainersReady(pod, coreContainers) {
220222
// Pod is now ready
221-
if memberStatus.Conditions.Update(api.ConditionTypeReady, true, "Pod Ready", "") {
222-
log.Debug().Str("pod-name", pod.GetName()).Msg("Updating member condition Ready & Initialised to true")
223+
if anyOf(memberStatus.Conditions.Update(api.ConditionTypeReady, true, "Pod Ready", ""),
224+
memberStatus.Conditions.Update(api.ConditionTypeStarted, true, "Pod Started", ""),
225+
memberStatus.Conditions.Update(api.ConditionTypeServing, true, "Pod Serving", "")) {
226+
log.Debug().Str("pod-name", pod.GetName()).Msg("Updating member condition Ready, Started & Serving to true")
223227

224228
if status.Topology.IsTopologyOwned(memberStatus.Topology) {
225229
nodes, ok := cachedStatus.GetNodes()
@@ -238,10 +242,19 @@ func (r *Resources) InspectPods(ctx context.Context, cachedStatus inspectorInter
238242
updateMemberStatusNeeded = true
239243
nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval)
240244
}
245+
} else if k8sutil.AreContainersReady(pod, coreContainers) {
246+
// Pod is not ready, but core containers are fine
247+
if anyOf(memberStatus.Conditions.Update(api.ConditionTypeReady, false, "Pod Not Ready", ""),
248+
memberStatus.Conditions.Update(api.ConditionTypeServing, true, "Pod is still serving", "")) {
249+
log.Debug().Str("pod-name", pod.GetName()).Msg("Updating member condition Ready to false, while all core containers are ready")
250+
updateMemberStatusNeeded = true
251+
nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval)
252+
}
241253
} else {
242254
// Pod is not ready
243-
if memberStatus.Conditions.Update(api.ConditionTypeReady, false, "Pod Not Ready", "") {
244-
log.Debug().Str("pod-name", pod.GetName()).Msg("Updating member condition Ready to false")
255+
if anyOf(memberStatus.Conditions.Update(api.ConditionTypeReady, false, "Pod Not Ready", ""),
256+
memberStatus.Conditions.Update(api.ConditionTypeServing, false, "Pod Core containers are not ready", strings.Join(coreContainers, ", "))) {
257+
log.Debug().Str("pod-name", pod.GetName()).Msg("Updating member condition Ready & Serving to false")
245258
updateMemberStatusNeeded = true
246259
nextInterval = nextInterval.ReduceTo(recheckSoonPodInspectorInterval)
247260
}
@@ -394,3 +407,13 @@ func removeLabel(labels map[string]string, key string) map[string]string {
394407

395408
return labels
396409
}
410+
411+
func anyOf(bools ...bool) bool {
412+
for _, b := range bools {
413+
if b {
414+
return true
415+
}
416+
}
417+
418+
return false
419+
}

0 commit comments

Comments
 (0)