Skip to content

Commit 50fa4fd

Browse files
authored
[BUGFIX] Do not remove used DBServer (#515)
1 parent 70c40bb commit 50fa4fd

File tree

12 files changed

+295
-13
lines changed

12 files changed

+295
-13
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# Change Log
22

33
## Master
4+
- Prevent DBServer deletion if there are any shards active on it
45
- Add Maintenance mode annotation for ArangoDeployment
56

67
## [0.4.2](https://github.com/arangodb/kube-arangodb/tree/0.4.2) (2019-11-12)

pkg/apis/deployment/v1/plan.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ const (
3636
ActionTypeAddMember ActionType = "AddMember"
3737
// ActionTypeRemoveMember causes a member to be removed.
3838
ActionTypeRemoveMember ActionType = "RemoveMember"
39+
// ActionTypeRecreateMember recreates member. Used when member is still owner of some shards.
40+
ActionTypeRecreateMember ActionType = "RecreateMember"
3941
// ActionTypeCleanOutMember causes a member to be cleaned out (dbserver only).
4042
ActionTypeCleanOutMember ActionType = "CleanOutMember"
4143
// ActionTypeShutdownMember causes a member to be shutdown and removed from the cluster.

pkg/deployment/agency/agency.go

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
//
2+
// DISCLAIMER
3+
//
4+
// Copyright 2020 ArangoDB GmbH, Cologne, Germany
5+
//
6+
// Licensed under the Apache License, Version 2.0 (the "License");
7+
// you may not use this file except in compliance with the License.
8+
// You may obtain a copy of the License at
9+
//
10+
// http://www.apache.org/licenses/LICENSE-2.0
11+
//
12+
// Unless required by applicable law or agreed to in writing, software
13+
// distributed under the License is distributed on an "AS IS" BASIS,
14+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
// See the License for the specific language governing permissions and
16+
// limitations under the License.
17+
//
18+
// Copyright holder is ArangoDB GmbH, Cologne, Germany
19+
//
20+
// Author Adam Janikowski
21+
//
22+
23+
package agency
24+
25+
type ArangoPlanDatabases map[string]ArangoPlanCollections
26+
27+
func (a ArangoPlanDatabases) IsDBServerInDatabases(name string) bool {
28+
for _, collections := range a {
29+
if collections.IsDBServerInCollections(name) {
30+
return true
31+
}
32+
}
33+
return false
34+
}
35+
36+
type ArangoPlanCollections map[string]ArangoPlanCollection
37+
38+
func (a ArangoPlanCollections) IsDBServerInCollections(name string) bool {
39+
for _, collection := range a {
40+
if collection.IsDBServerInShards(name) {
41+
return true
42+
}
43+
}
44+
return false
45+
}
46+
47+
type ArangoPlanCollection struct {
48+
Shards ArangoPlanShard `json:"shards"`
49+
}
50+
51+
func (a ArangoPlanCollection) IsDBServerInShards(name string) bool {
52+
for _, dbservers := range a.Shards {
53+
for _, dbserver := range dbservers {
54+
if dbserver == name {
55+
return true
56+
}
57+
}
58+
}
59+
return false
60+
}
61+
62+
type ArangoPlanShard map[string][]string
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
//
2+
// DISCLAIMER
3+
//
4+
// Copyright 2020 ArangoDB GmbH, Cologne, Germany
5+
//
6+
// Licensed under the Apache License, Version 2.0 (the "License");
7+
// you may not use this file except in compliance with the License.
8+
// You may obtain a copy of the License at
9+
//
10+
// http://www.apache.org/licenses/LICENSE-2.0
11+
//
12+
// Unless required by applicable law or agreed to in writing, software
13+
// distributed under the License is distributed on an "AS IS" BASIS,
14+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
// See the License for the specific language governing permissions and
16+
// limitations under the License.
17+
//
18+
// Copyright holder is ArangoDB GmbH, Cologne, Germany
19+
//
20+
// Author Adam Janikowski
21+
//
22+
23+
package agency
24+
25+
const (
26+
ArangoKey = "arango"
27+
PlanKey = "Plan"
28+
PlanCollectionsKey = "Collections"
29+
)

pkg/deployment/context_impl.go

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -426,3 +426,17 @@ func (d *Deployment) DisableScalingCluster() error {
426426
func (d *Deployment) EnableScalingCluster() error {
427427
return d.clusterScalingIntegration.EnableScalingCluster()
428428
}
429+
430+
// GetAgencyPlan returns agency plan
431+
func (d *Deployment) GetAgencyData(ctx context.Context, i interface{}, keyParts ... string) error {
432+
a, err := d.GetAgency(ctx)
433+
if err != nil {
434+
return err
435+
}
436+
437+
if err = a.ReadKey(ctx, keyParts, i); err != nil {
438+
return err
439+
}
440+
441+
return err
442+
}

pkg/deployment/reconcile/action_context.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ package reconcile
2525
import (
2626
"context"
2727
"fmt"
28+
v1 "k8s.io/api/core/v1"
2829

2930
"github.com/arangodb/go-driver/agency"
3031

@@ -70,6 +71,9 @@ type ActionContext interface {
7071
// DeletePvc deletes a persistent volume claim with given name in the namespace
7172
// of the deployment. If the pvc does not exist, the error is ignored.
7273
DeletePvc(pvcName string) error
74+
// GetPvc returns PVC info about PVC with given name in the namespace
75+
// of the deployment.
76+
GetPvc(pvcName string) (*v1.PersistentVolumeClaim, error)
7377
// RemovePodFinalizers removes all the finalizers from the Pod with given name in the namespace
7478
// of the deployment. If the pod does not exist, the error is ignored.
7579
RemovePodFinalizers(podName string) error
@@ -110,6 +114,10 @@ type actionContext struct {
110114
context Context
111115
}
112116

117+
func (ac *actionContext) GetPvc(pvcName string) (*v1.PersistentVolumeClaim, error) {
118+
return ac.context.GetPvc(pvcName)
119+
}
120+
113121
// Gets the specified mode of deployment
114122
func (ac *actionContext) GetMode() api.DeploymentMode {
115123
return ac.context.GetSpec().GetMode()
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,98 @@
1+
//
2+
// DISCLAIMER
3+
//
4+
// Copyright 2020 ArangoDB GmbH, Cologne, Germany
5+
//
6+
// Licensed under the Apache License, Version 2.0 (the "License");
7+
// you may not use this file except in compliance with the License.
8+
// You may obtain a copy of the License at
9+
//
10+
// http://www.apache.org/licenses/LICENSE-2.0
11+
//
12+
// Unless required by applicable law or agreed to in writing, software
13+
// distributed under the License is distributed on an "AS IS" BASIS,
14+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
// See the License for the specific language governing permissions and
16+
// limitations under the License.
17+
//
18+
// Copyright holder is ArangoDB GmbH, Cologne, Germany
19+
//
20+
// Author Adam Janikowski
21+
//
22+
23+
package reconcile
24+
25+
import (
26+
"context"
27+
"fmt"
28+
kubeErrors "k8s.io/apimachinery/pkg/api/errors"
29+
"time"
30+
31+
"github.com/rs/zerolog"
32+
33+
api "github.com/arangodb/kube-arangodb/pkg/apis/deployment/v1"
34+
)
35+
36+
// NewRecreateMemberAction creates a new Action that implements the given
37+
// planned RecreateMember action.
38+
func NewRecreateMemberAction(log zerolog.Logger, action api.Action, actionCtx ActionContext) Action {
39+
return &actionRecreateMember{
40+
log: log,
41+
action: action,
42+
actionCtx: actionCtx,
43+
}
44+
}
45+
46+
// actionRecreateMember implements an RecreateMemberAction.
47+
type actionRecreateMember struct {
48+
log zerolog.Logger
49+
action api.Action
50+
actionCtx ActionContext
51+
}
52+
53+
// Start performs the start of the action.
54+
// Returns true if the action is completely finished, false in case
55+
// the start time needs to be recorded and a ready condition needs to be checked.
56+
func (a *actionRecreateMember) Start(ctx context.Context) (bool, error) {
57+
m, ok := a.actionCtx.GetMemberStatusByID(a.action.MemberID)
58+
if !ok {
59+
return false, fmt.Errorf("expecting member to be present in list, but it is not")
60+
}
61+
62+
_, err := a.actionCtx.GetPvc(m.PersistentVolumeClaimName)
63+
if err != nil {
64+
if kubeErrors.IsNotFound(err) {
65+
return false, fmt.Errorf("PVC is missing %s. DBServer wont be recreated without old PV", m.PersistentVolumeClaimName)
66+
}
67+
68+
return false, maskAny(err)
69+
}
70+
71+
if m.Phase == api.MemberPhaseFailed {
72+
// Change cluster phase to ensure it wont be removed
73+
m.Phase = api.MemberPhaseNone
74+
}
75+
76+
if err = a.actionCtx.UpdateMember(m); err != nil {
77+
return false, maskAny(err)
78+
}
79+
80+
return true, nil
81+
}
82+
83+
// CheckProgress checks the progress of the action.
84+
// Returns true if the action is completely finished, false otherwise.
85+
func (a *actionRecreateMember) CheckProgress(ctx context.Context) (bool, bool, error) {
86+
// Nothing todo
87+
return true, false, nil
88+
}
89+
90+
// Timeout returns the amount of time after which this action will timeout.
91+
func (a *actionRecreateMember) Timeout() time.Duration {
92+
return recreateMemberTimeout
93+
}
94+
95+
// Return the MemberID used / created in this action
96+
func (a *actionRecreateMember) MemberID() string {
97+
return a.action.MemberID
98+
}

pkg/deployment/reconcile/context.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ package reconcile
2424

2525
import (
2626
"context"
27-
2827
"github.com/arangodb/arangosync-client/client"
2928
driver "github.com/arangodb/go-driver"
3029
"github.com/arangodb/go-driver/agency"
@@ -105,4 +104,8 @@ type Context interface {
105104
DisableScalingCluster() error
106105
// EnableScalingCluster enables scaling DBservers and coordinators
107106
EnableScalingCluster() error
107+
// GetAgencyData object for key path
108+
GetAgencyData(ctx context.Context, i interface{}, keyParts ... string) error
108109
}
110+
111+

pkg/deployment/reconcile/plan_builder.go

Lines changed: 71 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,11 @@
2323
package reconcile
2424

2525
import (
26+
goContext "context"
27+
"fmt"
28+
"github.com/arangodb/kube-arangodb/pkg/deployment/agency"
29+
"time"
30+
2631
driver "github.com/arangodb/go-driver"
2732
upgraderules "github.com/arangodb/go-upgrade-rules"
2833
"github.com/rs/zerolog"
@@ -78,6 +83,27 @@ func (d *Reconciler) CreatePlan() error {
7883
return nil
7984
}
8085

86+
func fetchAgency(log zerolog.Logger,
87+
spec api.DeploymentSpec, status api.DeploymentStatus,
88+
context PlanBuilderContext) (*agency.ArangoPlanDatabases, error) {
89+
if spec.GetMode() != api.DeploymentModeCluster && spec.GetMode() != api.DeploymentModeActiveFailover {
90+
return nil, nil
91+
} else if status.Members.Agents.MembersReady() > 0 {
92+
agencyCtx, agencyCancel := goContext.WithTimeout(goContext.Background(), time.Minute)
93+
defer agencyCancel()
94+
95+
ret := &agency.ArangoPlanDatabases{}
96+
97+
if err := context.GetAgencyData(agencyCtx, ret, agency.ArangoKey, agency.PlanKey, agency.PlanCollectionsKey); err != nil {
98+
return nil, err
99+
}
100+
101+
return ret, nil
102+
} else {
103+
return nil, fmt.Errorf("not able to read from agency when agency is down")
104+
}
105+
}
106+
81107
// createPlan considers the given specification & status and creates a plan to get the status in line with the specification.
82108
// If a plan already exists, the given plan is returned with false.
83109
// Otherwise the new plan is returned with a boolean true.
@@ -90,30 +116,63 @@ func createPlan(log zerolog.Logger, apiObject k8sutil.APIObject,
90116
return currentPlan, false
91117
}
92118

119+
// Fetch agency plan
120+
agencyPlan, agencyErr := fetchAgency(log, spec, status, context)
121+
93122
// Check for various scenario's
94123
var plan api.Plan
95124

96125
// Check for members in failed state
97126
status.Members.ForeachServerGroup(func(group api.ServerGroup, members api.MemberStatusList) error {
98127
for _, m := range members {
99-
if m.Phase == api.MemberPhaseFailed && plan.IsEmpty() {
100-
log.Debug().
101-
Str("id", m.ID).
102-
Str("role", group.AsRole()).
103-
Msg("Creating member replacement plan because member has failed")
104-
newID := ""
105-
if group == api.ServerGroupAgents {
106-
newID = m.ID // Agents cannot (yet) be replaced with new IDs
128+
if m.Phase != api.MemberPhaseFailed || len(plan) > 0 {
129+
continue
130+
}
131+
132+
memberLog := log.Info().Str("id", m.ID).Str("role", group.AsRole())
133+
134+
if group == api.ServerGroupDBServers && spec.GetMode() == api.DeploymentModeCluster {
135+
// Do pre check for DBServers. If agency is down DBServers should not be touch
136+
if agencyErr != nil {
137+
memberLog.Msg("Error in agency")
138+
continue
107139
}
108-
plan = append(plan,
109-
api.NewAction(api.ActionTypeRemoveMember, group, m.ID),
110-
api.NewAction(api.ActionTypeAddMember, group, newID),
111-
)
140+
141+
if agencyPlan == nil {
142+
memberLog.Msg("AgencyPlan is nil")
143+
continue
144+
}
145+
146+
if agencyPlan.IsDBServerInDatabases(m.ID) {
147+
// DBServer still exists in agency plan! Will not be removed, but needs to be recreated
148+
memberLog.Msg("Recreating DBServer - it cannot be removed gracefully")
149+
plan = append(plan,
150+
api.NewAction(api.ActionTypeRecreateMember, group, m.ID))
151+
continue
152+
}
153+
154+
// Everything is fine, proceed
112155
}
156+
157+
memberLog.Msg("Creating member replacement plan because member has failed")
158+
newID := ""
159+
if group == api.ServerGroupAgents {
160+
newID = m.ID // Agents cannot (yet) be replaced with new IDs
161+
}
162+
plan = append(plan,
163+
api.NewAction(api.ActionTypeRemoveMember, group, m.ID),
164+
api.NewAction(api.ActionTypeAddMember, group, newID),
165+
)
113166
}
114167
return nil
115168
})
116169

170+
// Ensure that we were able to get agency info
171+
if len(plan) == 0 && agencyErr != nil {
172+
log.Err(agencyErr).Msg("unable to build further plan without access to agency")
173+
return plan, false
174+
}
175+
117176
// Check for cleaned out dbserver in created state
118177
for _, m := range status.Members.DBServers {
119178
if plan.IsEmpty() && m.Phase.IsCreatedOrDrain() && m.Conditions.IsTrue(api.ConditionTypeCleanedOut) {

0 commit comments

Comments
 (0)