Skip to content

Commit ebd0dfd

Browse files
authored
[Bugfix] Allow shards with RF1 in EnforcedResignLeadership action (#1441)
1 parent 83c5c83 commit ebd0dfd

File tree

3 files changed

+37
-5
lines changed

3 files changed

+37
-5
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
- (Feature) EnforcedResignLeadership action
1414
- (Maintenance) Make scale_down_candidate annotation obsolete
1515
- (Bugfix) Fix ResignJob ID propagation
16+
- (Bugfix) Allow shards with RF1 in EnforcedResignLeadership action
1617

1718
## [1.2.33](https://github.com/arangodb/kube-arangodb/tree/1.2.33) (2023-09-27)
1819
- (Maintenance) Bump golang.org/x/net to v0.13.0

pkg/deployment/agency/state/state.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,6 +238,30 @@ func (s State) PlanLeaderServers() Servers {
238238
return r
239239
}
240240

241+
// PlanLeaderServersWithFailOver returns all servers which are part of the plan as a leader and can fail over
242+
func (s State) PlanLeaderServersWithFailOver() Servers {
243+
q := map[Server]bool{}
244+
245+
for _, db := range s.Plan.Collections {
246+
for _, col := range db {
247+
for _, shards := range col.Shards {
248+
if len(shards) <= 1 {
249+
continue
250+
}
251+
q[shards[0]] = true
252+
}
253+
}
254+
}
255+
256+
r := make([]Server, 0, len(q))
257+
258+
for k := range q {
259+
r = append(r, k)
260+
}
261+
262+
return r
263+
}
264+
241265
type CollectionShardDetails []CollectionShardDetail
242266

243267
type CollectionShardDetail struct {

pkg/deployment/reconcile/action_enforce_resign_leadership.go

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -103,30 +103,37 @@ func (a *actionEnforceResignLeadership) CheckProgress(ctx context.Context) (bool
103103
}
104104

105105
// Let's start resign job if required
106-
if j, ok := a.actionCtx.Get(a.action, resignLeadershipJobID); ok && j != "" {
106+
if j, ok := a.actionCtx.Get(a.action, resignLeadershipJobID); ok && j != "" && j != "N/A" {
107107
_, jobStatus := agencyState.Target.GetJob(state.JobID(j))
108108
switch jobStatus {
109109
case state.JobPhaseFailed:
110110
a.log.Error("Resign server job failed")
111111
// Remove key
112-
a.actionCtx.Add(resignLeadershipJobID, "", true)
112+
a.actionCtx.Add(resignLeadershipJobID, "N/A", true)
113113
return false, false, nil
114114
case state.JobPhaseFinished:
115115
a.log.Info("Job finished")
116116
// Remove key
117-
a.actionCtx.Add(resignLeadershipJobID, "", true)
117+
a.actionCtx.Add(resignLeadershipJobID, "N/A", true)
118118
case state.JobPhaseUnknown:
119119
a.log.Str("status", string(jobStatus)).Error("Resign server job unknown status")
120120
return false, false, nil
121121
default:
122122
return false, false, nil
123123
}
124124

125+
a.actionCtx.Add(resignLeadershipJobID, "N/A", true)
126+
125127
// Job is Finished, check if we are not a leader anymore
126128
if agencyState.PlanLeaderServers().Contains(state.Server(m.ID)) {
127129
// We are still a leader!
128-
a.log.Warn("DBServers is still a leader for shards")
129-
return false, false, nil
130+
if agencyState.PlanLeaderServersWithFailOver().Contains(state.Server(m.ID)) {
131+
// We need to retry
132+
a.log.Warn("DBServer is still a leader for shards")
133+
return false, false, nil
134+
}
135+
// Nothing to do as RF is set to 1
136+
a.log.Warn("DBServer is still a leader for shards, but ReplicationFactor is set to 1")
130137
}
131138
return true, false, nil
132139
}

0 commit comments

Comments
 (0)