Skip to content

Commit c8aeadf

Browse files
authored
fix: Prevent data loss by removing dangerous failover operation (#1505)
fix: prevent data loss by removing dangerous failover operation

Remove automatic failover execution when the cluster is severely damaged (unhealthy nodes >= total-1) to prevent triggering FLUSHALL commands that can cause complete data loss. Instead, require manual intervention for such critical scenarios.

This addresses issues where restarting all Pods in Kubernetes could trigger cluster resets that fail and fall back to FLUSHALL, wiping all data.

Fixes: #1069, #1164
Signed-off-by: yangw <[email protected]>
1 parent a5515da commit c8aeadf

File tree

1 file changed

+2
-5
lines changed

1 file changed

+2
-5
lines changed

internal/controller/rediscluster/rediscluster_controller.go

Lines changed: 2 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -266,13 +266,10 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
266266
// recheck if there's still a lot of unhealthy nodes after attempting to repair the masters
267267
unhealthyNodeCount, err = k8sutils.UnhealthyNodesInCluster(ctx, r.K8sClient, instance)
268268
if err != nil {
269-
logger.Error(err, "failed to determine unhealthy node count in cluster")
269+
return intctrlutil.RequeueE(ctx, err, "failed to determine unhealthy node count in cluster")
270270
}
271271
if int(totalReplicas) > 1 && unhealthyNodeCount >= int(totalReplicas)-1 {
272-
logger.Info("unhealthy nodes exist after attempting to repair disconnected masters; starting failover")
273-
if err = k8sutils.ExecuteFailoverOperation(ctx, r.K8sClient, instance); err != nil {
274-
return intctrlutil.RequeueE(ctx, err, "")
275-
}
272+
return intctrlutil.RequeueE(ctx, fmt.Errorf("cluster broken: %d/%d nodes unhealthy, manual intervention required", unhealthyNodeCount, totalReplicas), "")
276273
}
277274
}
278275

0 commit comments

Comments
 (0)