fix: Prevent data loss by removing dangerous failover operation (#1505)

drivebyer · web-flow · commit c8aeadfea627 · 2025-09-04T09:46:19.000+08:00
fix: prevent data loss by removing dangerous failover operation

Remove automatic failover execution when the cluster is severely damaged (unhealthy nodes >= total-1) to prevent triggering FLUSHALL commands that can cause complete data loss. Instead, require manual intervention for such critical scenarios.

This addresses issues where restarting all Pods in Kubernetes could trigger cluster resets that fail and fall back to FLUSHALL, wiping all data.

Fixes: #1069, #1164

Signed-off-by: yangw <wuyangmuc@gmail.com>
diff --git a/internal/controller/rediscluster/rediscluster_controller.go b/internal/controller/rediscluster/rediscluster_controller.go
@@ -266,13 +266,10 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
 		// recheck if there's still a lot of unhealthy nodes after attempting to repair the masters
 		unhealthyNodeCount, err = k8sutils.UnhealthyNodesInCluster(ctx, r.K8sClient, instance)
 		if err != nil {
-			logger.Error(err, "failed to determine unhealthy node count in cluster")
+			return intctrlutil.RequeueE(ctx, err, "failed to determine unhealthy node count in cluster")
 		}
 		if int(totalReplicas) > 1 && unhealthyNodeCount >= int(totalReplicas)-1 {
-			logger.Info("unhealthy nodes exist after attempting to repair disconnected masters; starting failover")
-			if err = k8sutils.ExecuteFailoverOperation(ctx, r.K8sClient, instance); err != nil {
-				return intctrlutil.RequeueE(ctx, err, "")
-			}
+			return intctrlutil.RequeueE(ctx, fmt.Errorf("cluster broken: %d/%d nodes unhealthy, manual intervention required", unhealthyNodeCount, totalReplicas), "")
 		}
 	}
 

Original file line number	Diff line number	Diff line change
`@@ -266,13 +266,10 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu`
`266`	`266`	`// recheck if there's still a lot of unhealthy nodes after attempting to repair the masters`
`267`	`267`	`unhealthyNodeCount, err = k8sutils.UnhealthyNodesInCluster(ctx, r.K8sClient, instance)`
`268`	`268`	`if err != nil {`
`269`		`- logger.Error(err, "failed to determine unhealthy node count in cluster")`
	`269`	`+ return intctrlutil.RequeueE(ctx, err, "failed to determine unhealthy node count in cluster")`
`270`	`270`	`}`
`271`	`271`	`if int(totalReplicas) > 1 && unhealthyNodeCount >= int(totalReplicas)-1 {`
`272`		`- logger.Info("unhealthy nodes exist after attempting to repair disconnected masters; starting failover")`
`273`		`- if err = k8sutils.ExecuteFailoverOperation(ctx, r.K8sClient, instance); err != nil {`
`274`		`- return intctrlutil.RequeueE(ctx, err, "")`
`275`		`- }`
	`272`	`+ return intctrlutil.RequeueE(ctx, fmt.Errorf("cluster broken: %d/%d nodes unhealthy, manual intervention required", unhealthyNodeCount, totalReplicas), "")`
`276`	`273`	`}`
`277`	`274`	`}`
`278`	`275`