@@ -9,11 +9,13 @@ import (
99 "net"
1010 "strconv"
1111 "strings"
12+ "time"
1213
1314 commonapi "github.com/OT-CONTAINER-KIT/redis-operator/api/common/v1beta2"
1415 rcvb2 "github.com/OT-CONTAINER-KIT/redis-operator/api/rediscluster/v1beta2"
1516 rrvb2 "github.com/OT-CONTAINER-KIT/redis-operator/api/redisreplication/v1beta2"
1617 common "github.com/OT-CONTAINER-KIT/redis-operator/internal/controller/common"
18+ retry "github.com/avast/retry-go"
1719 redis "github.com/redis/go-redis/v9"
1820 "github.com/samber/lo"
1921 corev1 "k8s.io/api/core/v1"
@@ -440,30 +442,79 @@ func CheckRedisNodeCount(ctx context.Context, client kubernetes.Interface, cr *r
440442
441443// RedisClusterStatusHealth use `redis-cli --cluster check 127.0.0.1:6379`
442444func RedisClusterStatusHealth (ctx context.Context , client kubernetes.Interface , cr * rcvb2.RedisCluster ) bool {
443- redisClient := configureRedisClient (ctx , client , cr , cr .Name + "-leader-0" )
444- defer redisClient .Close ()
445+ logger := log .FromContext (ctx )
446+ leaderReplicas := cr .Spec .GetReplicaCounts ("leader" )
447+
448+ // Try to check cluster health from multiple leader nodes with retry logic
449+ var lastErr error
450+ for i := int32 (0 ); i < leaderReplicas ; i ++ {
451+ podName := fmt .Sprintf ("%s-leader-%d" , cr .Name , i )
452+
453+ // Retry logic with exponential backoff for each node
454+ err := retry .Do (
455+ func () error {
456+ return checkClusterHealth (ctx , client , cr , podName )
457+ },
458+ retry .Attempts (3 ),
459+ retry .Delay (500 * time .Millisecond ),
460+ retry .DelayType (retry .BackOffDelay ),
461+ retry .OnRetry (func (n uint , err error ) {
462+ logger .V (1 ).Info ("Retrying cluster health check" , "pod" , podName , "attempt" , n + 1 , "error" , err )
463+ }),
464+ )
465+
466+ if err == nil {
467+ // Successfully verified cluster health from this node
468+ logger .V (1 ).Info ("Cluster health check passed" , "pod" , podName )
469+ return true
470+ }
471+
472+ lastErr = err
473+ logger .V (1 ).Info ("Cluster health check failed from node" , "pod" , podName , "error" , err )
474+ }
475+
476+ // All nodes failed the health check
477+ if lastErr != nil {
478+ logger .Error (lastErr , "Cluster health check failed from all leader nodes" )
479+ }
480+ return false
481+ }
482+
483+ // checkClusterHealth performs a single cluster health check against a specific pod
484+ func checkClusterHealth (ctx context.Context , client kubernetes.Interface , cr * rcvb2.RedisCluster , podName string ) error {
485+ logger := log .FromContext (ctx )
445486
446487 cmd := []string {"redis-cli" , "--cluster" , "check" , fmt .Sprintf ("127.0.0.1:%d" , * cr .Spec .Port )}
447488 if cr .Spec .KubernetesConfig .ExistingPasswordSecret != nil {
448489 pass , err := getRedisPassword (ctx , client , cr .Namespace , * cr .Spec .KubernetesConfig .ExistingPasswordSecret .Name , * cr .Spec .KubernetesConfig .ExistingPasswordSecret .Key )
449490 if err != nil {
450- log . FromContext ( ctx ). Error ( err , "Error in getting redis password" )
491+ return fmt . Errorf ( "error getting redis password: %w" , err )
451492 }
452- cmd = append (cmd , "-a" )
453- cmd = append (cmd , pass )
493+ cmd = append (cmd , "-a" , pass )
454494 }
455- cmd = append (cmd , getRedisTLSArgs (cr .Spec .TLS , cr .Name + "-leader-0" )... )
456- out , err := executeCommand1 (ctx , client , cr , cmd , cr .Name + "-leader-0" )
495+ cmd = append (cmd , getRedisTLSArgs (cr .Spec .TLS , podName )... )
496+
497+ out , err := executeCommand1 (ctx , client , cr , cmd , podName )
457498 if err != nil {
458- return false
499+ return fmt . Errorf ( "failed to execute cluster check command: %w" , err )
459500 }
501+
502+ // Check for the expected success indicators
460503 // [OK] xxx keys in xxx masters.
461504 // [OK] All nodes agree about slots configuration.
462505 // [OK] All 16384 slots covered.
463- if strings .Count (out , "[OK]" ) != 3 {
464- return false
506+ okCount := strings .Count (out , "[OK]" )
507+ if okCount != 3 {
508+ logger .V (1 ).Info ("Cluster health check output" , "pod" , podName , "okCount" , okCount , "output" , out )
509+ return fmt .Errorf ("cluster health check failed: expected 3 [OK] messages, got %d" , okCount )
510+ }
511+
512+ // Additional check: ensure no [ERR] or [WARNING] in critical lines
513+ if strings .Contains (out , "[ERR]" ) {
514+ return fmt .Errorf ("cluster health check found errors in output" )
465515 }
466- return true
516+
517+ return nil
467518}
468519
469520// UnhealthyNodesInCluster returns the number of unhealthy nodes in the cluster cr
0 commit comments