
Commit e23140a

naimadswdn and Damian Seredyn authored
feat: improve RedisCluster health check with retry and multi-node validation (OT-CONTAINER-KIT#1575)
- Retry health checks 3 times with exponential backoff
- Check all leader nodes instead of only leader-0
- Add better error handling and logging

Signed-off-by: Damian Seredyn <[email protected]>
Co-authored-by: Damian Seredyn <[email protected]>
1 parent 1cc6cf1 commit e23140a

File tree

1 file changed: +62 -11 lines changed

internal/k8sutils/redis.go

Lines changed: 62 additions & 11 deletions
@@ -9,11 +9,13 @@ import (
 	"net"
 	"strconv"
 	"strings"
+	"time"

 	commonapi "github.com/OT-CONTAINER-KIT/redis-operator/api/common/v1beta2"
 	rcvb2 "github.com/OT-CONTAINER-KIT/redis-operator/api/rediscluster/v1beta2"
 	rrvb2 "github.com/OT-CONTAINER-KIT/redis-operator/api/redisreplication/v1beta2"
 	common "github.com/OT-CONTAINER-KIT/redis-operator/internal/controller/common"
+	retry "github.com/avast/retry-go"
 	redis "github.com/redis/go-redis/v9"
 	"github.com/samber/lo"
 	corev1 "k8s.io/api/core/v1"
@@ -440,30 +442,79 @@ func CheckRedisNodeCount(ctx context.Context, client kubernetes.Interface, cr *r

 // RedisClusterStatusHealth use `redis-cli --cluster check 127.0.0.1:6379`
 func RedisClusterStatusHealth(ctx context.Context, client kubernetes.Interface, cr *rcvb2.RedisCluster) bool {
-	redisClient := configureRedisClient(ctx, client, cr, cr.Name+"-leader-0")
-	defer redisClient.Close()
+	logger := log.FromContext(ctx)
+	leaderReplicas := cr.Spec.GetReplicaCounts("leader")
+
+	// Try to check cluster health from multiple leader nodes with retry logic
+	var lastErr error
+	for i := int32(0); i < leaderReplicas; i++ {
+		podName := fmt.Sprintf("%s-leader-%d", cr.Name, i)
+
+		// Retry logic with exponential backoff for each node
+		err := retry.Do(
+			func() error {
+				return checkClusterHealth(ctx, client, cr, podName)
+			},
+			retry.Attempts(3),
+			retry.Delay(500*time.Millisecond),
+			retry.DelayType(retry.BackOffDelay),
+			retry.OnRetry(func(n uint, err error) {
+				logger.V(1).Info("Retrying cluster health check", "pod", podName, "attempt", n+1, "error", err)
+			}),
+		)
+
+		if err == nil {
+			// Successfully verified cluster health from this node
+			logger.V(1).Info("Cluster health check passed", "pod", podName)
+			return true
+		}
+
+		lastErr = err
+		logger.V(1).Info("Cluster health check failed from node", "pod", podName, "error", err)
+	}
+
+	// All nodes failed the health check
+	if lastErr != nil {
+		logger.Error(lastErr, "Cluster health check failed from all leader nodes")
+	}
+	return false
+}
+
+// checkClusterHealth performs a single cluster health check against a specific pod
+func checkClusterHealth(ctx context.Context, client kubernetes.Interface, cr *rcvb2.RedisCluster, podName string) error {
+	logger := log.FromContext(ctx)

 	cmd := []string{"redis-cli", "--cluster", "check", fmt.Sprintf("127.0.0.1:%d", *cr.Spec.Port)}
 	if cr.Spec.KubernetesConfig.ExistingPasswordSecret != nil {
 		pass, err := getRedisPassword(ctx, client, cr.Namespace, *cr.Spec.KubernetesConfig.ExistingPasswordSecret.Name, *cr.Spec.KubernetesConfig.ExistingPasswordSecret.Key)
 		if err != nil {
-			log.FromContext(ctx).Error(err, "Error in getting redis password")
+			return fmt.Errorf("error getting redis password: %w", err)
 		}
-		cmd = append(cmd, "-a")
-		cmd = append(cmd, pass)
+		cmd = append(cmd, "-a", pass)
 	}
-	cmd = append(cmd, getRedisTLSArgs(cr.Spec.TLS, cr.Name+"-leader-0")...)
-	out, err := executeCommand1(ctx, client, cr, cmd, cr.Name+"-leader-0")
+	cmd = append(cmd, getRedisTLSArgs(cr.Spec.TLS, podName)...)
+
+	out, err := executeCommand1(ctx, client, cr, cmd, podName)
 	if err != nil {
-		return false
+		return fmt.Errorf("failed to execute cluster check command: %w", err)
 	}
+
+	// Check for the expected success indicators
 	// [OK] xxx keys in xxx masters.
 	// [OK] All nodes agree about slots configuration.
 	// [OK] All 16384 slots covered.
-	if strings.Count(out, "[OK]") != 3 {
-		return false
+	okCount := strings.Count(out, "[OK]")
+	if okCount != 3 {
+		logger.V(1).Info("Cluster health check output", "pod", podName, "okCount", okCount, "output", out)
+		return fmt.Errorf("cluster health check failed: expected 3 [OK] messages, got %d", okCount)
+	}
+
+	// Additional check: ensure the output contains no [ERR] lines
+	if strings.Contains(out, "[ERR]") {
+		return fmt.Errorf("cluster health check found errors in output")
 	}
-	return true
+
+	return nil
 }

 // UnhealthyNodesInCluster returns the number of unhealthy nodes in the cluster cr
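
For readers unfamiliar with the retry options used above: avast/retry-go with Attempts(3), Delay(500*time.Millisecond), and DelayType(retry.BackOffDelay) tries each node up to three times, doubling the wait between attempts (roughly 500 ms, then 1 s). Below is a minimal standalone sketch of that behavior, not part of the commit, with a hypothetical flakyCheck standing in for checkClusterHealth:

package main

import (
	"errors"
	"fmt"
	"time"

	retry "github.com/avast/retry-go"
)

func main() {
	attempts := 0
	// flakyCheck is a hypothetical stand-in for checkClusterHealth:
	// it fails on the first two calls and succeeds on the third.
	flakyCheck := func() error {
		attempts++
		if attempts < 3 {
			return errors.New("cluster not yet consistent")
		}
		return nil
	}

	err := retry.Do(
		flakyCheck,
		retry.Attempts(3),                   // at most 3 attempts per node
		retry.Delay(500*time.Millisecond),   // base delay before the first retry
		retry.DelayType(retry.BackOffDelay), // delay doubles each retry: ~500ms, then ~1s
		retry.OnRetry(func(n uint, err error) {
			fmt.Printf("attempt %d failed: %v\n", n+1, err)
		}),
	)
	fmt.Printf("final result: %v (after %d attempts)\n", err, attempts)
}

retry.Do returns nil as soon as one attempt succeeds, so the loop in RedisClusterStatusHealth reports the cluster healthy from the first leader that passes.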

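For context on the parsing in checkClusterHealth: a healthy cluster check ends with exactly three [OK] lines, as echoed in the code comments. The key and master counts below are illustrative, not taken from the commit:

[OK] 6144 keys in 3 masters.
[OK] All nodes agree about slots configuration.
[OK] All 16384 slots covered.

Any [ERR] line, or an [OK] count other than three, makes checkClusterHealth return an error, which the per-node retry loop above then handles.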