24 changes: 24 additions & 0 deletions internal/controller/rediscluster/rediscluster_controller.go
@@ -263,6 +263,30 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu
logger.Info("repairing unhealthy masters successful, no unhealthy masters left")
return intctrlutil.RequeueAfter(ctx, time.Second*30, "no unhealthy nodes found after repairing disconnected masters")
}

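		// Single-shard clusters (one leader, one follower) get a dedicated repair path:
		// the follower is re-introduced to the leader via CLUSTER MEET.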
if leaderReplicas == 1 && followerReplicas == 1 {
logger.Info("unhealthy nodes detected; attempting to repair disconnected nodes when in single-node cluster")
if err = k8sutils.RepairDisconnectedCluster(ctx, r.K8sClient, instance); err != nil {
logger.Error(err, "failed to repair disconnected nodes")
}

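			// Verify the repair took effect: re-check cluster health a few times before
			// treating the cluster as recovered.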
			err = retry.Do(func() error {
				nc, nErr := k8sutils.UnhealthyNodesInCluster(ctx, r.K8sClient, instance)
				if nErr != nil {
					return nErr
				}
				if nc == 0 {
					return nil
				}
				return fmt.Errorf("%d unhealthy nodes", nc)
			}, retry.Attempts(3), retry.Delay(time.Second*5))

			if err == nil {
				logger.Info("repairing unhealthy single shard cluster successful, no unhealthy nodes left")
				return intctrlutil.Requeue()
			}
		}

		// recheck whether unhealthy nodes remain after attempting to repair the masters
		unhealthyNodeCount, err = k8sutils.UnhealthyNodesInCluster(ctx, r.K8sClient, instance)
		if err != nil {
6 changes: 6 additions & 0 deletions internal/controllerutil/controller_common.go
@@ -39,3 +39,9 @@ func RequeueECheck(ctx context.Context, err error, msg string, keysAndValues ...
	}
	return RequeueE(ctx, err, msg, keysAndValues...)
}

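// Requeue returns a nil error and a reconcile.Result that asks controller-runtime to
// requeue the request immediately.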
func Requeue() (reconcile.Result, error) {
	return reconcile.Result{
		Requeue: true,
	}, nil
}
32 changes: 32 additions & 0 deletions internal/k8sutils/redis.go
@@ -142,6 +142,38 @@ func getMasterHostFromClusterNode(node clusterNodesResponse) (string, error) {
	return strings.Split(addressAndHost, ",")[1], nil
}

// RepairDisconnectedCluster attempts to repair a disconnected/failed single-shard cluster by issuing
// a CLUSTER MEET from the leader with the updated address of the follower
func RepairDisconnectedCluster(ctx context.Context, client kubernetes.Interface, cr *rcvb2.RedisCluster) error {
	redisClient := configureRedisClient(ctx, client, cr, cr.Name+"-leader-0")
	defer redisClient.Close()
	return repairDisconnectedCluster(ctx, client, cr, redisClient)
}

func repairDisconnectedCluster(ctx context.Context, client kubernetes.Interface, cr *rcvb2.RedisCluster, redisClient *redis.Client) error {
	podName := cr.Name + "-follower-0"

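	// Resolve the follower pod's current IP from the Kubernetes API; after a pod restart the
	// address recorded in the cluster topology is stale.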
	ip := getRedisServerIP(ctx, client, RedisDetails{
		PodName:   podName,
		Namespace: cr.Namespace,
	})

	if ip == "" {
		err := fmt.Errorf("failed to get IP for pod %s", podName)
		log.FromContext(ctx).Error(err, "Failed to get follower pod IP")
		return err
	}

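	// CLUSTER MEET, issued against the leader this client is connected to, re-introduces the
	// follower at its current address so the nodes can rejoin a single cluster.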
	err := redisClient.ClusterMeet(ctx, ip, strconv.Itoa(*cr.Spec.Port)).Err()
	if err != nil {
		log.FromContext(ctx).Error(err, "Failed to execute CLUSTER MEET on follower node", "PodName", podName, "IP", ip)
		return err
	}

	log.FromContext(ctx).Info("Successfully executed CLUSTER MEET for follower node", "PodName", podName, "IP", ip)
	return nil
}

// CreateMultipleLeaderRedisCommand will create command for multiple leader cluster creation
func CreateMultipleLeaderRedisCommand(ctx context.Context, client kubernetes.Interface, cr *rcvb2.RedisCluster) []string {
	cmd := []string{"redis-cli", "--cluster", "create"}
72 changes: 72 additions & 0 deletions internal/k8sutils/redis_test.go
@@ -912,3 +912,75 @@ e7d1eecce10fd6bb5eb35b9f99a514335d9ba9ca 127.0.0.1:30001@31001,hostname1 myself,
		})
	}
}

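// TestRepairDisconnectedCluster verifies that repairDisconnectedCluster resolves the follower
// pod's IP from the Kubernetes API and issues CLUSTER MEET with that IP and the configured port.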
func TestRepairDisconnectedCluster(t *testing.T) {
	ctx := context.Background()
	redisClient, mock := redismock.NewClientMock()

	namespace := "default"
	clusterName := "redis-cluster"
	followerPodIP := "10.244.0.25"
	port := 6379

	k8sClient := k8sClientFake.NewSimpleClientset(&corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      clusterName + "-follower-0",
			Namespace: namespace,
		},
		Status: corev1.PodStatus{
			PodIP: followerPodIP,
		},
	})

	mock.ExpectClusterMeet(followerPodIP, strconv.Itoa(port)).SetVal("OK")

	err := repairDisconnectedCluster(ctx, k8sClient, &rcvb2.RedisCluster{
		ObjectMeta: metav1.ObjectMeta{
			Name:      clusterName,
			Namespace: namespace,
		},
		Spec: rcvb2.RedisClusterSpec{
			Port: &port,
		},
	}, redisClient)

	assert.NoError(t, err)
	assert.NoError(t, mock.ExpectationsWereMet())
}

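// TestRepairDisconnectedClusterWithClusterMeetFailure verifies that an error returned by
// CLUSTER MEET is propagated back to the caller.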
func TestRepairDisconnectedClusterWithClusterMeetFailure(t *testing.T) {
	ctx := context.Background()
	redisClient, mock := redismock.NewClientMock()

	namespace := "default"
	clusterName := "redis-cluster"
	followerPodIP := "10.244.0.25"
	port := 6379
	expectedErr := fmt.Errorf("CLUSTER MEET command failed")

	k8sClient := k8sClientFake.NewSimpleClientset(&corev1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:      clusterName + "-follower-0",
			Namespace: namespace,
		},
		Status: corev1.PodStatus{
			PodIP: followerPodIP,
		},
	})

	mock.ExpectClusterMeet(followerPodIP, strconv.Itoa(port)).SetErr(expectedErr)

	err := repairDisconnectedCluster(ctx, k8sClient, &rcvb2.RedisCluster{
		ObjectMeta: metav1.ObjectMeta{
			Name:      clusterName,
			Namespace: namespace,
		},
		Spec: rcvb2.RedisClusterSpec{
			Port: &port,
		},
	}, redisClient)

	assert.Error(t, err)
	assert.Equal(t, expectedErr, err)
	assert.NoError(t, mock.ExpectationsWereMet())
}
@@ -0,0 +1,38 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json
apiVersion: chainsaw.kyverno.io/v1alpha1
kind: Test
metadata:
  name: redis-cluster-single-shard-follower-disconnected-v6
spec:
  description: |
    Test that a single-shard Redis cluster with HA (1 leader, 1 follower) can detect and repair
    disconnected nodes when both the leader and the follower are deleted simultaneously. This
    validates that the repair flow (RepairDisconnectedMasters plus RepairDisconnectedCluster)
    handles both master and replica node disconnections.
  steps:
    - name: Create single-shard cluster with follower
      try:
        - apply:
            file: single-shard-ha-cluster.yaml
        - apply:
            file: ../../../data-assert/resources.yaml
        - assert:
            file: ready-cluster.yaml
    - name: Simulate both leader and follower disconnection simultaneously
      try:
        - script:
            timeout: 10s
            content: >
              kubectl delete pod --namespace ${NAMESPACE} redis-single-shard-ha-leader-0 redis-single-shard-ha-follower-0
    - name: Wait for cluster to detect failures
      try:
        - assert:
            timeout: 5m
            file: failed-cluster.yaml
    - name: Wait for cluster to self-heal after both nodes disconnected
      try:
        - assert:
            timeout: 5m
            file: ready-cluster.yaml
@@ -0,0 +1,10 @@
---
apiVersion: redis.redis.opstreelabs.in/v1beta2
kind: RedisCluster
metadata:
  name: redis-single-shard-ha
status:
  state: Failed
  reason: RedisCluster has unhealthy nodes
  readyLeaderReplicas: 1
  readyFollowerReplicas: 1
@@ -0,0 +1,10 @@
---
apiVersion: redis.redis.opstreelabs.in/v1beta2
kind: RedisCluster
metadata:
  name: redis-single-shard-ha
status:
  readyLeaderReplicas: 1
  readyFollowerReplicas: 1
  state: Ready
  reason: RedisCluster is ready
@@ -0,0 +1,50 @@
---
apiVersion: redis.redis.opstreelabs.in/v1beta2
kind: RedisCluster
metadata:
  name: redis-single-shard-ha
spec:
  clusterSize: 1
  clusterVersion: v6
  podSecurityContext:
    runAsUser: 1000
    fsGroup: 1000
  redisLeader:
    replicas: 1
  redisFollower:
    replicas: 1
  kubernetesConfig:
    image: quay.io/opstree/redis:latest
    imagePullPolicy: Always
    resources:
      requests:
        cpu: 101m
        memory: 128Mi
      limits:
        cpu: 101m
        memory: 128Mi
  redisExporter:
    enabled: true
    image: quay.io/opstree/redis-exporter:v1.44.0
    imagePullPolicy: Always
    resources:
      requests:
        cpu: 100m
        memory: 128Mi
      limits:
        cpu: 100m
        memory: 128Mi
  storage:
    volumeClaimTemplate:
      spec:
        accessModes: [ReadWriteOnce]
        resources:
          requests:
            storage: 1Gi
    nodeConfVolume: true
    nodeConfVolumeClaimTemplate:
      spec:
        accessModes: [ReadWriteOnce]
        resources:
          requests:
            storage: 100Mi
@@ -0,0 +1,38 @@
---
# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json
apiVersion: chainsaw.kyverno.io/v1alpha1
kind: Test
metadata:
  name: redis-cluster-single-shard-follower-disconnected-v7
spec:
  description: |
    Test that a single-shard Redis cluster with HA (1 leader, 1 follower) can detect and repair
    disconnected nodes when both the leader and the follower are deleted simultaneously. This
    validates that the repair flow (RepairDisconnectedMasters plus RepairDisconnectedCluster)
    handles both master and replica node disconnections.
  steps:
    - name: Create single-shard cluster with follower
      try:
        - apply:
            file: single-shard-ha-cluster.yaml
        - apply:
            file: ../../../data-assert/resources.yaml
        - assert:
            file: ready-cluster.yaml
    - name: Simulate both leader and follower disconnection simultaneously
      try:
        - script:
            timeout: 10s
            content: >
              kubectl delete pod --namespace ${NAMESPACE} redis-single-shard-ha-leader-0 redis-single-shard-ha-follower-0
    - name: Wait for cluster to detect failures
      try:
        - assert:
            timeout: 5m
            file: failed-cluster.yaml
    - name: Wait for cluster to self-heal after both nodes disconnected
      try:
        - assert:
            timeout: 5m
            file: ready-cluster.yaml
@@ -0,0 +1,10 @@
---
apiVersion: redis.redis.opstreelabs.in/v1beta2
kind: RedisCluster
metadata:
  name: redis-single-shard-ha
status:
  state: Failed
  reason: RedisCluster has unhealthy nodes
  readyLeaderReplicas: 1
  readyFollowerReplicas: 1
@@ -0,0 +1,10 @@
---
apiVersion: redis.redis.opstreelabs.in/v1beta2
kind: RedisCluster
metadata:
  name: redis-single-shard-ha
status:
  readyLeaderReplicas: 1
  readyFollowerReplicas: 1
  state: Ready
  reason: RedisCluster is ready
@@ -0,0 +1,50 @@
---
apiVersion: redis.redis.opstreelabs.in/v1beta2
kind: RedisCluster
metadata:
  name: redis-single-shard-ha
spec:
  clusterSize: 1
  clusterVersion: v7
  podSecurityContext:
    runAsUser: 1000
    fsGroup: 1000
  redisLeader:
    replicas: 1
  redisFollower:
    replicas: 1
  kubernetesConfig:
    image: quay.io/opstree/redis:latest
    imagePullPolicy: Always
    resources:
      requests:
        cpu: 101m
        memory: 128Mi
      limits:
        cpu: 101m
        memory: 128Mi
  redisExporter:
    enabled: true
    image: quay.io/opstree/redis-exporter:v1.44.0
    imagePullPolicy: Always
    resources:
      requests:
        cpu: 100m
        memory: 128Mi
      limits:
        cpu: 100m
        memory: 128Mi
  storage:
    volumeClaimTemplate:
      spec:
        accessModes: [ReadWriteOnce]
        resources:
          requests:
            storage: 1Gi
    nodeConfVolume: true
    nodeConfVolumeClaimTemplate:
      spec:
        accessModes: [ReadWriteOnce]
        resources:
          requests:
            storage: 100Mi