diff --git a/docs/content/en/docs/Monitoring/metrics.md b/docs/content/en/docs/Monitoring/metrics.md index 8d7f8659ab..6369af14ed 100644 --- a/docs/content/en/docs/Monitoring/metrics.md +++ b/docs/content/en/docs/Monitoring/metrics.md @@ -41,9 +41,21 @@ Total number of rediscluster rebalance operations. Type: Counter. ### rediscluster_remove_follower_attempt Number of times to remove follower attempts. Type: Counter. +### rediscluster_repair_disconnected_attempt +Number of times to repair a Redis cluster disconnected from the cluster. Type: Counter. + +### rediscluster_repair_failed +Number of times to repair a Redis cluster failed. Type: Counter. + ### rediscluster_replicas_size_desired Total desired number of rediscluster replicas. Type: Gauge. +### rediscluster_reset_attempt +Number of times to reset a Redis cluster. Type: Counter. + +### rediscluster_reset_failed +Number of times to reset a Redis cluster failed. Type: Counter. + ### rediscluster_reshard_total Total number of rediscluster reshard operations. Type: Counter. 
diff --git a/internal/controller/rediscluster/rediscluster_controller.go b/internal/controller/rediscluster/rediscluster_controller.go index cc0a31ecc5..8e8e369f26 100644 --- a/internal/controller/rediscluster/rediscluster_controller.go +++ b/internal/controller/rediscluster/rediscluster_controller.go @@ -66,7 +66,7 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu } return intctrlutil.Reconciled() } - monitoring.RedisReplicationSkipReconcile.WithLabelValues(instance.Namespace, instance.Name).Set(0) + monitoring.RedisClusterSkipReconcile.WithLabelValues(instance.Namespace, instance.Name).Set(0) if common.IsSkipReconcile(ctx, instance) { monitoring.RedisClusterSkipReconcile.WithLabelValues(instance.Namespace, instance.Name).Set(1) return intctrlutil.Reconciled() @@ -230,7 +230,9 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu } logger.Info("healthy leader count does not match desired; attempting to repair disconnected masters") + monitoring.RedisClusterRepairDisconnectedAttempt.WithLabelValues(instance.Namespace, instance.Name).Inc() if err = k8sutils.RepairDisconnectedMasters(ctx, r.K8sClient, instance); err != nil { + monitoring.RedisClusterRepairDisconnectedFailed.WithLabelValues(instance.Namespace, instance.Name).Inc() logger.Error(err, "failed to repair disconnected masters") } @@ -256,7 +258,9 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Resu } if int(totalReplicas) > 1 && unhealthyNodeCount >= int(totalReplicas)-1 { logger.Info("unhealthy nodes exist after attempting to repair disconnected masters; starting failover") + monitoring.RedisClusterResetAttempt.WithLabelValues(instance.Namespace, instance.Name).Inc() if err = k8sutils.ExecuteFailoverOperation(ctx, r.K8sClient, instance); err != nil { + monitoring.RedisClusterResetFailed.WithLabelValues(instance.Namespace, instance.Name).Inc() return intctrlutil.RequeueE(ctx, err, "") } } diff --git 
a/internal/monitoring/main.go b/internal/monitoring/main.go index 7ba984490f..3b1a596a48 100644 --- a/internal/monitoring/main.go +++ b/internal/monitoring/main.go @@ -32,5 +32,9 @@ func RegisterRedisClusterMetrics() { RedisClusterRebalanceTotal, RedisClusterRemoveFollowerAttempt, RedisClusterReshardTotal, + RedisClusterRepairDisconnectedAttempt, + RedisClusterRepairDisconnectedFailed, + RedisClusterResetAttempt, + RedisClusterResetFailed, ) } diff --git a/internal/monitoring/rediscluster.go b/internal/monitoring/rediscluster.go index e5f036c06e..4eaf22248d 100644 --- a/internal/monitoring/rediscluster.go +++ b/internal/monitoring/rediscluster.go @@ -48,6 +48,30 @@ var RedisClusterDescription = map[string]MetricDescription{ Type: "Counter", labels: []string{"namespace", "instance"}, }, + "RedisClusterRepairDisconnectedAttempt": { + Name: "rediscluster_repair_disconnected_attempt", + Help: "Number of times to repair a Redis cluster disconnected from the cluster.", + Type: "Counter", + labels: []string{"namespace", "instance"}, + }, + "RedisClusterRepairDisconnectedFailed": { + Name: "rediscluster_repair_failed", + Help: "Number of times to repair a Redis cluster failed.", + Type: "Counter", + labels: []string{"namespace", "instance"}, + }, + "RedisClusterResetAttempt": { + Name: "rediscluster_reset_attempt", + Help: "Number of times to reset a Redis cluster.", + Type: "Counter", + labels: []string{"namespace", "instance"}, + }, + "RedisClusterResetFailed": { + Name: "rediscluster_reset_failed", + Help: "Number of times to reset a Redis cluster failed.", + Type: "Counter", + labels: []string{"namespace", "instance"}, + }, } var ( @@ -114,6 +138,38 @@ var ( }, RedisClusterDescription["RedisClusterAddingNodeAttempt"].labels, ) + + RedisClusterRepairDisconnectedAttempt = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: RedisClusterDescription["RedisClusterRepairDisconnectedAttempt"].Name, + Help: RedisClusterDescription["RedisClusterRepairDisconnectedAttempt"].Help, + }, 
+ RedisClusterDescription["RedisClusterRepairDisconnectedAttempt"].labels, + ) + + RedisClusterRepairDisconnectedFailed = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: RedisClusterDescription["RedisClusterRepairDisconnectedFailed"].Name, + Help: RedisClusterDescription["RedisClusterRepairDisconnectedFailed"].Help, + }, + RedisClusterDescription["RedisClusterRepairDisconnectedFailed"].labels, + ) + + RedisClusterResetAttempt = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: RedisClusterDescription["RedisClusterResetAttempt"].Name, + Help: RedisClusterDescription["RedisClusterResetAttempt"].Help, + }, + RedisClusterDescription["RedisClusterResetAttempt"].labels, + ) + + RedisClusterResetFailed = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: RedisClusterDescription["RedisClusterResetFailed"].Name, + Help: RedisClusterDescription["RedisClusterResetFailed"].Help, + }, + RedisClusterDescription["RedisClusterResetFailed"].labels, + ) ) // ListMetrics will create a slice with the metrics available in metricDescription