
Commit ed32e37

Authored by john-westcott-iv, jshimkus-rh, and Alex-Izquierdo

Redis cluster error handling (#1066)

Co-authored-by: Joe Shimkus <[email protected]>
Co-authored-by: Alex <[email protected]>
1 parent e1912cf · commit ed32e37

File tree: 3 files changed, +166 −2 lines

src/aap_eda/core/management/commands/scheduler.py

Lines changed: 74 additions & 1 deletion
@@ -70,9 +70,13 @@
     https://github.com/rq/rq-scheduler/blob/master/README.rst
 """
 import logging
+import re
 from datetime import datetime
+from time import sleep
 
 import django_rq
+import redis
+from ansible_base.lib.redis.client import DABRedisCluster
 from django.conf import settings
 from django_rq.management.commands import rqscheduler
 from rq_scheduler import Scheduler
@@ -149,4 +153,73 @@ def handle(self, *args, **options) -> None:
         add_startup_jobs(scheduler)
         add_periodic_jobs(scheduler)
         add_cron_jobs(scheduler)
-        super().handle(*args, **options)
+        # We are going to start our own loop here to catch exceptions which
+        # might be coming from a redis cluster and retry things.
+        while True:
+            try:
+                super().handle(*args, **options)
+            except (
+                redis.exceptions.TimeoutError,
+                redis.exceptions.ClusterDownError,
+                redis.exceptions.ConnectionError,
+            ) as e:
+                # If we got one of these exceptions but are not on a Cluster,
+                # go ahead and raise it normally.
+                if not isinstance(scheduler.connection, DABRedisCluster):
+                    raise
+
+                # There are a lot of different exceptions that inherit from
+                # ConnectionError, so we need to make sure that what we caught
+                # is an actual ConnectionError. If not, go ahead and raise it.
+                # Note: ClusterDownError and TimeoutError are not subclasses
+                # of ConnectionError.
+                if (
+                    isinstance(e, redis.exceptions.ConnectionError)
+                    and type(e) is not redis.exceptions.ConnectionError
+                ):
+                    raise
+
+                downed_node_ip = re.findall(
+                    r"[0-9]+(?:\.[0-9]+){3}:[0-9]+", str(e)
+                )
+
+                # If we got a cluster issue we will loop here until we can
+                # ping the server again.
+                max_backoff = 60
+                current_backoff = 1
+                while True:
+                    if current_backoff > max_backoff:
+                        # Maybe we just got a network glitch and are waiting
+                        # for a cluster member to fail when it's not going to.
+                        # At this point we've waited for 60 secs, so let's go
+                        # ahead and let the scheduler try to restart.
+                        logger.error(
+                            "Connection to redis is still down, "
+                            "going to attempt to restart the scheduler"
+                        )
+                        break
+
+                    backoff = min(current_backoff, max_backoff)
+                    logger.error(
+                        f"Connection to redis cluster failed. Attempting to "
+                        f"reconnect in {backoff}"
+                    )
+                    sleep(backoff)
+                    current_backoff = 2 * current_backoff
+                    try:
+                        if downed_node_ip:
+                            cluster_nodes = (
+                                scheduler.connection.cluster_nodes()
+                            )
+                            for ip in downed_node_ip:
+                                if "fail" not in cluster_nodes[ip]["flags"]:
+                                    raise Exception(
+                                        "Failed node is not yet in a failed "
+                                        "state"
+                                    )
+                        else:
+                            scheduler.connection.ping()
+                        break
+                    # We could tighten this exception up
+                    except Exception:
+                        pass
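The hunk above does two things: it backs off exponentially, and, when the error message names a specific node, it waits for that node to be flagged as failed before letting the scheduler restart. A minimal standalone sketch of that wait strategy follows; the function name wait_for_cluster and its signature are illustrative and not part of this commit, and it assumes a redis-py style cluster client whose cluster_nodes() returns a dict keyed by "ip:port" with a "flags" field, which is what the committed code relies on.

import re
from time import sleep


def wait_for_cluster(connection, error, max_backoff=60):
    """Back off and poll until the cluster looks usable again (illustrative)."""
    # Pull any "ip:port" the error message names; those are the suspect nodes.
    downed_nodes = re.findall(r"[0-9]+(?:\.[0-9]+){3}:[0-9]+", str(error))
    current_backoff = 1
    while current_backoff <= max_backoff:
        sleep(min(current_backoff, max_backoff))
        current_backoff *= 2
        try:
            if downed_nodes:
                nodes = connection.cluster_nodes()
                # Wait until every suspect node is flagged "fail" so the
                # cluster has had a chance to fail over to a replica.
                if any("fail" not in nodes[ip]["flags"] for ip in downed_nodes):
                    continue
            else:
                connection.ping()
            return True  # reachable again; the caller can resume its work
        except Exception:  # deliberately broad, mirroring the commit
            continue
    return False  # still down after ~60s of backoff; let the caller restart anyway

Waiting for the suspect node to reach the "fail" state before resuming gives the cluster time to promote a replica, rather than reconnecting while the topology is still settling.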

src/aap_eda/core/tasking/__init__.py

Lines changed: 87 additions & 0 deletions
@@ -3,6 +3,7 @@
 
 import logging
 from datetime import datetime, timedelta
+from time import sleep
 from types import MethodType
 from typing import (
     Any,
@@ -330,6 +331,7 @@ def __init__(
             prepare_for_work=prepare_for_work,
             serializer=JSONSerializer,
         )
+        self.is_shutting_down = False
 
     def _set_connection(
         self,
@@ -411,6 +413,91 @@ def handle_job_success(
 
         pipeline.execute()
 
+    def handle_warm_shutdown_request(self):
+        self.is_shutting_down = True
+        super().handle_warm_shutdown_request()
+
+    # We are going to override the work function to create our own loop.
+    # This will allow us to catch exceptions that the default work method
+    # will not handle and restart our worker process if we hit them.
+    def work(
+        self,
+        burst: bool = False,
+        logging_level: str = "INFO",
+        date_format: str = rq.defaults.DEFAULT_LOGGING_DATE_FORMAT,
+        log_format: str = rq.defaults.DEFAULT_LOGGING_FORMAT,
+        max_jobs: Optional[int] = None,
+        with_scheduler: bool = False,
+    ) -> bool:
+        while True:
+            # super().work() returns a value that we want to return on a
+            # normal exit.
+            return_value = None
+            try:
+                return_value = super().work(
+                    burst,
+                    logging_level,
+                    date_format,
+                    log_format,
+                    max_jobs,
+                    with_scheduler,
+                )
+            except (
+                redis.exceptions.TimeoutError,
+                redis.exceptions.ClusterDownError,
+                redis.exceptions.ConnectionError,
+            ) as e:
+                # If we got one of these exceptions but are not on a Cluster,
+                # go ahead and raise it normally.
+                if not isinstance(self.connection, DABRedisCluster):
+                    raise
+
+                # There are a lot of different exceptions that inherit from
+                # ConnectionError, so we need to make sure that what we caught
+                # is an actual ConnectionError. If not, go ahead and raise it.
+                # Note: ClusterDownError and TimeoutError are not subclasses
+                # of ConnectionError.
+                if (
+                    isinstance(e, redis.exceptions.ConnectionError)
+                    and type(e) is not redis.exceptions.ConnectionError
+                ):
+                    raise
+
+                # If we got a cluster issue we will loop here until we can
+                # ping the server again.
+                max_backoff = 60
+                current_backoff = 1
+                while True:
+                    backoff = min(current_backoff, max_backoff)
+                    logger.error(
+                        f"Connection to redis cluster failed. Attempting to "
+                        f"reconnect in {backoff}"
+                    )
+                    sleep(backoff)
+                    current_backoff = 2 * current_backoff
+                    try:
+                        self.connection.ping()
+                        break
+                    # We could tighten this exception up.
+                    except Exception:
+                        pass
+                # At this point return_value is None, so we are going to go
+                # ahead and fall through to the loop to restart.
+
+            # We are outside of the work function with either:
+            #   a normal exit, or
+            #   an exit that did not raise an exception
+            if return_value:
+                logger.debug(f"Work exited normally with {return_value}")
+                return return_value
+            elif self.is_shutting_down:
+                # We got a warm shutdown request, let's respect it
+                return return_value
+            else:
+                logger.error(
+                    "Work exited with no return value, going to restart the worker"
+                )
+
 
 class DefaultWorker(Worker):
     """Custom default worker class used for non-activation tasks.

src/aap_eda/settings/default.py

Lines changed: 5 additions & 1 deletion
@@ -453,10 +453,14 @@ def rq_redis_client_instantiation_parameters():
     params["socket_connect_timeout"] = settings.get(
         "MQ_SOCKET_CONNECT_TIMEOUT", 10
     )
-    params["socket_timeout"] = settings.get("MQ_SOCKET_TIMEOUT", 10)
+    params["socket_timeout"] = settings.get("MQ_SOCKET_TIMEOUT", 150)
     params["cluster_error_retry_attempts"] = settings.get(
         "MQ_CLUSTER_ERROR_RETRY_ATTEMPTS", 3
     )
+    from redis.backoff import ConstantBackoff
+    from redis.retry import Retry
+
+    params["retry"] = Retry(backoff=ConstantBackoff(3), retries=20)
     return params
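For context, the parameters assembled here end up on the Redis client roughly as in the sketch below; host and port are placeholders, and it assumes DABRedisCluster forwards these kwargs to redis-py's RedisCluster.

from redis.backoff import ConstantBackoff
from redis.cluster import RedisCluster
from redis.retry import Retry

client = RedisCluster(
    host="localhost",                # placeholder
    port=6379,                       # placeholder
    socket_connect_timeout=10,       # MQ_SOCKET_CONNECT_TIMEOUT default
    socket_timeout=150,              # MQ_SOCKET_TIMEOUT default raised by this commit
    cluster_error_retry_attempts=3,  # MQ_CLUSTER_ERROR_RETRY_ATTEMPTS default
    # Ask redis-py to retry a failing command up to 20 times, 3 seconds apart,
    # before the exception propagates to the new loops above.
    retry=Retry(backoff=ConstantBackoff(3), retries=20),
)
client.ping()

With ConstantBackoff(3) and retries=20, the client itself absorbs roughly a minute of transient failure before the exception ever reaches the scheduler and worker loops added in this commit.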