Fallback logic

AlanCoding · AlanCoding · commit bde25db0af90 · 2025-02-18T14:25:53.000-05:00
diff --git a/ansible_base/task/tasks.py b/ansible_base/task/tasks.py
@@ -7,6 +7,8 @@
 from django.db import transaction
 from django.utils.timezone import now, timedelta
 
+from ansible_base.lib.utils.db import get_pg_notify_params
+
 from ansible_base.task.models import TASK_STATES, Task
 
 logger = logging.getLogger(__name__)
@@ -40,21 +42,31 @@ def run_task_from_queue(dispatcher):
 @task(queue='dab_broadcast')
 def manage_lost_tasks(grace_period: int = 10):
     cutoff_time = now() - timedelta(minutes=grace_period)
-    ctl = Control('dab_broadcast')
     for task in Task.objects.filter(state=TASK_STATES.RUNNING, started_at__lt=cutoff_time).iterator():
-        running_tasks = ctl.control_with_reply('running', data={'uuid': task.wrapper_uuid})
-        print(running_tasks)
-        print(task.wrapper_uuid)
-
-# TODO: Add a "reaper" fallback task, this is currently blocked on dispatcher issue
-# https://github.com/ansible/dispatcher/issues/6
-# When run_task_from_queue starts the task, the uuid needs to be setup to be discoverable
-# this fallback method will query for tasks that are older than a certain grace period
-# later improvement: look for tasks older that the task timeout plus a grace period
-# these tasks are in arrears
-# For each task in arrears, we will:
-# 1. obtain a row-level lock for that task
-# 2. send a roll-call message looking for its uuid
-# 3. if we get no answer to the roll-call, the task status is changed to lost
-# 4. dependent on policies defined in settings, we may re-submit the task up to a retry count
-# 5. if retry count is exhausted, task is failed
+        psycopg_params = get_pg_notify_params()
+        psycopg_params.pop('autocommit')  # the dispatcher adds this automatically, so passing it here causes an error; TODO: add a pre-check
+        psycopg_params.pop('cursor_factory')
+        psycopg_params.pop('context')  # TODO: remove in inner method, makes non-async, not good
+
+        ctl = Control('dab_broadcast', config=psycopg_params)
+
+        # TODO: obtain a row-level lock for the task before sending the roll-call
+
+        running_tasks = ctl.control_with_reply('running', data={'uuid': str(task.wrapper_uuid)})
+
+        found = False
+        for server_reply in running_tasks:
+            for worker_id, task_data in server_reply:
+                if task_data.get('uuid') == str(task.wrapper_uuid):
+                    found = True
+                    break
+            if found:
+                break
+
+        if not found:
+            # TODO: implement a retry policy instead of always deleting the entry
+            logger.warning(f'Could not find task {task.name} {task.wrapper_uuid}, deleting entry')
+            task.delete()
+        else:
+            delta = now() - task.started_at
+            logger.info(f'Noticed {task.name} {task.wrapper_uuid} running for {delta} seconds, seems to be fine')