
Commit 637aa34

fix: parallelize queue_dispatching in monitor loop - AAP-37345 (#1357)
Try to solve the bottleneck caused by the superlinear complexity of the current sequential, non-scalable monitor_rulebook loop. Each user request and each monitored process needs to be queue-dispatched before it can be processed by the manager. Following the same approach used for the other tasks, this change implements "lock" and "no_lock" versions of `queue_dispatch` and schedules one dispatch on the default worker for every user request and every monitored process found by the monitor. The queue dispatching can then run in parallel while consistency is preserved, and the monitor loop itself becomes a lightweight iterator that only schedules the subsequent tasks. In large clusters the monitor can scale simply by increasing the number of workers.

Jira: https://issues.redhat.com/browse/AAP-37345

Signed-off-by: Alex <[email protected]>
1 parent 1246879 commit 637aa34
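
Below is a minimal sketch of the lock/no_lock split described in the commit message. It uses an in-process threading.Lock as a stand-in for the PostgreSQL advisory lock the real code relies on, and the helper names and log text are illustrative rather than the module's actual API:

from contextlib import contextmanager
import threading

# One lock per job id; stands in for a per-key PostgreSQL advisory lock.
_locks = {}


@contextmanager
def advisory_lock(key, wait=False):
    """Yield True only if the non-blocking lock was acquired."""
    lock = _locks.setdefault(key, threading.Lock())
    acquired = lock.acquire(blocking=wait)
    try:
        yield acquired
    finally:
        if acquired:
            lock.release()


def queue_dispatch(job_id, request):
    """Public entry point: take the per-parent lock, then delegate."""
    with advisory_lock(job_id, wait=False) as acquired:
        if not acquired:
            # Another worker is already dispatching for this parent.
            print(f"queue_dispatch({job_id}) already running, skipping {request}")
            return
        queue_dispatch_no_lock(job_id, request)


def queue_dispatch_no_lock(job_id, request):
    """Body that assumes the caller already holds the lock."""
    print(f"dispatching {request} for {job_id}")

Because the lock is taken with wait=False, a concurrent dispatch for the same parent returns immediately instead of blocking a worker.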

File tree

2 files changed: +99, -28 lines


src/aap_eda/tasks/orchestrator.py

Lines changed: 34 additions & 17 deletions
@@ -176,10 +176,34 @@ def _run_request(
 
 
 def queue_dispatch(
+    process_parent_type: ProcessParentType,
+    process_parent_id: int,
+    request_type: Optional[ActivationRequest] = None,
+    request_id: str = "",
+) -> None:  # pragma: no cover
+    job_id = _manage_process_job_id(process_parent_type, process_parent_id)
+    with advisory_lock(job_id, wait=False) as acquired:
+        if not acquired:
+            LOGGER.debug(
+                f"queue_dispatch({job_id}) already being ran, "
+                f"not dispatching request {request_type}",
+            )
+            return
+        queue_dispatch_no_lock(
+            process_parent_type,
+            process_parent_id,
+            request_type,
+            request_id,
+            job_id,
+        )
+
+
+def queue_dispatch_no_lock(
     process_parent_type: ProcessParentType,
     process_parent_id: int,
     request_type: Optional[ActivationRequest],
     request_id: str = "",
+    job_id: str = "",
 ):
     """Dispatch the request to the right queue.
 
@@ -189,8 +213,6 @@ def queue_dispatch(
     checks the health of the queue before dispatching the request.
     Handles workers offline and unhealthy queues.
     """
-    job_id = _manage_process_job_id(process_parent_type, process_parent_id)
-
     # TODO: add "monitor" type to ActivationRequestQueue
     if request_type is None:
         request_type = "Monitor"
@@ -209,14 +231,6 @@
     assign_request_id(request_id)
     assign_log_tracking_id(process_parent.log_tracking_id)
 
-    with advisory_lock(job_id, wait=False) as acquired:
-        if not acquired:
-            LOGGER.debug(
-                f"_manage({job_id}) already being ran, "
-                f"not dispatching request {request_type}",
-            )
-            return
-
     LOGGER.info(
         f"Dispatching request {request_type} for {process_parent_type} "
         f"{process_parent_id}",
@@ -549,7 +563,10 @@ def monitor_rulebook_processes_no_lock() -> None:
     """
     # run pending user requests
     for request in requests_queue.list_requests():
-        queue_dispatch(
+        tasking.unique_enqueue(
+            "default",
+            "queue_dispatch_" + str(request.process_parent_id),
+            queue_dispatch,
            request.process_parent_type,
            request.process_parent_id,
            request.request,
@@ -564,12 +581,12 @@ def monitor_rulebook_processes_no_lock() -> None:
            ActivationStatus.WORKERS_OFFLINE,
        ]
    ):
-        process_parent_type = str(process.parent_type)
-        process_parent_id = process.activation_id
-
-        queue_dispatch(
-            process_parent_type,
-            process_parent_id,
+        tasking.unique_enqueue(
+            "default",
+            "queue_dispatch_" + str(process.activation_id),
+            queue_dispatch,
+            str(process.parent_type),
+            process.activation_id,
            None,
            str(uuid.uuid4()),
        )
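
The fan-out performed by the monitor loop can be pictured with the sketch below. Here unique_enqueue is a dictionary-backed stand-in for the real tasking.unique_enqueue helper, which deduplicates by job name on the "default" RQ queue, and monitor_loop and its parameters are hypothetical names used only for illustration:

import uuid

# Stand-in worker queue: job name -> (func, args). Enqueuing the same name
# twice before a worker drains it coalesces into a single pending job.
_pending = {}


def unique_enqueue(queue_name, job_name, func, *args):
    if job_name not in _pending:
        _pending[job_name] = (func, args)


def monitor_loop(pending_requests, running_processes, queue_dispatch):
    """Schedule one queue_dispatch job per user request and per running process."""
    # Pending user requests carry an explicit request type and request id.
    for parent_type, parent_id, request, request_id in pending_requests:
        unique_enqueue(
            "default",
            "queue_dispatch_" + str(parent_id),
            queue_dispatch,
            parent_type,
            parent_id,
            request,
            request_id,
        )
    # Running processes are re-checked with a "monitor" dispatch (request=None).
    for parent_type, parent_id in running_processes:
        unique_enqueue(
            "default",
            "queue_dispatch_" + str(parent_id),
            queue_dispatch,
            parent_type,
            parent_id,
            None,
            str(uuid.uuid4()),
        )

Because the job name embeds the process parent id, repeated monitor passes coalesce into at most one pending dispatch per parent, while dispatches for different parents run in parallel on whichever workers drain the default queue.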

tests/integration/tasks/test_orchestrator.py

Lines changed: 65 additions & 11 deletions
@@ -13,6 +13,8 @@
 # limitations under the License.
 
 import logging
+import time
+from concurrent.futures import ThreadPoolExecutor
 from contextlib import contextmanager
 from unittest import mock
 
@@ -202,26 +204,24 @@ def test_monitor_rulebook_processes(
     get_queue_name_mock.return_value = "activation"
     call_args = [
         mock.call(
-            "activation",
-            orchestrator._manage_process_job_id(
-                ProcessParentType.ACTIVATION, activation.id
-            ),
-            orchestrator._manage,
+            "default",
+            "queue_dispatch_" + str(activation.id),
+            orchestrator.queue_dispatch,
             ProcessParentType.ACTIVATION,
             activation.id,
+            ActivationRequest.START,
             "",
         )
     ]
     for running in bulk_running_processes:
         call_args.append(
             mock.call(
-                "activation",
-                orchestrator._manage_process_job_id(
-                    ProcessParentType.ACTIVATION, running.activation.id
-                ),
-                orchestrator._manage,
+                "default",
+                "queue_dispatch_" + str(running.activation.id),
+                orchestrator.queue_dispatch,
                 ProcessParentType.ACTIVATION,
                 running.activation.id,
+                ActivationRequest.START,
                 "",
             )
         )
@@ -237,6 +237,21 @@ def test_monitor_rulebook_processes(
     )
     orchestrator.monitor_rulebook_processes()
 
+    # Also expect calls for running processes
+    # (these will have None as request type)
+    for running in bulk_running_processes:
+        call_args.append(
+            mock.call(
+                "default",
+                "queue_dispatch_" + str(running.activation.id),
+                orchestrator.queue_dispatch,
+                str(running.parent_type),
+                running.activation.id,
+                None,
+                mock.ANY,  # UUID string
+            )
+        )
+
     enqueue_mock.assert_has_calls(call_args, any_order=True)
 
 
@@ -268,6 +283,45 @@ def advisory_lock_mock(*args, **kwargs):
     )
 
     enqueue_mock.assert_not_called()
-    assert f"_manage({job_id}) already being ran, " in eda_caplog.text
+    assert (
+        f"queue_dispatch({job_id}) already being ran, " in eda_caplog.text
+    )
     activation.refresh_from_db()
     assert activation.status == ActivationStatus.STOPPED
+
+
+@pytest.mark.django_db
+def test_queue_dispatch_advisory_lock(activation, eda_caplog):
+    """Test that queue_dispatch advisory lock prevents duplicate execution."""
+    execution_count = 0
+
+    def mock_queue_dispatch_no_lock(*args, **kwargs):
+        nonlocal execution_count
+        execution_count += 1
+        time.sleep(1.0)
+        return True
+
+    def concurrent_dispatch():
+        """Function to run queue_dispatch concurrently."""
+        with mock.patch(
+            "aap_eda.tasks.orchestrator.queue_dispatch_no_lock",
+            side_effect=mock_queue_dispatch_no_lock,
+        ):
+            orchestrator.queue_dispatch(
+                ProcessParentType.ACTIVATION,
+                activation.id,
+                ActivationRequest.START,
+                "test-request",
+            )
+
+    with ThreadPoolExecutor(max_workers=3) as executor:
+        futures = [executor.submit(concurrent_dispatch) for _ in range(3)]
+        for future in futures:
+            future.result()
+
+    assert execution_count == 1, f"Expected 1 execution, got {execution_count}"
+
+    assert (
+        "queue_dispatch" in eda_caplog.text
+        and "already being ran" in eda_caplog.text
+    )
