Fix queue recovery (#204)

maxdml · kraftp · web-flow · commit 1553d98aff55 · 2025-02-07T15:54:51.000-08:00
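When performing recovery, we now re-enqueue workflows that came from a
queue instead of executing them directly. This allows tasks from a queue
to respect the queue's concurrency limits during recovery. Workflows on
the internal `_dbos_internal_queue` are still executed directly.
"Re-enqueue" means resetting the start time and executor assignment in
the queue table and setting the workflow's status back to ENQUEUED. The
existing queue entry is updated rather than recreated, which ensures the
task is re-inserted in the same position in the queue.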

---------

Co-authored-by: Peter Kraft <petereliaskraft@gmail.com>
diff --git a/dbos/_dbos.py b/dbos/_dbos.py
@@ -56,7 +56,7 @@
 )
 from ._roles import default_required_roles, required_roles
 from ._scheduler import ScheduledWorkflow, scheduled
-from ._sys_db import WorkflowStatusString, reset_system_database
+from ._sys_db import reset_system_database
 from ._tracer import dbos_tracer
 
 if TYPE_CHECKING:
@@ -613,6 +613,7 @@ def get_workflow_status(cls, workflow_id: str) -> Optional[WorkflowStatus]:
             workflow_id=workflow_id,
             status=stat["status"],
             name=stat["name"],
+            executor_id=stat["executor_id"],
             recovery_attempts=stat["recovery_attempts"],
             class_name=stat["class_name"],
             config_name=stat["config_name"],
@@ -909,6 +910,7 @@ class WorkflowStatus:
         workflow_id(str):  The ID of the workflow execution
         status(str):  The status of the execution, from `WorkflowStatusString`
         name(str): The workflow function name
+        executor_id(str): The ID of the executor running the workflow
         class_name(str): For member functions, the name of the class containing the workflow function
         config_name(str): For instance member functions, the name of the class instance for the execution
         queue_name(str): For workflows that are or were queued, the queue name
@@ -922,6 +924,7 @@ class WorkflowStatus:
     workflow_id: str
     status: str
     name: str
+    executor_id: Optional[str]
     class_name: Optional[str]
     config_name: Optional[str]
     queue_name: Optional[str]
diff --git a/dbos/_recovery.py b/dbos/_recovery.py
@@ -6,20 +6,29 @@
 
 from ._core import execute_workflow_by_id
 from ._error import DBOSWorkflowFunctionNotFoundError
+from ._sys_db import GetPendingWorkflowsOutput
 
 if TYPE_CHECKING:
     from ._dbos import DBOS, WorkflowHandle
 
 
-def startup_recovery_thread(dbos: "DBOS", workflow_ids: List[str]) -> None:
+def startup_recovery_thread(
+    dbos: "DBOS", pending_workflows: List[GetPendingWorkflowsOutput]
+) -> None:
     """Attempt to recover local pending workflows on startup using a background thread."""
     stop_event = threading.Event()
     dbos.stop_events.append(stop_event)
-    while not stop_event.is_set() and len(workflow_ids) > 0:
+    while not stop_event.is_set() and len(pending_workflows) > 0:
         try:
-            for workflowID in list(workflow_ids):
-                execute_workflow_by_id(dbos, workflowID)
-                workflow_ids.remove(workflowID)
+            for pending_workflow in list(pending_workflows):
+                if (
+                    pending_workflow.queue_name
+                    and pending_workflow.queue_name != "_dbos_internal_queue"
+                ):
+                    dbos._sys_db.clear_queue_assignment(pending_workflow.workflow_uuid)
+                else:
+                    execute_workflow_by_id(dbos, pending_workflow.workflow_uuid)
+                pending_workflows.remove(pending_workflow)  # handled; do not revisit
         except DBOSWorkflowFunctionNotFoundError:
             time.sleep(1)
         except Exception as e:
@@ -39,12 +48,23 @@ def recover_pending_workflows(
                 f"Skip local recovery because it's running in a VM: {os.environ.get('DBOS__VMID')}"
             )
         dbos.logger.debug(f"Recovering pending workflows for executor: {executor_id}")
-        workflow_ids = dbos._sys_db.get_pending_workflows(executor_id)
-        dbos.logger.debug(f"Pending workflows: {workflow_ids}")
-
-        for workflowID in workflow_ids:
-            handle = execute_workflow_by_id(dbos, workflowID)
-            workflow_handles.append(handle)
+        pending_workflows = dbos._sys_db.get_pending_workflows(executor_id)
+        for pending_workflow in pending_workflows:
+            if (
+                pending_workflow.queue_name
+                and pending_workflow.queue_name != "_dbos_internal_queue"
+            ):
+                try:
+                    dbos._sys_db.clear_queue_assignment(pending_workflow.workflow_uuid)
+                    workflow_handles.append(
+                        dbos.retrieve_workflow(pending_workflow.workflow_uuid)
+                    )
+                except Exception as e:
+                    dbos.logger.error(e)
+            else:
+                workflow_handles.append(
+                    execute_workflow_by_id(dbos, pending_workflow.workflow_uuid)
+                )
 
     dbos.logger.info("Recovered pending workflows")
     return workflow_handles
diff --git a/dbos/_sys_db.py b/dbos/_sys_db.py
@@ -140,6 +140,12 @@ def __init__(self, workflow_uuids: List[str]):
         self.workflow_uuids = workflow_uuids
 
 
+class GetPendingWorkflowsOutput:
+    def __init__(self, *, workflow_uuid: str, queue_name: Optional[str] = None):
+        self.workflow_uuid: str = workflow_uuid
+        self.queue_name: Optional[str] = queue_name
+
+
 class WorkflowInformation(TypedDict, total=False):
     workflow_uuid: str
     status: WorkflowStatuses  # The status of the workflow.
@@ -465,6 +471,7 @@ def get_workflow_status(
                     SystemSchema.workflow_status.c.authenticated_roles,
                     SystemSchema.workflow_status.c.assumed_role,
                     SystemSchema.workflow_status.c.queue_name,
+                    SystemSchema.workflow_status.c.executor_id,
                 ).where(SystemSchema.workflow_status.c.workflow_uuid == workflow_uuid)
             ).fetchone()
             if row is None:
@@ -479,7 +486,7 @@ def get_workflow_status(
                 "error": None,
                 "app_id": None,
                 "app_version": None,
-                "executor_id": None,
+                "executor_id": row[10],
                 "request": row[2],
                 "recovery_attempts": row[3],
                 "authenticated_user": row[6],
@@ -746,16 +753,27 @@ def get_queued_workflows(
 
         return GetWorkflowsOutput(workflow_uuids)
 
-    def get_pending_workflows(self, executor_id: str) -> list[str]:
+    def get_pending_workflows(
+        self, executor_id: str
+    ) -> list[GetPendingWorkflowsOutput]:
         with self.engine.begin() as c:
             rows = c.execute(
-                sa.select(SystemSchema.workflow_status.c.workflow_uuid).where(
+                sa.select(
+                    SystemSchema.workflow_status.c.workflow_uuid,
+                    SystemSchema.workflow_status.c.queue_name,
+                ).where(
                     SystemSchema.workflow_status.c.status
                     == WorkflowStatusString.PENDING.value,
                     SystemSchema.workflow_status.c.executor_id == executor_id,
                 )
             ).fetchall()
-            return [row[0] for row in rows]
+            return [
+                GetPendingWorkflowsOutput(
+                    workflow_uuid=row.workflow_uuid,
+                    queue_name=row.queue_name,
+                )
+                for row in rows
+            ]
 
     def record_operation_result(
         self, result: OperationResultInternal, conn: Optional[sa.Connection] = None
@@ -1375,6 +1393,19 @@ def remove_from_queue(self, workflow_id: str, queue: "Queue") -> None:
                     .values(completed_at_epoch_ms=int(time.time() * 1000))
                 )
 
+    def clear_queue_assignment(self, workflow_id: str) -> None:
+        with self.engine.begin() as c:
+            c.execute(
+                sa.update(SystemSchema.workflow_queue)
+                .where(SystemSchema.workflow_queue.c.workflow_uuid == workflow_id)
+                .values(executor_id=None, started_at_epoch_ms=None)
+            )
+            c.execute(
+                sa.update(SystemSchema.workflow_status)
+                .where(SystemSchema.workflow_status.c.workflow_uuid == workflow_id)
+                .values(executor_id=None, status=WorkflowStatusString.ENQUEUED.value)
+            )
+
 
 def reset_system_database(config: ConfigFile) -> None:
     sysdb_name = (
diff --git a/tests/test_failures.py b/tests/test_failures.py
@@ -9,7 +9,7 @@
 from sqlalchemy.exc import InvalidRequestError, OperationalError
 
 # Public API
-from dbos import DBOS, GetWorkflowsInput, Queue, SetWorkflowID
+from dbos import DBOS, GetWorkflowsInput, SetWorkflowID
 from dbos._error import DBOSDeadLetterQueueError, DBOSException
 from dbos._sys_db import WorkflowStatusString
 
diff --git a/tests/test_queue.py b/tests/test_queue.py
@@ -615,11 +615,17 @@ def test_step(i: int) -> int:
         original_handle = DBOS.start_workflow(test_workflow)
     for e in step_events:
         e.wait()
+        e.clear()
+
     assert step_counter == 5
 
     # Recover the workflow, then resume it.
     recovery_handles = DBOS.recover_pending_workflows()
+    # Wait until the 2nd invocation of each workflow is dequeued and executed
+    for e in step_events:
+        e.wait()
     event.set()
+
     # There should be one handle for the workflow and another for each queued step.
     assert len(recovery_handles) == queued_steps + 1
     # Verify that both the recovered and original workflows complete correctly.
@@ -639,6 +645,84 @@ def test_step(i: int) -> int:
     assert queue_entries_are_cleaned_up(dbos)
 
 
+def test_queue_concurrency_under_recovery(dbos: DBOS) -> None:
+    event = threading.Event()
+    wf_events = [threading.Event() for _ in range(2)]
+    counter = 0
+
+    @DBOS.workflow()
+    def blocked_workflow(i: int) -> None:
+        wf_events[i].set()
+        nonlocal counter
+        counter += 1
+        event.wait()
+
+    @DBOS.workflow()
+    def noop() -> None:
+        pass
+
+    queue = Queue("test_queue", concurrency=2)
+    handle1 = queue.enqueue(blocked_workflow, 0)
+    handle2 = queue.enqueue(blocked_workflow, 1)
+    handle3 = queue.enqueue(noop)
+
+    # Wait for the first two workflows to be dequeued
+    for e in wf_events:
+        e.wait()
+        e.clear()
+
+    assert counter == 2
+    assert handle1.get_status().status == WorkflowStatusString.PENDING.value
+    assert handle2.get_status().status == WorkflowStatusString.PENDING.value
+    assert handle3.get_status().status == WorkflowStatusString.ENQUEUED.value
+
+    # Manually update the database to pretend the 3rd workflow is PENDING and comes from another executor
+    with dbos._sys_db.engine.begin() as c:
+        query = (
+            sa.update(SystemSchema.workflow_status)
+            .values(status=WorkflowStatusString.PENDING.value, executor_id="other")
+            .where(
+                SystemSchema.workflow_status.c.workflow_uuid
+                == handle3.get_workflow_id()
+            )
+        )
+        c.execute(query)
+
+    # Trigger workflow recovery. The first two workflows should still be blocked but the 3rd one should be enqueued
+    recovered_other_handles = DBOS.recover_pending_workflows(["other"])
+    assert handle1.get_status().status == WorkflowStatusString.PENDING.value
+    assert handle2.get_status().status == WorkflowStatusString.PENDING.value
+    assert len(recovered_other_handles) == 1
+    assert recovered_other_handles[0].get_workflow_id() == handle3.get_workflow_id()
+    assert handle3.get_status().status == WorkflowStatusString.ENQUEUED.value
+
+    # Trigger workflow recovery for "local". The first two workflows should be re-enqueued then dequeued again
+    recovered_local_handles = DBOS.recover_pending_workflows(["local"])
+    assert len(recovered_local_handles) == 2
+    for h in recovered_local_handles:
+        assert h.get_workflow_id() in [
+            handle1.get_workflow_id(),
+            handle2.get_workflow_id(),
+        ]
+    for e in wf_events:
+        e.wait()
+    assert counter == 4
+    assert handle1.get_status().status == WorkflowStatusString.PENDING.value
+    assert handle2.get_status().status == WorkflowStatusString.PENDING.value
+    # Because tasks are re-enqueued in order, the 3rd task is head of line blocked
+    assert handle3.get_status().status == WorkflowStatusString.ENQUEUED.value
+
+    # Unblock the first two workflows
+    event.set()
+
+    # Verify all queue entries eventually get cleaned up.
+    assert handle1.get_result() == None
+    assert handle2.get_result() == None
+    assert handle3.get_result() == None
+    assert handle3.get_status().executor_id == "local"
+    assert queue_entries_are_cleaned_up(dbos)
+
+
 def test_cancelling_queued_workflows(dbos: DBOS) -> None:
     start_event = threading.Event()
     blocking_event = threading.Event()
@@ -746,17 +830,28 @@ def regular_workflow() -> None:
 
     # Attempt to recover the blocked workflow the maximum number of times
     for i in range(max_recovery_attempts):
+        start_event.clear()
         DBOS.recover_pending_workflows()
+        start_event.wait()
         assert recovery_count == i + 2
 
-    # Verify an additional recovery throws a DLQ error and puts the workflow in the DLQ status.
-    with pytest.raises(Exception) as exc_info:
-        DBOS.recover_pending_workflows()
-    assert exc_info.errisinstance(DBOSDeadLetterQueueError)
+    # Verify an additional recovery puts the workflow in the DLQ status.
+    DBOS.recover_pending_workflows()
+    # we can't start_event.wait() here because the workflow will never execute
+    time.sleep(2)
     assert (
         blocked_handle.get_status().status
         == WorkflowStatusString.RETRIES_EXCEEDED.value
     )
+    with dbos._sys_db.engine.begin() as c:
+        query = sa.select(SystemSchema.workflow_status.c.recovery_attempts).where(
+            SystemSchema.workflow_status.c.workflow_uuid
+            == blocked_handle.get_workflow_id()
+        )
+        result = c.execute(query)
+        row = result.fetchone()
+        assert row is not None
+        assert row[0] == max_recovery_attempts + 2
 
     # Verify the blocked workflow entering the DLQ lets the regular workflow run
     assert regular_handle.get_result() == None
@@ -766,6 +861,15 @@ def regular_workflow() -> None:
     assert blocked_handle.get_result() == None
     dbos._sys_db.wait_for_buffer_flush()
     assert blocked_handle.get_status().status == WorkflowStatusString.SUCCESS.value
+    with dbos._sys_db.engine.begin() as c:
+        query = sa.select(SystemSchema.workflow_status.c.recovery_attempts).where(
+            SystemSchema.workflow_status.c.workflow_uuid
+            == blocked_handle.get_workflow_id()
+        )
+        result = c.execute(query)
+        row = result.fetchone()
+        assert row is not None
+        assert row[0] == max_recovery_attempts + 2
 
     # Verify all queue entries eventually get cleaned up.
     assert queue_entries_are_cleaned_up(dbos)