remove explicit notion of recovery (#198)

maxdml · web-flow · commit ddc59a810075 · 2025-02-04T13:27:48.000-08:00
This PR simplifies the logic for putting a workflow in the dead letter
queue. Specifically, it changes the meaning of the `recovery_attempts`
column in the `dbos.workflow_status` table to `attempts`[*].

The default of the column is now 1.

`update_workflow_status` just verifies that (number of attempts) <=
max_retries before placing a workflow in the DLQ.

[*] This PR does not change the column name, to facilitate backward
compatibility.
diff --git a/dbos/_context.py b/dbos/_context.py
@@ -63,7 +63,6 @@ def __init__(self) -> None:
         self.parent_workflow_fid: int = -1
         self.workflow_id: str = ""
         self.function_id: int = -1
-        self.in_recovery: bool = False
 
         self.curr_step_function_id: int = -1
         self.curr_tx_function_id: int = -1
@@ -82,7 +81,6 @@ def create_child(self) -> DBOSContext:
         rv.is_within_set_workflow_id_block = self.is_within_set_workflow_id_block
         rv.parent_workflow_id = self.workflow_id
         rv.parent_workflow_fid = self.function_id
-        rv.in_recovery = self.in_recovery
         rv.authenticated_user = self.authenticated_user
         rv.authenticated_roles = (
             self.authenticated_roles[:]
@@ -335,34 +333,6 @@ def __exit__(
         return False  # Did not handle
 
 
-class SetWorkflowRecovery:
-    def __init__(self) -> None:
-        self.created_ctx = False
-
-    def __enter__(self) -> SetWorkflowRecovery:
-        # Code to create a basic context
-        ctx = get_local_dbos_context()
-        if ctx is None:
-            self.created_ctx = True
-            _set_local_dbos_context(DBOSContext())
-        assert_current_dbos_context().in_recovery = True
-
-        return self
-
-    def __exit__(
-        self,
-        exc_type: Optional[Type[BaseException]],
-        exc_value: Optional[BaseException],
-        traceback: Optional[TracebackType],
-    ) -> Literal[False]:
-        assert assert_current_dbos_context().in_recovery == True
-        assert_current_dbos_context().in_recovery = False
-        # Code to clean up the basic context if we created it
-        if self.created_ctx:
-            _clear_local_dbos_context()
-        return False  # Did not handle
-
-
 class EnterDBOSWorkflow(AbstractContextManager[DBOSContext, Literal[False]]):
     def __init__(self, attributes: TracedAttributes) -> None:
         self.created_ctx = False
diff --git a/dbos/_core.py b/dbos/_core.py
@@ -186,7 +186,7 @@ def _init_workflow(
         # We also have to do this for single-step workflows because of the foreign key constraint on the operation outputs table
         # TODO: Make this transactional (and with the queue step below)
         wf_status = dbos._sys_db.update_workflow_status(
-            status, False, ctx.in_recovery, max_recovery_attempts=max_recovery_attempts
+            status, False, max_recovery_attempts=max_recovery_attempts
         )
         # TODO: Modify the inputs if they were changed by `update_workflow_inputs`
         dbos._sys_db.update_workflow_inputs(wfid, _serialization.serialize_args(inputs))
diff --git a/dbos/_dbos.py b/dbos/_dbos.py
@@ -801,7 +801,7 @@ def recover_pending_workflows(
     def cancel_workflow(cls, workflow_id: str) -> None:
         """Cancel a workflow by ID."""
         _get_dbos_instance()._sys_db.set_workflow_status(
-            workflow_id, WorkflowStatusString.CANCELLED, False
+            workflow_id, WorkflowStatusString.CANCELLED
         )
 
     @classmethod
diff --git a/dbos/_recovery.py b/dbos/_recovery.py
@@ -4,7 +4,6 @@
 import traceback
 from typing import TYPE_CHECKING, Any, List
 
-from ._context import SetWorkflowRecovery
 from ._core import execute_workflow_by_id
 from ._error import DBOSWorkflowFunctionNotFoundError
 
@@ -19,8 +18,7 @@ def startup_recovery_thread(dbos: "DBOS", workflow_ids: List[str]) -> None:
     while not stop_event.is_set() and len(workflow_ids) > 0:
         try:
             for workflowID in list(workflow_ids):
-                with SetWorkflowRecovery():
-                    execute_workflow_by_id(dbos, workflowID)
+                execute_workflow_by_id(dbos, workflowID)
                 workflow_ids.remove(workflowID)
         except DBOSWorkflowFunctionNotFoundError:
             time.sleep(1)
@@ -45,8 +43,7 @@ def recover_pending_workflows(
         dbos.logger.debug(f"Pending workflows: {workflow_ids}")
 
         for workflowID in workflow_ids:
-            with SetWorkflowRecovery():
-                handle = execute_workflow_by_id(dbos, workflowID)
+            handle = execute_workflow_by_id(dbos, workflowID)
             workflow_handles.append(handle)
 
     dbos.logger.info("Recovered pending workflows")
diff --git a/dbos/_sys_db.py b/dbos/_sys_db.py
@@ -247,10 +247,10 @@ def update_workflow_status(
         self,
         status: WorkflowStatusInternal,
         replace: bool = True,
-        in_recovery: bool = False,
         *,
         conn: Optional[sa.Connection] = None,
         max_recovery_attempts: int = DEFAULT_MAX_RECOVERY_ATTEMPTS,
+        is_status_flush: bool = False,
     ) -> WorkflowStatuses:
         wf_status: WorkflowStatuses = status["status"]
 
@@ -270,6 +270,9 @@ def update_workflow_status(
             authenticated_roles=status["authenticated_roles"],
             assumed_role=status["assumed_role"],
             queue_name=status["queue_name"],
+            recovery_attempts=(
+                1 if wf_status != WorkflowStatusString.ENQUEUED.value else 0
+            ),
         )
         if replace:
             cmd = cmd.on_conflict_do_update(
@@ -278,24 +281,25 @@ def update_workflow_status(
                     status=status["status"],
                     output=status["output"],
                     error=status["error"],
-                ),
-            )
-        elif in_recovery:
-            cmd = cmd.on_conflict_do_update(
-                index_elements=["workflow_uuid"],
-                set_=dict(
-                    recovery_attempts=SystemSchema.workflow_status.c.recovery_attempts
-                    + 1,
+                    recovery_attempts=(
+                        SystemSchema.workflow_status.c.recovery_attempts + 1
+                        if not is_status_flush
+                        else SystemSchema.workflow_status.c.recovery_attempts
+                    ),
                 ),
             )
         else:
-            # A blank update so that we can return the existing status
             cmd = cmd.on_conflict_do_update(
                 index_elements=["workflow_uuid"],
                 set_=dict(
-                    recovery_attempts=SystemSchema.workflow_status.c.recovery_attempts
+                    recovery_attempts=(
+                        SystemSchema.workflow_status.c.recovery_attempts + 1
+                        if not is_status_flush
+                        else SystemSchema.workflow_status.c.recovery_attempts
+                    ),
                 ),
             )
+
         cmd = cmd.returning(SystemSchema.workflow_status.c.recovery_attempts, SystemSchema.workflow_status.c.status, SystemSchema.workflow_status.c.name, SystemSchema.workflow_status.c.class_name, SystemSchema.workflow_status.c.config_name, SystemSchema.workflow_status.c.queue_name)  # type: ignore
 
         if conn is not None:
@@ -325,7 +329,10 @@ def update_workflow_status(
             if err_msg is not None:
                 raise DBOSConflictingWorkflowError(status["workflow_uuid"], err_msg)
 
-            if in_recovery and recovery_attempts > max_recovery_attempts:
+            # `recovery_attempts` means "attempts" (we kept the name for backward compatibility). Its default value is 1.
+            # Every time we init the status, we increment `recovery_attempts` by 1.
+            # Thus, when this number becomes equal to `maxRetries + 1`, we should mark the workflow as `RETRIES_EXCEEDED`.
+            if recovery_attempts > max_recovery_attempts + 1:
                 with self.engine.begin() as c:
                     c.execute(
                         sa.delete(SystemSchema.workflow_queue).where(
@@ -362,7 +369,6 @@ def set_workflow_status(
         self,
         workflow_uuid: str,
         status: WorkflowStatusString,
-        reset_recovery_attempts: bool,
     ) -> None:
         with self.engine.begin() as c:
             stmt = (
@@ -374,17 +380,6 @@ def set_workflow_status(
             )
             c.execute(stmt)
 
-        if reset_recovery_attempts:
-            with self.engine.begin() as c:
-                stmt = (
-                    sa.update(SystemSchema.workflow_status)
-                    .where(
-                        SystemSchema.workflow_status.c.workflow_uuid == workflow_uuid
-                    )
-                    .values(recovery_attempts=reset_recovery_attempts)
-                )
-                c.execute(stmt)
-
     def get_workflow_status(
         self, workflow_uuid: str
     ) -> Optional[WorkflowStatusInternal]:
@@ -1062,7 +1057,7 @@ def _flush_workflow_status_buffer(self) -> None:
                     continue
                 exported_status[wf_id] = status
                 try:
-                    self.update_workflow_status(status, conn=c)
+                    self.update_workflow_status(status, conn=c, is_status_flush=True)
                     exported += 1
                 except Exception as e:
                     dbos_logger.error(f"Error while flushing status buffer: {e}")
diff --git a/dbos/_workflow_commands.py b/dbos/_workflow_commands.py
@@ -116,7 +116,7 @@ def _cancel_workflow(config: ConfigFile, uuid: str) -> None:
 
     try:
         sys_db = SystemDatabase(config)
-        sys_db.set_workflow_status(uuid, WorkflowStatusString.CANCELLED, False)
+        sys_db.set_workflow_status(uuid, WorkflowStatusString.CANCELLED)
         return
 
     except Exception as e:
diff --git a/tests/test_dbos.py b/tests/test_dbos.py
diff --git a/tests/test_failures.py b/tests/test_failures.py

Original file line number	Diff line number	Diff line change
`@@ -186,7 +186,7 @@ def _init_workflow(`
`186`	`186`	`# We also have to do this for single-step workflows because of the foreign key constraint on the operation outputs table`
`187`	`187`	`# TODO: Make this transactional (and with the queue step below)`
`188`	`188`	`wf_status = dbos._sys_db.update_workflow_status(`
`189`		`- status, False, ctx.in_recovery, max_recovery_attempts=max_recovery_attempts`
	`189`	`+ status, False, max_recovery_attempts=max_recovery_attempts`
`190`	`190`	`)`
`191`	`192`	`` # TODO: Modify the inputs if they were changed by `update_workflow_inputs` ``
`192`	`192`	`dbos._sys_db.update_workflow_inputs(wfid, _serialization.serialize_args(inputs))`
Original file line number	Diff line number	Diff line change
`@@ -801,7 +801,7 @@ def recover_pending_workflows(`
`801`	`801`	`def cancel_workflow(cls, workflow_id: str) -> None:`
`802`	`802`	`"""Cancel a workflow by ID."""`
`803`	`803`	`_get_dbos_instance()._sys_db.set_workflow_status(`
`804`		`- workflow_id, WorkflowStatusString.CANCELLED, False`
	`804`	`+ workflow_id, WorkflowStatusString.CANCELLED`
`805`	`805`	`)`
`806`	`806`
`807`	`807`	`@classmethod`