Skip to content

Commit 7245524

Browse files
committed
require all jobs in predecessor nodes to have completed successfully before successor nodes are run (will look to relax it with a partially generated state)
1 parent 3e5090e commit 7245524

File tree

7 files changed

+271

-192

lines changed

pydra/engine/graph.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,10 @@ def node(self, name: str) -> NodeType:
9999
except KeyError:
100100
raise KeyError(f"Node {name!r} not found in graph") from None
101101

102+
def __getitem__(self, key):
103+
"""Get a node by its name."""
104+
return self.node(key)
105+
102106
@property
103107
def nodes_names_map(self) -> dict[str, NodeType]:
104108
"""Get a map of node names to nodes."""

pydra/engine/helpers.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -191,6 +191,7 @@ def save(
191191
task_path: Path,
192192
result: "Result | None" = None,
193193
task: "Task[DefType] | None" = None,
194+
return_values: dict[str, ty.Any] | None = None,
194195
name_prefix: str = None,
195196
) -> None:
196197
"""
@@ -204,6 +205,8 @@ def save(
204205
Result to pickle and write
205206
task : :class:`~pydra.engine.core.TaskBase`
206207
Task to pickle and write
208+
return_values : :obj:`dict`
209+
Return values to pickle and write
207210
"""
208211
from pydra.engine.core import is_workflow
209212

@@ -233,6 +236,9 @@ def save(
233236
if task:
234237
with (task_path / f"{name_prefix}_task.pklz").open("wb") as fp:
235238
cp.dump(task, fp)
239+
if return_values:
240+
with (task_path / f"{name_prefix}_return_values.pklz").open("wb") as fp:
241+
cp.dump(return_values, fp)
236242

237243

238244
def copyfile_workflow(

pydra/engine/lazy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -180,7 +180,7 @@ def _get_value(
180180
def retrieve_from_job(job: "Task[DefType]") -> ty.Any:
181181
if job.errored:
182182
raise ValueError(
183-
f"Cannot retrieve value for {self._field} from {self._node.name} as "
183+
f"Cannot retrieve value for {self._field!r} from {self._node.name} as "
184184
"the node errored"
185185
)
186186
res = job.result()

pydra/engine/specs.py

Lines changed: 44 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -293,14 +293,10 @@ def __call__(
293293
)
294294
raise
295295
if result.errored:
296-
if isinstance(self, WorkflowDef) or self._splitter:
297-
raise RuntimeError(f"Workflow {self} failed with errors")
298-
else:
299-
errors = result.errors
300-
raise RuntimeError(
301-
f"Task {self} failed @ {errors['time of crash']} with the following errors:\n"
302-
+ "\n".join(errors["error message"])
303-
)
296+
raise RuntimeError(
297+
f"Task {self} failed @ {result.errors['time of crash']} with the "
298+
"following errors:\n" + "\n".join(result.errors["error message"])
299+
)
304300
return result.outputs
305301

306302
def split(
@@ -697,6 +693,18 @@ def task(self):
697693
with open(task_pkl, "rb") as f:
698694
return cp.load(f)
699695

696+
@property
697+
def return_values(self):
698+
return_values_pkl = self.output_dir / "_return_values.pklz"
699+
if not return_values_pkl.exists():
700+
return None
701+
with open(return_values_pkl, "rb") as f:
702+
return cp.load(f)
703+
704+
@property
705+
def job(self):
706+
return self.task
707+
700708

701709
@attrs.define(kw_only=True)
702710
class RuntimeSpec:
@@ -798,40 +806,37 @@ def _from_task(cls, task: "Task[WorkflowDef]") -> Self:
798806
outputs : Outputs
799807
The outputs of the task
800808
"""
801-
outputs = super()._from_task(task)
802-
# collecting outputs from tasks
803-
output_wf = {}
804-
lazy_field: lazy.LazyOutField
809+
805810
workflow: "Workflow" = task.return_values["workflow"]
806811
exec_graph: "DiGraph[NodeExecution]" = task.return_values["exec_graph"]
807-
nodes_dict = {n.name: n for n in exec_graph.nodes}
808-
for name, lazy_field in attrs_values(workflow.outputs).items():
809-
try:
810-
val_out = lazy_field._get_value(workflow=workflow, graph=exec_graph)
811-
if isinstance(val_out, StateArray):
812-
val_out = list(val_out) # implicitly combine state arrays
813-
output_wf[name] = val_out
814-
except (ValueError, AttributeError):
815-
output_wf[name] = None
816-
node: "NodeExecution" = nodes_dict[lazy_field._node.name]
817-
# checking if the tasks has predecessors that raises error
818-
if isinstance(node.errored, list):
819-
raise ValueError(f"Tasks {node._errored} raised an error")
820-
else:
821-
err_files = [(t.output_dir / "_error.pklz") for t in node.tasks]
822-
err_files = [f for f in err_files if f.exists()]
823-
if not err_files:
824-
raise
825-
raise ValueError(
826-
f"Task {lazy_field._node.name!r} raised an error, full crash report is "
827-
f"here: "
828-
+ (
829-
str(err_files[0])
830-
if len(err_files) == 1
831-
else "\n" + "\n".join(str(f) for f in err_files)
832-
)
812+
813+
# Check for errors in any of the workflow nodes
814+
if errored := [n for n in exec_graph.nodes if n.errored]:
815+
errors = []
816+
for node in errored:
817+
for task in node.errored.values():
818+
result = task.result()
819+
errors.append(
820+
f"Task {node.name!r} failed @ {result.errors['time of crash']} "
821+
"with the following errors:\n"
822+
+ "\n".join(result.errors["error message"])
833823
)
834-
outputs = attrs.evolve(outputs, **output_wf)
824+
raise RuntimeError(
825+
f"Workflow {workflow} failed with errors: " + "\n\n".join(errors)
826+
)
827+
828+
# Retrieve values from the output fields
829+
values = {}
830+
lazy_field: lazy.LazyOutField
831+
for name, lazy_field in attrs_values(workflow.outputs).items():
832+
val_out = lazy_field._get_value(workflow=workflow, graph=exec_graph)
833+
if isinstance(val_out, StateArray):
834+
val_out = list(val_out) # implicitly combine state arrays
835+
values[name] = val_out
836+
837+
# Set the values in the outputs object
838+
outputs = super()._from_task(task)
839+
outputs = attrs.evolve(outputs, **values)
835840
outputs._output_dir = task.output_dir
836841
return outputs
837842

pydra/engine/submitter.py

Lines changed: 88 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ class Submitter:
8383
messenger_args: dict[str, ty.Any]
8484
clean_stale_locks: bool
8585
run_start_time: datetime | None
86+
propagate_rerun: bool
8687

8788
def __init__(
8889
self,
@@ -94,6 +95,7 @@ def __init__(
9495
audit_flags: AuditFlag = AuditFlag.NONE,
9596
messengers: ty.Iterable[Messenger] | None = None,
9697
messenger_args: dict[str, ty.Any] | None = None,
98+
propagate_rerun: bool = True,
9799
clean_stale_locks: bool | None = None,
98100
**kwargs,
99101
):
@@ -121,6 +123,7 @@ def __init__(
121123

122124
self.cache_dir = cache_dir
123125
self.cache_locations = cache_locations
126+
self.propagate_rerun = propagate_rerun
124127
self.environment = environment if environment is not None else Native()
125128
self.loop = get_open_loop()
126129
self._own_loop = not self.loop.is_running()
@@ -188,6 +191,8 @@ def __call__(
188191
rerun : bool, optional
189192
Whether to force the re-computation of the task results even if existing
190193
results are found, by default False
194+
propagate_rerun : bool, optional
195+
Whether to propagate the rerun flag to all tasks in the workflow, by default True
191196
192197
Returns
193198
-------
@@ -312,12 +317,12 @@ def expand_workflow(self, workflow_task: "Task[WorkflowDef]", rerun: bool) -> No
312317
wf = workflow_task.definition.construct()
313318
# Generate the execution graph
314319
exec_graph = wf.execution_graph(submitter=self)
320+
workflow_task.return_values = {"workflow": wf, "exec_graph": exec_graph}
315321
tasks = self.get_runnable_tasks(exec_graph)
316322
while tasks or any(not n.done for n in exec_graph.nodes):
317323
for task in tasks:
318324
self.worker.run(task, rerun=rerun)
319325
tasks = self.get_runnable_tasks(exec_graph)
320-
workflow_task.return_values = {"workflow": wf, "exec_graph": exec_graph}
321326

322327
async def expand_workflow_async(
323328
self, workflow_task: "Task[WorkflowDef]", rerun: bool
@@ -333,6 +338,7 @@ async def expand_workflow_async(
333338
wf = workflow_task.definition.construct()
334339
# Generate the execution graph
335340
exec_graph = wf.execution_graph(submitter=self)
341+
workflow_task.return_values = {"workflow": wf, "exec_graph": exec_graph}
336342
# keep track of pending futures
337343
task_futures = set()
338344
tasks = self.get_runnable_tasks(exec_graph)
@@ -417,7 +423,6 @@ async def expand_workflow_async(
417423
task_futures.add(self.worker.run(task, rerun=rerun))
418424
task_futures = await self.worker.fetch_finished(task_futures)
419425
tasks = self.get_runnable_tasks(exec_graph)
420-
workflow_task.return_values = {"workflow": wf, "exec_graph": exec_graph}
421426

422427
def __enter__(self):
423428
return self
@@ -467,7 +472,8 @@ def get_runnable_tasks(self, graph: DiGraph) -> list["Task[DefType]"]:
467472
continue
468473
# since the list is sorted (breadth-first) we can stop
469474
# when we find a task that depends on any task that is already in tasks
470-
if set(graph.predecessors[node.name]).intersection(not_started):
475+
preds = set(graph.predecessors[node.name])
476+
if preds.intersection(not_started):
471477
break
472478
# Record if the node has not been started
473479
if not node.started:
@@ -619,6 +625,11 @@ def done(self) -> bool:
619625
# Check to see if any previously queued tasks have completed
620626
return not (self.queued or self.blocked or self.running)
621627

628+
@property
629+
def has_errored(self) -> bool:
630+
self.update_status()
631+
return bool(self.errored)
632+
622633
def update_status(self) -> None:
623634
"""Updates the status of the tasks in the node."""
624635
if not self.started:
@@ -729,56 +740,80 @@ def get_runnable_tasks(self, graph: DiGraph) -> list["Task[DefType]"]:
729740
List of tasks that are ready to run
730741
"""
731742
runnable: list["Task[DefType]"] = []
732-
if not self.started:
733-
self.start()
734-
# Check to see if any blocked tasks are now runnable/unrunnable
735-
for index, task in list(self.blocked.items()):
736-
pred: NodeExecution
737-
is_runnable = True
738-
# This is required for the commented-out code below
739-
# states_ind = (
740-
# list(self.node.state.states_ind[index].items())
741-
# if self.node.state
742-
# else []
743-
# )
744-
for pred in graph.predecessors[self.node.name]:
745-
if pred.node.state:
746-
# FIXME: These should be the only predecessor jobs that are required to have
747-
# completed before the job can be run, however, due to how the state
748-
# is currently built, all predecessors are required to have completed.
749-
# If/when this is relaxed, then the following code should be used instead.
750-
#
751-
# pred_states_ind = {
752-
# (k, i) for k, i in states_ind if k.startswith(pred.name + ".")
753-
# }
754-
# pred_inds = [
755-
# i
756-
# for i, ind in enumerate(pred.node.state.states_ind)
757-
# if set(ind.items()).issuperset(pred_states_ind)
758-
# ]
759-
pred_inds = list(range(len(pred.node.state.states_ind)))
760-
else:
761-
pred_inds = [None]
762-
if not all(i in pred.successful for i in pred_inds):
763-
is_runnable = False
764-
blocked = True
765-
if pred_errored := [i for i in pred_inds if i in pred.errored]:
766-
self.unrunnable[index].extend(
767-
[pred.errored[i] for i in pred_errored]
768-
)
769-
blocked = False
770-
if pred_unrunnable := [
771-
i for i in pred_inds if i in pred.unrunnable
772-
]:
773-
self.unrunnable[index].extend(
774-
[pred.unrunnable[i] for i in pred_unrunnable]
775-
)
776-
blocked = False
777-
if not blocked:
778-
del self.blocked[index]
779-
break
780-
if is_runnable:
781-
runnable.append(self.blocked.pop(index))
743+
predecessors: list["Task[DefType]"] = graph.predecessors[self.node.name]
744+
745+
# If there is a split, we need to wait for all predecessor nodes to finish
746+
# In theory, if the current splitter splits an already split state we should
747+
# only need to wait for the direct predecessor jobs to finish, however, this
748+
# would require a deep refactor of the State class as we need the whole state
749+
# in order to assign consistent state indices across the new split
750+
751+
# FIXME: The branch for handling partially completed/errored/unrunnable
752+
# predecessor nodes can't be used until the State class can be partially
753+
# initialised with lazy-fields.
754+
if True: # self.node.splitter:
755+
if unrunnable := [p for p in predecessors if p.errored or p.unrunnable]:
756+
self.unrunnable = {None: unrunnable}
757+
self.blocked = {}
758+
assert self.done
759+
else:
760+
if all(p.done for p in predecessors):
761+
if not self.started:
762+
self.start()
763+
if self.node.state is None:
764+
inds = [None]
765+
else:
766+
inds = list(range(len(self.node.state.states_ind)))
767+
if self.blocked:
768+
for i in inds:
769+
runnable.append(self.blocked.pop(i))
770+
else:
771+
if not self.started:
772+
self.start()
773+
774+
# Check to see if any blocked tasks are now runnable/unrunnable
775+
for index, task in list(self.blocked.items()):
776+
pred: NodeExecution
777+
is_runnable = True
778+
states_ind = (
779+
list(self.node.state.states_ind[index].items())
780+
if self.node.state
781+
else []
782+
)
783+
for pred in predecessors:
784+
if pred.node.state:
785+
pred_states_ind = {
786+
(k, i)
787+
for k, i in states_ind
788+
if k.startswith(pred.name + ".")
789+
}
790+
pred_inds = [
791+
i
792+
for i, ind in enumerate(pred.node.state.states_ind)
793+
if set(ind.items()).issuperset(pred_states_ind)
794+
]
795+
else:
796+
pred_inds = [None]
797+
if not all(i in pred.successful for i in pred_inds):
798+
is_runnable = False
799+
blocked = True
800+
if pred_errored := [
801+
pred.errored[i] for i in pred_inds if i in pred.errored
802+
]:
803+
self.unrunnable[index].extend(pred_errored)
804+
blocked = False
805+
if pred_unrunnable := [
806+
pred.unrunnable[i]
807+
for i in pred_inds
808+
if i in pred.unrunnable
809+
]:
810+
self.unrunnable[index].extend(pred_unrunnable)
811+
blocked = False
812+
if not blocked:
813+
del self.blocked[index]
814+
break
815+
if is_runnable:
816+
runnable.append(self.blocked.pop(index))
782817
self.queued.update({t.state_index: t for t in runnable})
783818
return list(self.queued.values())
784819

0 commit comments

Comments
 (0)