Skip to content

Commit 1cc4074

Browse files
committed
debugging combining states to preserve nested lists over staggered combines
1 parent 9df1191 commit 1cc4074

File tree

6 files changed

+148
-97
lines changed

6 files changed

+148
-97
lines changed

pydra/engine/core.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from pydra.engine import state
2323
from .lazy import LazyInField, LazyOutField
2424
from pydra.utils.hash import hash_function, Cache
25-
from pydra.utils.typing import TypeParser, StateArray
25+
from pydra.engine.state import State
2626
from .node import Node
2727
from datetime import datetime
2828
from fileformats.core import FileSet
@@ -710,8 +710,7 @@ def construct(
710710
)
711711
for outpt, outpt_lf in zip(output_fields, output_lazy_fields):
712712
# Automatically combine any uncombined state arrays into a single list
713-
if TypeParser.get_origin(outpt_lf._type) is StateArray:
714-
outpt_lf._type = list[TypeParser.strip_splits(outpt_lf._type)[0]]
713+
outpt_lf._type = State.combine_state_arrays(outpt_lf._type)
715714
setattr(outputs, outpt.name, outpt_lf)
716715
else:
717716
if unset_outputs := [

pydra/engine/lazy.py

Lines changed: 48 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import typing as ty
22
import abc
3+
from operator import attrgetter
34
import attrs
45
from pydra.utils.typing import StateArray
56
from pydra.utils.hash import hash_single
@@ -152,54 +153,60 @@ def _get_value(
152153
value : Any
153154
the resolved value of the lazy-field
154155
"""
155-
from pydra.utils.typing import (
156-
TypeParser,
157-
) # pylint: disable=import-outside-toplevel
158156
from pydra.engine.state import StateIndex
159157

160158
if state_index is None:
161159
state_index = StateIndex()
162160

163-
task = graph.node(self._node.name).get_tasks(state_index)
164-
_, split_depth = TypeParser.strip_splits(self._type)
165-
166-
def get_nested(task: "Task[DefType]", depth: int):
167-
if isinstance(task, StateArray):
168-
val = [get_nested(task=t, depth=depth - 1) for t in task]
169-
if depth:
170-
val = StateArray[self._type](val)
171-
else:
172-
if task.errored:
173-
raise ValueError(
174-
f"Cannot retrieve value for {self._field} from {self._node.name} as "
175-
"the node errored"
176-
)
177-
res = task.result()
178-
if res is None:
179-
raise RuntimeError(
180-
f"Could not find results of '{task.name}' node in a sub-directory "
181-
f"named '{{{task.checksum}}}' in any of the cache locations.\n"
182-
+ "\n".join(str(p) for p in set(task.cache_locations))
183-
+ f"\n\nThis is likely due to hash changes in '{task.name}' node inputs. "
184-
f"Current values and hashes: {task.inputs}, "
185-
f"{task.definition._hash}\n\n"
186-
"Set loglevel to 'debug' in order to track hash changes "
187-
"throughout the execution of the workflow.\n\n "
188-
"These issues may have been caused by `bytes_repr()` methods "
189-
"that don't return stable hash values for specific object "
190-
"types across multiple processes (see bytes_repr() "
191-
'"singledispatch" function in pydra/utils/hash.py).'
192-
"You may need to write specific `bytes_repr()` "
193-
"implementations (see `pydra.utils.hash.register_serializer`) or a "
194-
"`__bytes_repr__()` dunder method to handle one or more types in "
195-
"your interface inputs."
196-
)
197-
val = res.get_output_field(self._field)
198-
val = self._apply_cast(val)
161+
jobs = sorted(
162+
graph.node(self._node.name).matching_jobs(state_index),
163+
key=attrgetter("state_index"),
164+
)
165+
166+
def retrieve_from_job(job: "Task[DefType]") -> ty.Any:
167+
if job.errored:
168+
raise ValueError(
169+
f"Cannot retrieve value for {self._field} from {self._node.name} as "
170+
"the node errored"
171+
)
172+
res = job.result()
173+
if res is None:
174+
raise RuntimeError(
175+
f"Could not find results of '{job.name}' node in a sub-directory "
176+
f"named '{{{job.checksum}}}' in any of the cache locations.\n"
177+
+ "\n".join(str(p) for p in set(job.cache_locations))
178+
+ f"\n\nThis is likely due to hash changes in '{job.name}' node inputs. "
179+
f"Current values and hashes: {job.inputs}, "
180+
f"{job.definition._hash}\n\n"
181+
"Set loglevel to 'debug' in order to track hash changes "
182+
"throughout the execution of the workflow.\n\n "
183+
"These issues may have been caused by `bytes_repr()` methods "
184+
"that don't return stable hash values for specific object "
185+
"types across multiple processes (see bytes_repr() "
186+
'"singledispatch" function in pydra/utils/hash.py).'
187+
"You may need to write specific `bytes_repr()` "
188+
"implementations (see `pydra.utils.hash.register_serializer`) or a "
189+
"`__bytes_repr__()` dunder method to handle one or more types in "
190+
"your interface inputs."
191+
)
192+
val = res.get_output_field(self._field)
193+
val = self._apply_cast(val)
199194
return val
200195

201-
value = get_nested(task, depth=split_depth)
202-
return value
196+
if not self._node.state.depth(after_combine=False):
197+
assert len(jobs) == 1
198+
return retrieve_from_job(jobs[0])
199+
elif not self._node.state.keys_final: # all states are combined over
200+
return [retrieve_from_job(j) for j in jobs]
201+
elif self._node.state.combiner:
202+
values = StateArray()
203+
for ind in self._node.state.states_ind_final:
204+
values.append(
205+
[retrieve_from_job(j) for j in jobs if j.state_index.matches(ind)]
206+
)
207+
return values
208+
else:
209+
return StateArray(retrieve_from_job(j) for j in jobs)
203210

204211
@property
205212
def _source(self):

pydra/engine/node.py

Lines changed: 1 addition & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
from copy import deepcopy
33
from enum import Enum
44
import attrs
5-
from pydra.utils.typing import TypeParser, StateArray
65
from . import lazy
76
from pydra.engine.helpers import (
87
attrs_values,
@@ -128,12 +127,7 @@ def lzout(self) -> OutputType:
128127
# types based on the number of states the node is split over and whether
129128
# it has a combiner
130129
if self._state:
131-
type_, _ = TypeParser.strip_splits(outpt._type)
132-
if self._state.combiner:
133-
type_ = list[type_]
134-
for _ in range(self._state.depth()):
135-
type_ = StateArray[type_]
136-
outpt._type = type_
130+
outpt._type = self._state.nest_output_type(outpt._type)
137131
# Flag the output lazy fields as being not typed checked (i.e. assigned to
138132
# another node's inputs) yet. This is used to prevent the user from changing
139133
# the type of the output after it has been accessed by connecting it to an

pydra/engine/state.py

Lines changed: 66 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,12 @@
33
from copy import deepcopy
44
import itertools
55
from collections import OrderedDict
6+
from operator import itemgetter
67
from functools import reduce
78
import typing as ty
89
from . import helpers_state as hlpst
910
from .helpers import ensure_list, attrs_values
11+
from pydra.utils.typing import StateArray, TypeParser
1012

1113
# from .specs import BaseDef
1214
if ty.TYPE_CHECKING:
@@ -47,6 +49,18 @@ def __len__(self) -> int:
4749
def __iter__(self) -> ty.Generator[str, None, None]:
4850
return iter(self.indices)
4951

52+
def __getitem__(self, key: str) -> int:
53+
return self.indices[key]
54+
55+
def __lt__(self, other: "StateIndex") -> bool:
56+
if set(self.indices) != set(other.indices):
57+
raise ValueError(
58+
f"StateIndex {self} does not contain the same indices as {other}"
59+
)
60+
return sorted(self.indices.items(), key=itemgetter(0)) < sorted(
61+
other.indices.items(), key=itemgetter(0)
62+
)
63+
5064
def __repr__(self) -> str:
5165
return (
5266
"StateIndex(" + ", ".join(f"{n}={v}" for n, v in self.indices.items()) + ")"
@@ -79,6 +93,21 @@ def subset(self, state_names: ty.Iterable[str]) -> ty.Self:
7993
"""
8094
return type(self)({k: v for k, v in self.indices.items() if k in state_names})
8195

96+
def missing(self, state_names: ty.Iterable[str]) -> ty.List[str]:
97+
"""Return the fields that are missing from the StateIndex
98+
99+
Parameters
100+
----------
101+
state_names : list[str]
102+
the fields to check for
103+
104+
Returns
105+
-------
106+
list[str]
107+
the fields that are missing from the StateIndex
108+
"""
109+
return [f for f in state_names if f not in self.indices]
110+
82111
def matches(self, other: "StateIndex") -> bool:
83112
"""Check if the indices that are present in the other StateIndex match
84113
@@ -92,6 +121,8 @@ def matches(self, other: "StateIndex") -> bool:
92121
bool
93122
True if all the indices in the other StateIndex match
94123
"""
124+
if isinstance(other, dict):
125+
other = StateIndex(other)
95126
if not set(self.indices).issuperset(other.indices):
96127
raise ValueError(
97128
f"StateIndex {self} does not contain all the indices in {other}"
@@ -211,10 +242,6 @@ def __str__(self):
211242
@property
212243
def names(self):
213244
"""Return the names of the states."""
214-
# analysing states from connected tasks if inner_inputs
215-
if not hasattr(self, "keys_final"):
216-
self.prepare_states()
217-
self.prepare_inputs()
218245
previous_states_keys = {
219246
f"_{v.name}": v.keys_final for v in self.inner_inputs.values()
220247
}
@@ -265,6 +292,41 @@ def included(s):
265292
remaining_stack = [s for s in stack if included(s)]
266293
return depth + len(remaining_stack)
267294

295+
def nest_output_type(self, type_: type) -> type:
296+
"""Nests a type of an output field in a combination of lists and state-arrays
297+
based on the state's splitter and combiner
298+
299+
Parameters
300+
----------
301+
type_ : type
302+
the type of the output field
303+
304+
Returns
305+
-------
306+
type
307+
the nested type of the output field
308+
"""
309+
310+
state_array_depth = self.depth()
311+
312+
# If there is a combination, it will get flattened into a single list
313+
if self.depth(after_combine=False) > state_array_depth:
314+
type_ = list[type_]
315+
316+
# Nest the uncombined state arrays around the type
317+
for _ in range(state_array_depth):
318+
type_ = StateArray[type_]
319+
return type_
320+
321+
@classmethod
322+
def combine_state_arrays(cls, type_: type) -> type:
323+
"""Collapses (potentially nested) state array(s) into a single list"""
324+
if TypeParser.get_origin(type_) is StateArray:
325+
# Implicitly combine any remaining uncombined states into a single
326+
# list
327+
type_ = list[TypeParser.strip_splits(type_)[0]]
328+
return type_
329+
268330
@property
269331
def splitter(self):
270332
"""Get the splitter of the state."""

pydra/engine/submitter.py

Lines changed: 28 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
import os
77
from pathlib import Path
88
from tempfile import mkdtemp
9-
from copy import copy
9+
from copy import copy, deepcopy
1010
from datetime import datetime
1111
from collections import defaultdict
1212
import attrs
@@ -211,16 +211,15 @@ def __call__(
211211
from pydra.engine.specs import TaskDef
212212

213213
state = State(
214-
name="not-important",
214+
name="outer_split",
215215
definition=task_def,
216-
splitter=task_def._splitter,
217-
combiner=task_def._combiner,
216+
splitter=deepcopy(task_def._splitter),
217+
combiner=deepcopy(task_def._combiner),
218218
)
219-
list_depth = 2 if state.depth(after_combine=False) != state.depth() else 1
220219

221220
def wrap_type(tp):
222-
for _ in range(list_depth):
223-
tp = list[tp]
221+
tp = state.nest_output_type(tp)
222+
tp = state.combine_state_arrays(tp)
224223
return tp
225224

226225
output_types = {
@@ -568,22 +567,27 @@ def tasks(self) -> ty.Iterable["Task[DefType]"]:
568567
self._tasks = {t.state_index: t for t in self._generate_tasks()}
569568
return self._tasks.values()
570569

571-
def get_tasks(
572-
self, index: StateIndex = StateIndex()
573-
) -> "Task | StateArray[Task[DefType]]":
574-
"""Get a task object for a given state index."""
575-
if not self.tasks:
576-
return StateArray([])
577-
task_index = next(iter(self._tasks)) if self._tasks else StateIndex()
578-
if len(task_index) > len(index):
579-
tasks = []
580-
for ind, task in self._tasks.items():
581-
if ind.matches(index):
582-
tasks.append(task)
583-
return StateArray(tasks)
584-
elif len(index) > len(task_index):
585-
index = index.subset(task_index)
586-
return self._tasks[index]
570+
def matching_jobs(self, index: StateIndex = StateIndex()) -> "StateArray[Task]":
571+
"""Get the jobs that match a given state index.
572+
573+
Parameters
574+
----------
575+
index : StateIndex, optional
576+
The state index of the task to get, by default StateIndex()
577+
"""
578+
matching = StateArray()
579+
if self.tasks:
580+
task_index = next(iter(self._tasks)) if self._tasks else StateIndex()
581+
if len(task_index) > len(index):
582+
# Select matching tasks and return them in nested state-array objects
583+
for ind, task in self._tasks.items():
584+
if ind.matches(index):
585+
matching.append(task)
586+
elif len(index) > len(task_index):
587+
matching.append(
588+
self._tasks[index.subset(task_index)]
589+
) # Return a single task
590+
return matching
587591

588592
@property
589593
def started(self) -> bool:
@@ -740,11 +744,7 @@ def get_runnable_tasks(self, graph: DiGraph) -> list["Task[DefType]"]:
740744
pred: NodeExecution
741745
is_runnable = True
742746
for pred in graph.predecessors[self.node.name]:
743-
pred_jobs = pred.get_tasks(index)
744-
if isinstance(pred_jobs, StateArray):
745-
pred_inds = [j.state_index for j in pred_jobs]
746-
else:
747-
pred_inds = [pred_jobs.state_index]
747+
pred_inds = [j.state_index for j in pred.matching_jobs(index)]
748748
if not all(i in pred.successful for i in pred_inds):
749749
is_runnable = False
750750
blocked = True

pydra/engine/tests/test_node_task.py

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1033,7 +1033,7 @@ def test_task_state_comb_1(plugin_dask_opt, tmp_path):
10331033
assert state.splitter_final is None
10341034
assert state.splitter_rpn_final == []
10351035

1036-
with Submitter(worker=plugin_dask_opt, cache_dir=tmp_path) as sub:
1036+
with Submitter(worker="debug", cache_dir=tmp_path) as sub:
10371037
results = sub(nn)
10381038
assert not results.errored, "\n".join(results.errors["error message"])
10391039

@@ -1147,7 +1147,7 @@ def test_task_state_comb_2(
11471147
assert state.splitter_rpn == state_rpn
11481148
assert state.combiner == state_combiner
11491149

1150-
with Submitter(worker=plugin, cache_dir=tmp_path) as sub:
1150+
with Submitter(worker="debug", cache_dir=tmp_path) as sub:
11511151
results = sub(nn)
11521152
assert not results.errored, "\n".join(results.errors["error message"])
11531153

@@ -1161,18 +1161,7 @@ def test_task_state_comb_2(
11611161
# it should give values of inputs that corresponds to the specific element
11621162
# results_verb = nn.result(return_inputs=True)
11631163

1164-
if state.splitter_rpn_final:
1165-
for i, res in enumerate(expected):
1166-
assert results.outputs.out == res
1167-
# results_verb
1168-
# for i, res_l in enumerate(expected_val):
1169-
# for j, res in enumerate(res_l):
1170-
# assert (results_verb[i][j][0], results_verb[i][j][1].output.out) == res
1171-
# if the combiner is full expected is "a flat list"
1172-
else:
1173-
assert results.outputs.out == expected
1174-
# for i, res in enumerate(expected_val):
1175-
# assert (results_verb[i][0], results_verb[i][1].output.out) == res
1164+
assert results.outputs.out == expected
11761165

11771166

11781167
def test_task_state_comb_singl_1(plugin, tmp_path):

0 commit comments

Comments
 (0)