Skip to content

Commit 7e8232c

Browse files
authored
Merge pull request #524 from mgxd/rf/submitter
RF: Submitter logic.
2 parents 3a3e4bb + a2cc367 commit 7e8232c

File tree

3 files changed

+112
-92
lines changed

3 files changed

+112
-92
lines changed

pydra/engine/core.py

Lines changed: 22 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1055,7 +1055,9 @@ async def _run(self, submitter=None, rerun=False, **kwargs):
10551055
"Workflow output cannot be None, use set_output to define output(s)"
10561056
)
10571057
# creating connections that were defined after adding tasks to the wf
1058-
self._connect_and_propagate_to_tasks()
1058+
self._connect_and_propagate_to_tasks(
1059+
propagate_rerun=self.task_rerun and self.propagate_rerun
1060+
)
10591061

10601062
checksum = self.checksum
10611063
output_dir = self.output_dir
@@ -1097,8 +1099,11 @@ async def _run(self, submitter=None, rerun=False, **kwargs):
10971099
async def _run_task(self, submitter, rerun=False):
10981100
if not submitter:
10991101
raise Exception("Submitter should already be set.")
1102+
for nd in self.graph.nodes:
1103+
if nd.allow_cache_override:
1104+
nd.cache_dir = self.cache_dir
11001105
# at this point Workflow is stateless so this should be fine
1101-
await submitter._run_workflow(self, rerun=rerun)
1106+
await submitter.expand_workflow(self, rerun=rerun)
11021107

11031108
def set_output(self, connections):
11041109
"""
@@ -1227,21 +1232,31 @@ def create_dotfile(self, type="simple", export=None, name=None):
12271232
formatted_dot.append(self.graph.export_graph(dotfile=dotfile, ext=ext))
12281233
return dotfile, formatted_dot
12291234

1230-
def _connect_and_propagate_to_tasks(self):
1235+
def _connect_and_propagate_to_tasks(
1236+
self,
1237+
*,
1238+
propagate_rerun=False,
1239+
override_task_caches=False,
1240+
):
12311241
"""
12321242
Visit each node in the graph and create the connections.
12331243
Additionally checks if all tasks should be rerun.
12341244
"""
12351245
for task in self.graph.nodes:
1246+
self.create_connections(task)
12361247
# if workflow has task_rerun=True and propagate_rerun=True,
12371248
# it should be passed to the tasks
1238-
if self.task_rerun and self.propagate_rerun:
1239-
task.task_rerun = self.task_rerun
1249+
if propagate_rerun:
1250+
task.task_rerun = True
12401251
# if the task is a wf, then the propagate_rerun should also be set
12411252
if is_workflow(task):
1242-
task.propagate_rerun = self.propagate_rerun
1253+
task.propagate_rerun = True
1254+
1255+
# ported from Submitter.__call__
1256+
# TODO: no prepare state ?
1257+
if override_task_caches and task.allow_cache_override:
1258+
task.cache_dir = self.cache_dir
12431259
task.cache_locations = task._cache_locations + self.cache_locations
1244-
self.create_connections(task)
12451260

12461261

12471262
def is_task(obj):

pydra/engine/submitter.py

Lines changed: 80 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,7 @@
11
"""Handle execution backends."""
22
import asyncio
3-
import time
43
from uuid import uuid4
5-
from .workers import (
6-
SerialWorker,
7-
ConcurrentFuturesWorker,
8-
SlurmWorker,
9-
DaskWorker,
10-
SGEWorker,
11-
)
4+
from .workers import WORKERS
125
from .core import is_workflow
136
from .helpers import get_open_loop, load_and_run_async
147

@@ -35,61 +28,60 @@ def __init__(self, plugin="cf", **kwargs):
3528
self.loop = get_open_loop()
3629
self._own_loop = not self.loop.is_running()
3730
self.plugin = plugin
38-
if self.plugin == "serial":
39-
self.worker = SerialWorker()
40-
elif self.plugin == "cf":
41-
self.worker = ConcurrentFuturesWorker(**kwargs)
42-
elif self.plugin == "slurm":
43-
self.worker = SlurmWorker(**kwargs)
44-
elif self.plugin == "dask":
45-
self.worker = DaskWorker(**kwargs)
46-
elif self.plugin == "sge":
47-
self.worker = SGEWorker(**kwargs)
48-
else:
49-
raise Exception(f"plugin {self.plugin} not available")
31+
try:
32+
self.worker = WORKERS[self.plugin](**kwargs)
33+
except KeyError:
34+
raise NotImplementedError(f"No worker for {self.plugin}")
5035
self.worker.loop = self.loop
5136

5237
def __call__(self, runnable, cache_locations=None, rerun=False):
53-
"""Submit."""
38+
"""Submitter run function."""
5439
if cache_locations is not None:
5540
runnable.cache_locations = cache_locations
56-
# creating all connections and calculating the checksum of the graph before running
57-
if is_workflow(runnable):
58-
# TODO: no prepare state ?
59-
for nd in runnable.graph.nodes:
60-
runnable.create_connections(nd)
61-
if nd.allow_cache_override:
62-
nd.cache_dir = runnable.cache_dir
63-
if is_workflow(runnable) and runnable.state is None:
64-
self.loop.run_until_complete(self.submit_workflow(runnable, rerun=rerun))
65-
else:
66-
self.loop.run_until_complete(self.submit(runnable, wait=True, rerun=rerun))
67-
if is_workflow(runnable):
68-
# resetting all connections with LazyFields
69-
runnable._reset()
41+
self.loop.run_until_complete(self.submit_from_call(runnable, rerun))
7042
return runnable.result()
7143

72-
async def submit_workflow(self, workflow, rerun=False):
73-
"""Distribute or initiate workflow execution."""
74-
if is_workflow(workflow):
75-
if workflow.plugin and workflow.plugin != self.plugin:
76-
# dj: this is not tested!!! TODO
77-
await self.worker.run_el(workflow, rerun=rerun)
44+
async def submit_from_call(self, runnable, rerun):
45+
"""
46+
This coroutine should only be called once per Submitter call,
47+
and serves as the bridge between sync/async lands.
48+
49+
There are 4 potential paths based on the type of runnable:
50+
0) Workflow has a different plugin than a submitter
51+
1) Workflow without State
52+
2) Task without State
53+
3) (Workflow or Task) with State
54+
55+
Once Python 3.10 is the minimum, this should probably be refactored into using
56+
structural pattern matching.
57+
"""
58+
if is_workflow(runnable):
59+
# connect and calculate the checksum of the graph before running
60+
runnable._connect_and_propagate_to_tasks(override_task_caches=True)
61+
# 0
62+
if runnable.plugin and runnable.plugin != self.plugin:
63+
# if workflow has a different plugin it's treated as a single element
64+
await self.worker.run_el(runnable, rerun=rerun)
65+
# 1
66+
if runnable.state is None:
67+
await runnable._run(self, rerun=rerun)
68+
# 3
7869
else:
79-
await workflow._run(self, rerun=rerun)
80-
else: # could be a tuple with paths to pickle files with tasks and inputs
81-
ind, wf_main_pkl, wf_orig = workflow
82-
if wf_orig.plugin and wf_orig.plugin != self.plugin:
83-
# dj: this is not tested!!! TODO
84-
await self.worker.run_el(workflow, rerun=rerun)
70+
await self.expand_runnable(runnable, wait=True, rerun=rerun)
71+
runnable._reset()
72+
else:
73+
# 2
74+
if runnable.state is None:
75+
# run_el should always return a coroutine
76+
await self.worker.run_el(runnable, rerun=rerun)
77+
# 3
8578
else:
86-
await load_and_run_async(
87-
task_pkl=wf_main_pkl, ind=ind, submitter=self, rerun=rerun
88-
)
79+
await self.expand_runnable(runnable, wait=True, rerun=rerun)
80+
return True
8981

90-
async def submit(self, runnable, wait=False, rerun=False):
82+
async def expand_runnable(self, runnable, wait=False, rerun=False):
9183
"""
92-
Coroutine entrypoint for task submission.
84+
This coroutine handles state expansion.
9385
9486
Removes any states from `runnable`. If `wait` is
9587
set to False (default), aggregates all worker
@@ -110,41 +102,37 @@ async def submit(self, runnable, wait=False, rerun=False):
110102
Coroutines for :class:`~pydra.engine.core.TaskBase` execution.
111103
112104
"""
105+
if runnable.plugin and runnable.plugin != self.plugin:
106+
raise NotImplementedError()
107+
113108
futures = set()
114-
if runnable.state:
115-
runnable.state.prepare_states(runnable.inputs, cont_dim=runnable.cont_dim)
116-
runnable.state.prepare_inputs()
117-
logger.debug(
118-
f"Expanding {runnable} into {len(runnable.state.states_val)} states"
119-
)
120-
task_pkl = runnable.pickle_task()
121-
122-
for sidx in range(len(runnable.state.states_val)):
123-
job_tuple = (sidx, task_pkl, runnable)
124-
if is_workflow(runnable):
125-
# job has no state anymore
126-
futures.add(self.submit_workflow(job_tuple, rerun=rerun))
127-
else:
128-
# tasks are submitted to worker for execution
129-
futures.add(self.worker.run_el(job_tuple, rerun=rerun))
130-
else:
109+
if runnable.state is None:
110+
raise Exception("Only runnables with state should reach here")
111+
112+
task_pkl = await prepare_runnable_with_state(runnable)
113+
114+
for sidx in range(len(runnable.state.states_val)):
131115
if is_workflow(runnable):
132-
await self._run_workflow(runnable, rerun=rerun)
116+
# job has no state anymore
117+
futures.add(
118+
# This unpickles and runs workflow - why are we pickling?
119+
asyncio.create_task(load_and_run_async(task_pkl, sidx, self, rerun))
120+
)
133121
else:
134-
# submit task to worker
135-
futures.add(self.worker.run_el(runnable, rerun=rerun))
122+
futures.add(self.worker.run_el((sidx, task_pkl, runnable), rerun=rerun))
136123

137124
if wait and futures:
138-
# run coroutines concurrently and wait for execution
139-
# wait until all states complete or error
125+
# if wait is True, we are at the end of the graph / state expansion.
126+
# Once the remaining jobs end, we will exit `submit_from_call`
140127
await asyncio.gather(*futures)
141128
return
142129
# pass along futures to be awaited independently
143130
return futures
144131

145-
async def _run_workflow(self, wf, rerun=False):
132+
async def expand_workflow(self, wf, rerun=False):
146133
"""
147134
Expand and execute a stateless :class:`~pydra.engine.core.Workflow`.
135+
This method is only reached by `Workflow._run_task`.
148136
149137
Parameters
150138
----------
@@ -157,10 +145,6 @@ async def _run_workflow(self, wf, rerun=False):
157145
The computed workflow
158146
159147
"""
160-
for nd in wf.graph.nodes:
161-
if nd.allow_cache_override:
162-
nd.cache_dir = wf.cache_dir
163-
164148
# creating a copy of the graph that will be modified
165149
# the copy contains new lists with original runnable objects
166150
graph_copy = wf.graph.copy()
@@ -180,7 +164,8 @@ async def _run_workflow(self, wf, rerun=False):
180164
while not tasks and graph_copy.nodes:
181165
tasks, follow_err = get_runnable_tasks(graph_copy)
182166
ii += 1
183-
time.sleep(1)
167+
# don't block the event loop!
168+
await asyncio.sleep(1)
184169
if ii > 60:
185170
raise Exception(
186171
"graph is not empty, but not able to get more tasks "
@@ -191,11 +176,15 @@ async def _run_workflow(self, wf, rerun=False):
191176
logger.debug(f"Retrieving inputs for {task}")
192177
# TODO: add state idx to retrieve values to reduce waiting
193178
task.inputs.retrieve_values(wf)
194-
if is_workflow(task) and not task.state:
195-
await self.submit_workflow(task, rerun=rerun)
196-
else:
197-
for fut in await self.submit(task, rerun=rerun):
179+
if task.state:
180+
for fut in await self.expand_runnable(task, rerun=rerun):
198181
task_futures.add(fut)
182+
# expand that workflow
183+
elif is_workflow(task):
184+
await task._run(self, rerun=rerun)
185+
# single task
186+
else:
187+
task_futures.add(self.worker.run_el(task, rerun=rerun))
199188
task_futures = await self.worker.fetch_finished(task_futures)
200189
tasks, follow_err = get_runnable_tasks(graph_copy)
201190
# updating tasks_errored
@@ -285,3 +274,10 @@ def is_runnable(graph, obj):
285274
graph.remove_nodes_connections(nd)
286275

287276
return True
277+
278+
279+
async def prepare_runnable_with_state(runnable):
280+
runnable.state.prepare_states(runnable.inputs, cont_dim=runnable.cont_dim)
281+
runnable.state.prepare_inputs()
282+
logger.debug(f"Expanding {runnable} into {len(runnable.state.states_val)} states")
283+
return runnable.pickle_task()

pydra/engine/workers.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,7 +119,7 @@ async def fetch_finished(self, futures):
119119
class SerialWorker(Worker):
120120
"""A worker to execute linearly."""
121121

122-
def __init__(self):
122+
def __init__(self, **kwargs):
123123
"""Initialize worker."""
124124
logger.debug("Initialize SerialWorker")
125125

@@ -876,3 +876,12 @@ async def exec_dask(self, runnable, rerun=False):
876876
def close(self):
877877
"""Finalize the internal pool of tasks."""
878878
pass
879+
880+
881+
WORKERS = {
882+
"serial": SerialWorker,
883+
"cf": ConcurrentFuturesWorker,
884+
"slurm": SlurmWorker,
885+
"dask": DaskWorker,
886+
"sge": SGEWorker,
887+
}

0 commit comments

Comments
 (0)