
Commit 48c54b9

cli: dataflow: run: Commands to run dataflow without sources

Fixes: #812

1 parent e8c04a0

File tree: 4 files changed (+208, −5 lines)

CHANGELOG.md

Lines changed: 1 addition & 0 deletions

@@ -71,6 +71,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Support for immediate response in HTTP service
 - Daal4py example usage.
 - Gitter chatbot tutorial.
+- Option to run dataflow without sources from cli.
 ### Changed
 - Renamed `-seed` to `-inputs` in `dataflow create` command
 - Renamed configloader/png to configloader/image and added support for loading JPEG and TIFF file formats

dffml/cli/dataflow.py

Lines changed: 113 additions & 1 deletion

@@ -1,7 +1,7 @@
 import pathlib
 import hashlib
 import contextlib
-from typing import List
+from typing import List, Dict, Any
 
 from ..base import BaseConfig
 from ..df.base import BaseOrchestrator, OperationImplementation
@@ -27,6 +27,7 @@
 )
 from ..util.cli.parser import ParseInputsAction
 from ..base import config, field
+from ..high_level import run as run_dataflow
 
 
 @config
@@ -329,9 +330,120 @@ class RunRecords(CMD):
     _all = RunAllRecords
 
 
+@config
+class RunSingleConfig:
+    dataflow: str = field(
+        "File containing exported DataFlow", required=True,
+    )
+    no_echo: bool = field(
+        "Do not echo back records", default=False,
+    )
+    configloader: BaseConfigLoader = field(
+        "ConfigLoader to use for importing DataFlow", default=None,
+    )
+    orchestrator: BaseOrchestrator = field(
+        "Orchestrator", default=MemoryOrchestrator,
+    )
+    inputs: List[str] = field(
+        "Other inputs to add under each ctx",
+        action=ParseInputsAction,
+        default_factory=lambda: [],
+    )
+    no_strict: bool = field(
+        "Do not exit on operation exceptions, just log errors", default=False,
+    )
+
+
+class RunSingle(CMD):
+    CONFIG = RunSingleConfig
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.orchestrator = self.orchestrator.withconfig(self.extra_config)
+
+    async def get_dataflow(self, dataflow_path):
+        dataflow_path = pathlib.Path(dataflow_path)
+        config_cls = self.configloader
+        if config_cls is None:
+            config_type = dataflow_path.suffix.replace(".", "")
+            config_cls = BaseConfigLoader.load(config_type)
+        async with config_cls.withconfig(self.extra_config) as configloader:
+            async with configloader() as loader:
+                exported = await loader.loadb(dataflow_path.read_bytes())
+                dataflow = DataFlow._fromdict(**exported)
+        return dataflow
+
+    async def run(self):
+        dataflow = await self.get_dataflow(self.dataflow)
+        dataflow_inputs = []
+        for value, def_name in self.inputs:
+            dataflow_inputs.append(
+                Input(value=value, definition=dataflow.definitions[def_name],)
+            )
+        async for ctx, results in run_dataflow(
+            dataflow,
+            dataflow_inputs,
+            orchestrator=self.orchestrator,
+            strict=not self.no_strict,
+        ):
+            if not self.no_echo:
+                yield results
+        if self.no_echo:
+            yield CMDOutputOverride
+
+
+@config
+class RunContextsConfig(RunSingleConfig):
+    context_def: str = field(
+        "Definition to be used for contexts key. "
+        + "If set, the key will be added to the set of inputs "
+        + "under each context (which is also the contexts name)",
+        default=False,
+    )
+    contexts: List[str] = field(
+        "Contexts to run", default_factory=lambda: ["context1"], required=False
+    )
+
+
+class RunContexts(RunSingle):
+    CONFIG = RunContextsConfig
+
+    async def run(self):
+        dataflow = await self.get_dataflow(self.dataflow)
+        common_inputs = []
+        for value, def_name in self.inputs:
+            common_inputs.append(
+                Input(value=value, definition=dataflow.definitions[def_name],)
+            )
+
+        dataflow_inputs = {
+            ctx_string: [
+                Input(
+                    value=ctx_string,
+                    definition=dataflow.definitions[self.context_def],
+                )
+            ]
+            + common_inputs
+            for ctx_string in self.contexts
+        }
+
+        async for ctx, result in run_dataflow(
+            dataflow,
+            dataflow_inputs,
+            orchestrator=self.orchestrator,
+            strict=not self.no_strict,
+        ):
+            if not self.no_echo:
+                yield {(await ctx.handle()).as_string(): result}
+        if self.no_echo:
+            yield CMDOutputOverride
+
+
 class Run(CMD):
     """Run dataflow"""
 
+    single = RunSingle
+    contexts = RunContexts
     records = RunRecords
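
For orientation, here is a minimal standalone sketch of what the new `single` subcommand does under the hood. This is not part of the commit: the `df.json` path and the `get_single_spec`/`calc_string` definition names are borrowed from the tests further down, and it uses only APIs visible in this diff (`dffml.high_level.run`, `DataFlow._fromdict`, `Input`), assuming `dffml.df.types` as their import location.

    import json
    import asyncio
    import pathlib

    from dffml.df.types import DataFlow, Input
    from dffml.high_level import run as run_dataflow


    async def main():
        # Load a DataFlow exported earlier, e.g. via:
        #   dffml dataflow create <operations...> -configloader json > df.json
        exported = json.loads(pathlib.Path("df.json").read_text())
        dataflow = DataFlow._fromdict(**exported)
        # Mirror the two -inputs from TestDataflowRunSingle below: tell
        # get_single which definition to output, and feed the calc string.
        inputs = [
            Input(value=["result"], definition=dataflow.definitions["get_single_spec"]),
            Input(value="add 40 and 2", definition=dataflow.definitions["calc_string"]),
        ]
        # Same call shape as RunSingle.run: a flat list of inputs
        async for _ctx, results in run_dataflow(dataflow, inputs):
            print(results)  # expected with the test operations: {'result': 42}


    asyncio.run(main())

`RunContexts` differs only in that it hands `run` a dict mapping each context string to that context's input list, so results come back as one `(ctx, result)` pair per context.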

docs/cli.rst

Lines changed: 19 additions & 3 deletions (the three deletions appear to be trailing-whitespace fixes)

@@ -150,12 +150,12 @@ All
 
 Update all the records in any source using the :py:class:`DataFlowSource <dffml.source.df.DataFlowSource>`.
 
-For this example, we are using the `multiply` operation which multiplies every value in a record by a
+For this example, we are using the `multiply` operation which multiplies every value in a record by a
 factor which is 10 in this case. The example dataflow file looks like this:
 
 .. literalinclude:: /../examples/edit_records.yaml
 
-Create a source file:
+Create a source file:
 
 .. code-block:: console
 
@@ -175,7 +175,7 @@ Run the command:
         -sources f=csv -source-filename data.csv -source-readwrite \
         -features Years:int:1 Expertise:int:1 Trust:float:1 Salary:int:1 \
         -dataflow edit_records.yaml
-    $ dffml list records -sources f=csv -source-filename data.csv
+    $ dffml list records -sources f=csv -source-filename data.csv
     [
         {
             "extra": {},
@@ -325,6 +325,22 @@ command during generation.
     {'hello': 'world'}
     {'hello': 'user'}
 
+We can also run the dataflow without using a source:
+
+.. code-block:: console
+
+    $ dffml dataflow run contexts \
+        -no-echo \
+        -dataflow df.yaml \
+        -context-def value \
+        -contexts \
+          world \
+          $USER \
+        -input \
+          hello=key
+    {'hello': 'world'}
+    {'hello': 'user'}
+
 Diagram
 ~~~~~~~

tests/test_cli.py

Lines changed: 75 additions & 1 deletion

@@ -9,7 +9,7 @@
 import contextlib
 from pathlib import Path
 from unittest.mock import patch
-from typing import List, AsyncIterator
+from typing import List, AsyncIterator, Dict
 
 from dffml.record import Record
 from dffml.feature import Feature, Features
@@ -21,17 +21,21 @@
 from dffml.model.accuracy import Accuracy as AccuracyType
 from dffml.util.entrypoint import entrypoint
 from dffml.util.asynctestcase import (
+    AsyncTestCase,
     AsyncExitStackTestCase,
     non_existant_tempfile,
 )
 from dffml.base import config
+from dffml.df.base import op
 from dffml.cli.cli import Merge
 from dffml.cli.ml import Train, Accuracy, Predict
 from dffml.cli.list import List
 from dffml.cli.dataflow import Dataflow
 
 from .test_df import OPERATIONS, OPIMPS
 
+from dffml import op, DataFlow, Definition
+
 
 class RecordsTestCase(AsyncExitStackTestCase):
     async def setUp(self):
@@ -350,6 +354,76 @@ async def test_run(self):
         shutil.rmtree(tmpdir)
 
 
+class TestDataflowRunSingle(AsyncTestCase):
+    async def test_run(self):
+        tmpdir = tempfile.mkdtemp()
+        handle, dataflow_file = tempfile.mkstemp(suffix=".json", dir=tmpdir)
+        os.close(handle)
+        with open(dataflow_file, mode="w+b") as dataflow_file:
+            dataflow = io.StringIO()
+            with contextlib.redirect_stdout(dataflow):
+                await Dataflow.cli(
+                    "create",
+                    "-configloader",
+                    "json",
+                    *map(lambda op: op.name, OPERATIONS),
+                )
+            dataflow_file.write(dataflow.getvalue().encode())
+            dataflow_file.seek(0)
+            results = await Dataflow.cli(
+                "run",
+                "single",
+                "-dataflow",
+                dataflow_file.name,
+                "-inputs",
+                '["result"]=get_single_spec',
+                "add 40 and 2=calc_string",
+            )
+            self.assertEqual(len(results), 1)
+            self.assertEqual(results[0], {"result": 42})
+        shutil.rmtree(tmpdir)
+
+
+class TestDataflowRunContexts(AsyncTestCase):
+    async def test_run(self):
+        tmpdir = tempfile.mkdtemp()
+        handle, dataflow_file = tempfile.mkstemp(suffix=".json", dir=tmpdir)
+        os.close(handle)
+
+        with open(dataflow_file, mode="w+b") as dataflow_file:
+            dataflow = io.StringIO()
+            with contextlib.redirect_stdout(dataflow):
+                await Dataflow.cli(
+                    "create",
+                    "-configloader",
+                    "json",
+                    *map(lambda op: op.name, OPERATIONS),
+                )
+            dataflow_file.write(dataflow.getvalue().encode())
+            dataflow_file.seek(0)
+            test_contexts = {"add 40 and 2": 42, "multiply 42 and 10": 420}
+            results = await Dataflow.cli(
+                "run",
+                "contexts",
+                "-dataflow",
+                dataflow_file.name,
+                "-context-def",
+                "calc_string",
+                "-contexts",
+                *test_contexts.keys(),
+                "-input",
+                '["result"]=get_single_spec',
+            )
+            self.assertCountEqual(
+                results,
+                [
+                    {ctx_string: {"result": result}}
+                    for ctx_string, result in test_contexts.items()
+                ],
+            )
+        shutil.rmtree(tmpdir)
+
+
 class TestTrain(RecordsTestCase):
     async def test_run(self):
         await Train.cli(
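
For reference, `TestDataflowRunSingle` above exercises the equivalent of the following shell invocation. This is a sketch, not part of the commit: it assumes `df.json` was produced by `dffml dataflow create` over the test suite's OPERATIONS, and the echoed output line is inferred from the test's expected result.

    $ dffml dataflow run single \
        -dataflow df.json \
        -inputs \
          '["result"]=get_single_spec' \
          'add 40 and 2=calc_string'
    {'result': 42}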
