FEAT(#1,net,netop): PRUNE by node-PROPS

ankostis · ankostis · commit 3c5523fe0b4a · 2019-12-11T20:53:08.000+02:00
e.g. assign "colors" to nodes, and solve a subset each time.
diff --git a/graphtik/netop.py b/graphtik/netop.py
@@ -5,7 +5,7 @@
 import logging
 import re
 from collections import abc
-from typing import Collection
+from typing import Any, Callable, Collection, Mapping
 
 import networkx as nx
 from boltons.setutils import IndexedSet as iset
@@ -44,6 +44,7 @@ def __init__(
         *,
         inputs=None,
         outputs=None,
+        predicate: Callable[[Any, Mapping], bool] = None,
         method=None,
         overwrites_collector=None,
     ):
@@ -52,6 +53,8 @@ def __init__(
             see :meth:`narrow()`
         :param outputs:
             see :meth:`narrow()`
+        :param predicate:
+            a 2-argument callable(op, node-data), called only for operation nodes, that should return true for those to include
         :param method:
             either `parallel` or None (default);
             if ``"parallel"``, launches multi-threading.
@@ -64,19 +67,19 @@ def __init__(
         :raises ValueError:
             see :meth:`narrow()`
         """
+        ## Set data asap, for debugging, although `pruned()` will reset them.
         self.name = name
         self.inputs = inputs
         self.provides = outputs
-        # Prune network
-        self.net = net.pruned(inputs, outputs)
-        ## Set data asap, for debugging, although `prune()` will reset them.
         self.set_execution_method(method)
         self.set_overwrites_collector(overwrites_collector)
 
         # TODO: Is it really necessary to sroe IO on netop?
         self.inputs = inputs
         self.outputs = outputs
 
+        # Prune network
+        self.net = net.pruned(inputs, outputs, predicate)
         self.name, self.needs, self.provides = reparse_operation_data(
             self.name, self.net.needs, self.net.provides
         )
@@ -94,7 +97,11 @@ def __repr__(self):
         )
 
     def narrow(
-        self, inputs: Collection = None, outputs: Collection = None, name=None
+        self,
+        inputs: Collection = None,
+        outputs: Collection = None,
+        name=None,
+        predicate: Callable[[Any, Mapping], bool] = None,
     ) -> "NetworkOperation":
         """
         Return a copy with a network pruned for the given `needs` & `provides`.
@@ -118,6 +125,8 @@ def narrow(
                 <old-name>-<uid>
 
             - otherwise, the given `name` is applied.
+        :param predicate:
+            a 2-argument callable(op, node-data), called only for operation nodes, that should return true for those to include
 
         :return:
             A narrowed netop clone, which **MIGHT be empty!***
@@ -146,6 +155,7 @@ def narrow(
             name,
             inputs=inputs,
             outputs=outputs,
+            predicate=predicate,
             method=self.method,
             overwrites_collector=self.overwrites_collector,
         )
@@ -316,7 +326,7 @@ def proc_op(op, parent=None):
         ## Convey any node-props specified in the netop here
         #  to all sub-operations.
         #
-        if node_props or parent:
+        if node_props or (not merge and parent):
             kw = {}
             if node_props:
                 op_node_props = op.node_props.copy()
diff --git a/graphtik/network.py b/graphtik/network.py
@@ -72,7 +72,17 @@
 from collections import abc, defaultdict, namedtuple
 from contextvars import ContextVar
 from multiprocessing.dummy import Pool
-from typing import Collection, Iterable, List, Optional, Tuple, Union
+from typing import (
+    Any,
+    Callable,
+    Collection,
+    Iterable,
+    List,
+    Mapping,
+    Optional,
+    Tuple,
+    Union,
+)
 
 import networkx as nx
 from boltons.setutils import IndexedSet as iset
@@ -650,8 +660,23 @@ def _unsatisfied_operations(self, dag, inputs: Collection):
 
         return unsatisfied
 
+    def _apply_graph_predicate(self, graph, predicate):
+        to_del = []
+        for node, data in graph.nodes.items():
+            try:
+                if isinstance(node, Operation) and not predicate(node, data):
+                    to_del.append(node)
+            except Exception as ex:
+                raise ValueError(
+                    f"Node-predicate({predicate}) failed due to: {ex}\n  node: {node}, {self}"
+                ) from ex
+        graph.remove_nodes_from(to_del)
+
     def _prune_graph(
-        self, inputs: Optional[Collection], outputs: Optional[Collection]
+        self,
+        inputs: Optional[Collection],
+        outputs: Optional[Collection],
+        predicate: Callable[[Any, Mapping], bool] = None,
     ) -> Tuple[nx.DiGraph, Collection, Collection, Collection]:
         """
         Determines what graph steps need to run to get to the requested
@@ -667,6 +692,8 @@ def _prune_graph(
             The desired output names.  This can also be ``None``, in which
             case the necessary steps are all graph nodes that are reachable
             from the provided inputs.
+        :param predicate:
+            a 2-argument callable(op, node-data), called only for operation nodes, that should return true for those to include
 
         :return:
             a 4-tuple with the *pruned_dag*, the out-edges of the inputs,
@@ -719,6 +746,9 @@ def _prune_graph(
         broken_dag = dag.copy()  # preserve net's graph
         broken_edges = set()  # unordered, not iterated
 
+        if predicate:
+            self._apply_graph_predicate(broken_dag, predicate)
+
         # Break the incoming edges to all given inputs.
         #
         # Nodes producing any given intermediate inputs are unecessary
@@ -762,7 +792,10 @@ def _prune_graph(
         return pruned_dag, broken_edges, tuple(inputs), tuple(outputs)
 
     def pruned(
-        self, inputs: Collection = None, outputs: Collection = None
+        self,
+        inputs: Collection = None,
+        outputs: Collection = None,
+        predicate: Callable[[Any, Mapping], bool] = None,
     ) -> "Network":
         """
         Return a pruned network supporting just the given `inputs` & `outputs`.
@@ -771,19 +804,23 @@ def pruned(
             all possible inputs names
         :param outputs:
             all possible output names
+        :param predicate:
+            a 2-argument callable(op, node-data), called only for operation nodes, that should return true for those to include
 
         :return:
             the pruned clone, or this, if both `inputs` & `outputs` were `None`
         """
-        if inputs is None and outputs is None:
+        if (inputs, outputs, predicate) == (None, None, None):
             return self
 
         if inputs is not None:
             inputs = astuple(inputs, "outputs", allowed_types=(list, tuple))
         if outputs is not None:
             outputs = astuple(outputs, "outputs", allowed_types=(list, tuple))
 
-        pruned_dag, _br_edges, _needs, _provides = self._prune_graph(inputs, outputs)
+        pruned_dag, _br_edges, _needs, _provides = self._prune_graph(
+            inputs, outputs, predicate
+        )
         return Network(graph=pruned_dag)
 
     def _build_execution_steps(
diff --git a/test/test_graphtik.py b/test/test_graphtik.py
@@ -50,9 +50,9 @@ def filtdict(d, *keys):
     return type(d)(i for i in d.items() if i[0] in keys)
 
 
-def addall(*a):
+def addall(*a, **kw):
     "Same as a + b + ...."
-    return sum(a)
+    return sum(a) + sum(kw.values())
 
 
 def abspow(a, p):
@@ -321,13 +321,48 @@ def test_network_merge_in_doctests():
     assert merged_graph.provides
 
     assert (
-        repr(merged_graph) ==
-        "NetworkOperation('merged_graph', "
+        repr(merged_graph) == "NetworkOperation('merged_graph', "
         "needs=['a', 'b', 'ab', 'a_minus_ab', 'c'], "
         "provides=['ab', 'a_minus_ab', 'abs_a_minus_ab_cubed', 'cab'], x4ops)"
     )
 
 
+@pytest.fixture
+def samplenet():
+    # Set up a network such that we don't need to provide a, b or d if we only
+    # request sum3 as output and if we provide sum2.
+    sum_op1 = operation(name="sum_op1", needs=["a", "b"], provides="sum1")(add)
+    sum_op2 = operation(name="sum_op2", needs=["c", "d"], provides="sum2")(add)
+    sum_op3 = operation(name="sum_op3", needs=["c", "sum2"], provides="sum3")(add)
+    return compose("test_net", sum_op1, sum_op2, sum_op3)
+
+
+def test_node_props_based_prune():
+    netop = compose(
+        "N",
+        operation(name="A", needs=["a"], provides=["aa"], node_props={"color": "red"})(
+            identity
+        ),
+        operation(
+            name="B", needs=["b"], provides=["bb"], node_props={"color": "green"}
+        )(identity),
+        operation(name="C", needs=["c"], provides=["cc"])(identity),
+        operation(
+            name="SUM",
+            needs=[optional(i) for i in ("aa", "bb", "cc")],
+            provides=["sum"],
+        )(addall),
+    )
+    inp = {"a": 1, "b": 2, "c": 3}
+    # assert netop(**inp)["sum"] == 6
+
+    pred = lambda n, d: d.get("color", None) != "red"
+    assert netop.narrow(predicate=pred)(**inp)["sum"] == 5
+
+    pred = lambda n, d: "color" not in d
+    assert netop.narrow(predicate=pred)(**inp)["sum"] == 3
+
+
 def test_input_based_pruning():
     # Tests to make sure we don't need to pass graph inputs if we're provided
     # with data further downstream in the graph as an input.
@@ -349,72 +384,51 @@ def test_input_based_pruning():
     assert results["sum3"] == add(sum1, sum2)
 
 
-def test_output_based_pruning():
+def test_output_based_pruning(samplenet):
     # Tests to make sure we don't need to pass graph inputs if they're not
     # needed to compute the requested outputs.
 
     c = 2
     d = 3
 
-    # Set up a network such that we don't need to provide a or b if we only
-    # request sum3 as output.
-    sum_op1 = operation(name="sum_op1", needs=["a", "b"], provides="sum1")(add)
-    sum_op2 = operation(name="sum_op2", needs=["c", "d"], provides="sum2")(add)
-    sum_op3 = operation(name="sum_op3", needs=["c", "sum2"], provides="sum3")(add)
-    net = compose("test_net", sum_op1, sum_op2, sum_op3)
-
-    results = net.compute({"a": 0, "b": 0, "c": c, "d": d}, ["sum3"])
+    results = samplenet.compute({"a": 0, "b": 0, "c": c, "d": d}, ["sum3"])
 
     # Make sure we got expected result without having to pass a or b.
     assert "sum3" in results
     assert results["sum3"] == add(c, add(c, d))
 
 
-def test_deps_pruning_vs_narrowing():
+def test_deps_pruning_vs_narrowing(samplenet):
     # Tests to make sure we don't need to pass graph inputs if they're not
     # needed to compute the requested outputs or of we're provided with
     # inputs that are further downstream in the graph.
 
     c = 2
     sum2 = 5
 
-    # Set up a network such that we don't need to provide a or b d if we only
-    # request sum3 as output and if we provide sum2.
-    sum_op1 = operation(name="sum_op1", needs=["a", "b"], provides="sum1")(add)
-    sum_op2 = operation(name="sum_op2", needs=["c", "d"], provides="sum2")(add)
-    sum_op3 = operation(name="sum_op3", needs=["c", "sum2"], provides="sum3")(add)
-    net = compose("test_net", sum_op1, sum_op2, sum_op3)
-
-    results = net.compute({"c": c, "sum2": sum2}, ["sum3"])
+    results = samplenet.compute({"c": c, "sum2": sum2}, ["sum3"])
 
     # Make sure we got expected result without having to pass a, b, or d.
     assert "sum3" in results
     assert results["sum3"] == add(c, sum2)
 
     # Compare with both `narrow()`.
-    net = net.narrow(inputs=["c", "sum2"], outputs=["sum3"])
+    net = samplenet.narrow(inputs=["c", "sum2"], outputs=["sum3"])
     results = net(c=c, sum2=sum2)
 
     # Make sure we got expected result without having to pass a, b, or d.
     assert "sum3" in results
     assert results["sum3"] == add(c, sum2)
 
 
-def test_pruning_raises_for_bad_output():
+def test_pruning_raises_for_bad_output(samplenet):
     # Make sure we get a ValueError during the pruning step if we request an
     # output that doesn't exist.
 
-    # Set up a network that doesn't have the output sum4, which we'll request
-    # later.
-    sum_op1 = operation(name="sum_op1", needs=["a", "b"], provides="sum1")(add)
-    sum_op2 = operation(name="sum_op2", needs=["c", "d"], provides="sum2")(add)
-    sum_op3 = operation(name="sum_op3", needs=["c", "sum2"], provides="sum3")(add)
-    net = compose("test_net", sum_op1, sum_op2, sum_op3)
-
     # Request two outputs we can compute and one we can't compute.  Assert
     # that this raises a ValueError.
     with pytest.raises(ValueError) as exinfo:
-        net.compute({"a": 1, "b": 2, "c": 3, "d": 4}, ["sum1", "sum3", "sum4"])
+        samplenet.compute({"a": 1, "b": 2, "c": 3, "d": 4}, ["sum1", "sum3", "sum4"])
     assert exinfo.match("sum4")