|
1 | | -""" |
2 | | -orchestration engine for GraphGen |
3 | | -""" |
| 1 | +import inspect |
| 2 | +import logging |
| 3 | +from collections import defaultdict, deque |
| 4 | +from functools import wraps |
| 5 | +from typing import Any, Callable, Dict, List, Set |
4 | 6 |
|
5 | | -import threading |
6 | | -import traceback |
7 | | -from typing import Any, Callable, List |
| 7 | +import ray |
| 8 | +import ray.data |
8 | 9 |
|
| 10 | +from graphgen.bases import Config, Node |
9 | 11 |
|
-class Context(dict):
-    _lock = threading.Lock()
 
-    def set(self, k, v):
-        with self._lock:
-            self[k] = v
+class Engine:
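+    """Orchestrate a DAG of Ray Data transformation nodes defined by the config."""
+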
+    def __init__(
+        self, config: Dict[str, Any], functions: Dict[str, Callable], **ray_init_kwargs
+    ):
+        self.config = Config(**config)
+        self.functions = functions
+        self.datasets: Dict[str, ray.data.Dataset] = {}
+
+        if not ray.is_initialized():
+            context = ray.init(
+                ignore_reinit_error=True,
+                logging_level=logging.ERROR,
+                log_to_driver=True,
+                **ray_init_kwargs,
+            )
+            print(f"Ray Dashboard URL: {context.dashboard_url}")
 
-    def get(self, k, default=None):
-        with self._lock:
-            return super().get(k, default)
+    @staticmethod
+    def _topo_sort(nodes: List[Node]) -> List[Node]:
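+        # Kahn's algorithm: start from nodes with no dependencies and repeatedly
+        # emit nodes whose dependencies have all been emitted.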
+        id_to_node: Dict[str, Node] = {}
+        for n in nodes:
+            id_to_node[n.id] = n
+
+        indeg: Dict[str, int] = {nid: 0 for nid in id_to_node}
+        adj: Dict[str, List[str]] = defaultdict(list)
+
+        for n in nodes:
+            nid = n.id
+            deps: List[str] = n.dependencies
+            uniq_deps: Set[str] = set(deps)
+            for d in uniq_deps:
+                if d not in id_to_node:
+                    raise ValueError(
+                        f"The dependency node id {d} of node {nid} is not defined in the configuration."
+                    )
+                indeg[nid] += 1
+                adj[d].append(nid)
+
+        zero_deg: deque = deque(
+            [id_to_node[nid] for nid, deg in indeg.items() if deg == 0]
+        )
+        sorted_nodes: List[Node] = []
+
+        while zero_deg:
+            cur = zero_deg.popleft()
+            sorted_nodes.append(cur)
+            cur_id = cur.id
+            for nb_id in adj.get(cur_id, []):
+                indeg[nb_id] -= 1
+                if indeg[nb_id] == 0:
+                    zero_deg.append(id_to_node[nb_id])
+
+        if len(sorted_nodes) != len(nodes):
+            remaining = [nid for nid, deg in indeg.items() if deg > 0]
+            raise ValueError(
+                f"The configuration contains cycles and cannot be executed. Remaining nodes with indegree > 0: {remaining}"
+            )
 
+        return sorted_nodes
 
-class OpNode:
-    def __init__(
-        self, name: str, deps: List[str], func: Callable[["OpNode", Context], Any]
-    ):
-        self.name, self.deps, self.func = name, deps, func
+    def _get_input_dataset(
+        self, node: Node, initial_ds: ray.data.Dataset
+    ) -> ray.data.Dataset:
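+        # Fan-in: no deps -> the pipeline's initial dataset; one dep -> that
+        # dataset; several deps -> a union of same-schema datasets.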
+        deps = node.dependencies
 
+        if not deps:
+            return initial_ds
 
-class Engine:
-    def __init__(self, max_workers: int = 4):
-        self.max_workers = max_workers
-
-    def run(self, ops: List[OpNode], ctx: Context):
-        self._validate(ops)
-        name2op = {operation.name: operation for operation in ops}
-
-        # topological sort
-        graph = {n: set(name2op[n].deps) for n in name2op}
-        topo = []
-        q = [n for n, d in graph.items() if not d]
-        while q:
-            cur = q.pop(0)
-            topo.append(cur)
-            for child in [c for c, d in graph.items() if cur in d]:
-                graph[child].remove(cur)
-                if not graph[child]:
-                    q.append(child)
-
-        if len(topo) != len(ops):
+        if len(deps) == 1:
+            return self.datasets[deps[0]]
+
+        main_ds = self.datasets[deps[0]]
+        other_dss = [self.datasets[d] for d in deps[1:]]
+        if not all(ds.schema() == main_ds.schema() for ds in other_dss):
             raise ValueError(
-                "Cyclic dependencies detected among operations."
-                "Please check your configuration."
+                f"Union requires all datasets to have the same schema for node {node.id}"
             )
+        return main_ds.union(*other_dss)
+
+    def _execute_node(self, node: Node, initial_ds: ray.data.Dataset):
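+        # Dispatch on node type: "source" creates a dataset from its params;
+        # class operators run in an actor pool; plain functions are mapped
+        # over rows or batches.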
+        if node.op_name not in self.functions:
+            raise ValueError(f"Operator {node.op_name} not found for node {node.id}")
+
+        if node.type == "source":
+            op_handler = self.functions[node.op_name]
+            node_params = node.params
+            self.datasets[node.id] = op_handler(**node_params)
+            return
 
-        # semaphore for max_workers
-        sem = threading.Semaphore(self.max_workers)
-        done = {n: threading.Event() for n in name2op}
-        exc = {}
-
-        def _exec(n: str):
-            with sem:
-                for d in name2op[n].deps:
-                    done[d].wait()
-                if any(d in exc for d in name2op[n].deps):
-                    exc[n] = Exception("Skipped due to failed dependencies")
-                    done[n].set()
-                    return
-                try:
-                    name2op[n].func(name2op[n], ctx)
-                except Exception:
-                    exc[n] = traceback.format_exc()
-                done[n].set()
-
-        ts = [threading.Thread(target=_exec, args=(n,), daemon=True) for n in topo]
-        for t in ts:
-            t.start()
-        for t in ts:
-            t.join()
-        if exc:
-            raise RuntimeError(
-                "Some operations failed:\n"
-                + "\n".join(f"---- {op} ----\n{tb}" for op, tb in exc.items())
+        input_ds = self._get_input_dataset(node, initial_ds)
+
+        op_handler = self.functions[node.op_name]
+        node_params = dict(node.params)  # copy so the pops below don't mutate the node config
+
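+        # Class-based operators become Ray actors; the remaining node params
+        # are passed to the operator's constructor via fn_constructor_kwargs.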
+        if inspect.isclass(op_handler):
+            replicas = node_params.pop("replicas", 1)
+            batch_size = (
+                int(node_params.pop("batch_size"))
+                if "batch_size" in node_params
+                else "default"
             )
+            compute_resources = node_params.pop("compute_resources", {})
+
+            if node.type == "aggregate":
+                self.datasets[node.id] = input_ds.repartition(1).map_batches(
+                    op_handler,
+                    compute=ray.data.ActorPoolStrategy(min_size=1, max_size=1),
+                    batch_size=None,  # aggregate processes the whole dataset at once
+                    num_gpus=compute_resources.get("num_gpus", 0),
+                    fn_constructor_kwargs=node_params,
+                    batch_format="pandas",
+                )
+            else:
+                # map / filter / flatmap / map_batch: the actor pool processes
+                # the dataset batch by batch
+                self.datasets[node.id] = input_ds.map_batches(
+                    op_handler,
+                    compute=ray.data.ActorPoolStrategy(min_size=1, max_size=replicas),
+                    batch_size=batch_size,
+                    num_gpus=compute_resources.get("num_gpus", 0),
+                    fn_constructor_kwargs=node_params,
+                    batch_format="pandas",
+                )
 
-    @staticmethod
-    def _validate(ops: List[OpNode]):
-        name_set = set()
-        for op in ops:
-            if op.name in name_set:
-                raise ValueError(f"Duplicate operation name: {op.name}")
-            name_set.add(op.name)
-        for op in ops:
-            for dep in op.deps:
-                if dep not in name_set:
-                    raise ValueError(
-                        f"Operation {op.name} has unknown dependency: {dep}"
-                    )
+        else:
 
+            @wraps(op_handler)
+            def func_wrapper(row_or_batch: Dict[str, Any]) -> Dict[str, Any]:
+                return op_handler(row_or_batch, **node_params)
+
+            if node.type == "map":
+                self.datasets[node.id] = input_ds.map(func_wrapper)
+            elif node.type == "filter":
+                self.datasets[node.id] = input_ds.filter(func_wrapper)
+            elif node.type == "flatmap":
+                self.datasets[node.id] = input_ds.flat_map(func_wrapper)
+            elif node.type == "aggregate":
+                # batch_size=None so the function sees the whole dataset, matching the class branch
+                self.datasets[node.id] = input_ds.repartition(1).map_batches(
+                    func_wrapper, batch_size=None, batch_format="default"
+                )
+            elif node.type == "map_batch":
+                self.datasets[node.id] = input_ds.map_batches(func_wrapper)
+            else:
+                raise ValueError(
+                    f"Unsupported node type {node.type} for node {node.id}"
+                )
 
-def collect_ops(config: dict, graph_gen) -> List[OpNode]:
-    """
-    build operation nodes from yaml config
-    :param config
-    :param graph_gen
-    """
-    ops: List[OpNode] = []
-    for stage in config["pipeline"]:
-        name = stage["name"]
-        method_name = stage.get("op_key")
-        method = getattr(graph_gen, method_name)
-        deps = stage.get("deps", [])
+    @staticmethod
+    def _find_leaf_nodes(nodes: List[Node]) -> Set[str]:
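+        # A leaf is a node that no other node depends on; its dataset is one of
+        # the pipeline's outputs.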
+        all_ids = {n.id for n in nodes}
+        deps_set = set()
+        for n in nodes:
+            deps_set.update(n.dependencies)
+        return all_ids - deps_set
 
-        if "params" in stage:
+    def execute(self, initial_ds: ray.data.Dataset) -> Dict[str, List[Any]]:
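+        # Build every node's dataset lazily in dependency order, then
+        # materialize only the leaf datasets.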
+        sorted_nodes = self._topo_sort(self.config.nodes)
 
-            def func(self, ctx, _method=method, _params=stage.get("params", {})):
-                return _method(_params)
+        for node in sorted_nodes:
+            self._execute_node(node, initial_ds)
 
-        else:
+        leaf_nodes = self._find_leaf_nodes(sorted_nodes)
 
-            def func(self, ctx, _method=method):
-                return _method()
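+        # Fetch each leaf dataset in its own Ray task so the leaves materialize
+        # in parallel.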
+        @ray.remote
+        def _fetch_result(ds: ray.data.Dataset) -> List[Any]:
+            return ds.take_all()
 
-        op_node = OpNode(name=name, deps=deps, func=func)
-        ops.append(op_node)
-        return ops
+        leaf_ids = list(leaf_nodes)  # freeze the set's iteration order before zipping
+        results = ray.get(
+            [_fetch_result.remote(self.datasets[node_id]) for node_id in leaf_ids]
+        )
+        return dict(zip(leaf_ids, results))