
Commit a6aedaf

refactor: refactor partition to accommodate ray data
1 parent d7d6c2a commit a6aedaf

File tree: 8 files changed (+203, -242 lines)


graphgen/bases/base_partitioner.py

Lines changed: 22 additions & 27 deletions
@@ -7,7 +7,7 @@
 
 class BasePartitioner(ABC):
     @abstractmethod
-    async def partition(
+    def partition(
         self,
         g: BaseGraphStorage,
         **kwargs: Any,
@@ -20,39 +20,34 @@ async def partition(
         """
 
     @staticmethod
-    async def community2batch(
-        communities: List[Community], g: BaseGraphStorage
-    ) -> list[
-        tuple[
-            list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]]
-        ]
+    def community2batch(
+        comm: Community, g: BaseGraphStorage
+    ) -> tuple[
+        list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]]
     ]:
         """
         Convert communities to batches of nodes and edges.
-        :param communities
+        :param comm: Community
         :param g: Graph storage instance
         :return: List of batches, each batch is a tuple of (nodes, edges)
         """
-        batches = []
-        for comm in communities:
-            nodes = comm.nodes
-            edges = comm.edges
-            nodes_data = []
-            for node in nodes:
-                node_data = g.get_node(node)
-                if node_data:
-                    nodes_data.append((node, node_data))
-            edges_data = []
-            for u, v in edges:
-                edge_data = g.get_edge(u, v)
+        nodes = comm.nodes
+        edges = comm.edges
+        nodes_data = []
+        for node in nodes:
+            node_data = g.get_node(node)
+            if node_data:
+                nodes_data.append((node, node_data))
+        edges_data = []
+        for u, v in edges:
+            edge_data = g.get_edge(u, v)
+            if edge_data:
+                edges_data.append((u, v, edge_data))
+            else:
+                edge_data = g.get_edge(v, u)
                 if edge_data:
-                    edges_data.append((u, v, edge_data))
-                else:
-                    edge_data = g.get_edge(v, u)
-                    if edge_data:
-                        edges_data.append((v, u, edge_data))
-            batches.append((nodes_data, edges_data))
-        return batches
+                    edges_data.append((v, u, edge_data))
+        return nodes_data, edges_data
 
     @staticmethod
     def _build_adjacency_list(
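
Note (not part of the diff): a minimal consumption sketch of the refactored API. Import paths are assumed from the file layout above, and the iter_batches helper is hypothetical. It shows how the now-synchronous, generator-style partition() streams communities one at a time, and how the per-community community2batch() turns each one into a (nodes, edges) batch, e.g. before feeding a Ray Data pipeline.

# Hypothetical consumption sketch; not part of this commit.
# Import paths are assumed from the file layout shown in this diff.
from graphgen.bases import BasePartitioner
from graphgen.models.partitioner.bfs_partitioner import BFSPartitioner


def iter_batches(g, max_units_per_community: int = 10):
    """Yield one (nodes_data, edges_data) batch per community, lazily."""
    partitioner = BFSPartitioner()
    # partition() is now a plain generator: communities stream out one by one
    # instead of being collected into a List[Community] under an event loop.
    for community in partitioner.partition(
        g, max_units_per_community=max_units_per_community
    ):
        # community2batch() now maps a single Community to a single batch,
        # rather than a list of communities to a list of batches.
        yield BasePartitioner.community2batch(community, g)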

graphgen/models/partitioner/anchor_bfs_partitioner.py

Lines changed: 9 additions & 14 deletions
@@ -1,6 +1,6 @@
 import random
 from collections import deque
-from typing import Any, List, Literal, Set, Tuple
+from typing import Any, Iterable, List, Literal, Set, Tuple
 
 from graphgen.bases import BaseGraphStorage
 from graphgen.bases.datatypes import Community
@@ -30,42 +30,37 @@ def __init__(
         self.anchor_type = anchor_type
         self.anchor_ids = anchor_ids
 
-    async def partition(
+    def partition(
         self,
         g: BaseGraphStorage,
         max_units_per_community: int = 1,
         **kwargs: Any,
-    ) -> List[Community]:
+    ) -> Iterable[Community]:
         nodes = g.get_all_nodes()  # List[tuple[id, meta]]
         edges = g.get_all_edges()  # List[tuple[u, v, meta]]
 
         adj, _ = self._build_adjacency_list(nodes, edges)
 
-        anchors: Set[str] = await self._pick_anchor_ids(nodes)
+        anchors: Set[str] = self._pick_anchor_ids(nodes)
         if not anchors:
-            return []  # if no anchors, return empty list
+            return  # if no anchors, return nothing
 
         used_n: set[str] = set()
         used_e: set[frozenset[str]] = set()
-        communities: List[Community] = []
 
         seeds = list(anchors)
         random.shuffle(seeds)
 
         for seed_node in seeds:
             if seed_node in used_n:
                 continue
-            comm_n, comm_e = await self._grow_community(
+            comm_n, comm_e = self._grow_community(
                 seed_node, adj, max_units_per_community, used_n, used_e
             )
             if comm_n or comm_e:
-                communities.append(
-                    Community(id=len(communities), nodes=comm_n, edges=comm_e)
-                )
+                yield Community(id=seed_node, nodes=comm_n, edges=comm_e)
 
-        return communities
-
-    async def _pick_anchor_ids(
+    def _pick_anchor_ids(
         self,
         nodes: List[tuple[str, dict]],
     ) -> Set[str]:
@@ -80,7 +75,7 @@ async def _pick_anchor_ids(
         return anchor_ids
 
     @staticmethod
-    async def _grow_community(
+    def _grow_community(
         seed: str,
         adj: dict[str, List[str]],
         max_units: int,

graphgen/models/partitioner/bfs_partitioner.py

Lines changed: 4 additions & 9 deletions
@@ -1,6 +1,6 @@
 import random
 from collections import deque
-from typing import Any, List
+from typing import Any, Iterable, List
 
 from graphgen.bases import BaseGraphStorage, BasePartitioner
 from graphgen.bases.datatypes import Community
@@ -17,20 +17,19 @@ class BFSPartitioner(BasePartitioner):
     (A unit is a node or an edge.)
     """
 
-    async def partition(
+    def partition(
         self,
         g: BaseGraphStorage,
         max_units_per_community: int = 1,
         **kwargs: Any,
-    ) -> List[Community]:
+    ) -> Iterable[Community]:
         nodes = g.get_all_nodes()
         edges = g.get_all_edges()
 
         adj, _ = self._build_adjacency_list(nodes, edges)
 
         used_n: set[str] = set()
         used_e: set[frozenset[str]] = set()
-        communities: List[Community] = []
 
         units = [(NODE_UNIT, n[0]) for n in nodes] + [
             (EDGE_UNIT, frozenset((u, v))) for u, v, _ in edges
@@ -74,8 +73,4 @@ async def partition(
                         queue.append((NODE_UNIT, n))
 
             if comm_n or comm_e:
-                communities.append(
-                    Community(id=len(communities), nodes=comm_n, edges=comm_e)
-                )
-
-        return communities
+                yield Community(id=seed, nodes=comm_n, edges=comm_e)

graphgen/models/partitioner/dfs_partitioner.py

Lines changed: 4 additions & 8 deletions
@@ -1,4 +1,5 @@
 import random
+from collections.abc import Iterable
 from typing import Any, List
 
 from graphgen.bases import BaseGraphStorage, BasePartitioner
@@ -16,20 +17,19 @@ class DFSPartitioner(BasePartitioner):
     (In GraphGen, a unit is defined as a node or an edge.)
     """
 
-    async def partition(
+    def partition(
         self,
         g: BaseGraphStorage,
         max_units_per_community: int = 1,
         **kwargs: Any,
-    ) -> List[Community]:
+    ) -> Iterable[Community]:
         nodes = g.get_all_nodes()
         edges = g.get_all_edges()
 
         adj, _ = self._build_adjacency_list(nodes, edges)
 
         used_n: set[str] = set()
         used_e: set[frozenset[str]] = set()
-        communities: List[Community] = []
 
         units = [(NODE_UNIT, n[0]) for n in nodes] + [
             (EDGE_UNIT, frozenset((u, v))) for u, v, _ in edges
@@ -71,8 +71,4 @@ async def partition(
                         stack.append((NODE_UNIT, n))
 
             if comm_n or comm_e:
-                communities.append(
-                    Community(id=len(communities), nodes=comm_n, edges=comm_e)
-                )
-
-        return communities
+                yield Community(id=seed, nodes=comm_n, edges=comm_e)

graphgen/models/partitioner/ece_partitioner.py

Lines changed: 15 additions & 17 deletions
@@ -1,8 +1,8 @@
-import asyncio
 import random
+from collections import deque
 from typing import Any, Dict, List, Optional, Set, Tuple
 
-from tqdm.asyncio import tqdm as tqdm_async
+from tqdm import tqdm
 
 from graphgen.bases import BaseGraphStorage
 from graphgen.bases.datatypes import Community
@@ -51,7 +51,7 @@ def _sort_units(units: list, edge_sampling: str) -> list:
             raise ValueError(f"Invalid edge sampling: {edge_sampling}")
         return units
 
-    async def partition(
+    def partition(
         self,
         g: BaseGraphStorage,
         max_units_per_community: int = 10,
@@ -73,21 +73,19 @@ async def partition(
 
         used_n: Set[str] = set()
         used_e: Set[frozenset[str]] = set()
-        communities: List = []
+        communities: List[Community] = []
 
         all_units = self._sort_units(all_units, unit_sampling)
 
-        async def _grow_community(
-            seed_unit: Tuple[str, Any, dict]
-        ) -> Optional[Community]:
+        def _grow_community(seed_unit: Tuple[str, Any, dict]) -> Optional[Community]:
            nonlocal used_n, used_e
 
            community_nodes: Dict[str, dict] = {}
            community_edges: Dict[frozenset[str], dict] = {}
-            queue: asyncio.Queue = asyncio.Queue()
+            queue = deque()
            token_sum = 0
 
-            async def _add_unit(u):
+            def _add_unit(u):
                nonlocal token_sum
                t, i, d = u
                if t == NODE_UNIT:  # node
@@ -103,19 +101,19 @@ async def _add_unit(u):
                    token_sum += d.get("length", 0)
                return True
 
-            await _add_unit(seed_unit)
-            await queue.put(seed_unit)
+            _add_unit(seed_unit)
+            queue.append(seed_unit)
 
            # BFS
-            while not queue.empty():
+            while queue:
                if (
                    len(community_nodes) + len(community_edges)
                    >= max_units_per_community
                    or token_sum >= max_tokens_per_community
                ):
                    break
 
-                cur_type, cur_id, _ = await queue.get()
+                cur_type, cur_id, _ = queue.popleft()
 
                neighbors: List[Tuple[str, Any, dict]] = []
                if cur_type == NODE_UNIT:
@@ -136,8 +134,8 @@ async def _add_unit(u):
                        or token_sum >= max_tokens_per_community
                    ):
                        break
-                    if await _add_unit(nb):
-                        await queue.put(nb)
+                    if _add_unit(nb):
+                        queue.append(nb)
 
            if len(community_nodes) + len(community_edges) < min_units_per_community:
                return None
@@ -148,13 +146,13 @@ async def _add_unit(u):
                edges=[(u, v) for (u, v), _ in community_edges.items()],
            )
 
-        async for unit in tqdm_async(all_units, desc="ECE partition"):
+        for unit in tqdm(all_units, desc="ECE partition"):
            utype, uid, _ = unit
            if (utype == NODE_UNIT and uid in used_n) or (
                utype == EDGE_UNIT and uid in used_e
            ):
                continue
-            comm = await _grow_community(unit)
+            comm = _grow_community(unit)
            if comm is not None:
                communities.append(comm)
 
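Note (not part of the diff): with the async layer removed in this file, a plain collections.deque gives the same FIFO traversal as asyncio.Queue without an event loop. The bfs_units helper and the toy adjacency map below are illustrative, not taken from the repository.

# Illustrative only: synchronous BFS over an adjacency map using collections.deque.
from collections import deque


def bfs_units(seed: str, adj: dict[str, list[str]], max_units: int) -> set[str]:
    """Collect up to max_units nodes reachable from seed, breadth-first."""
    visited = {seed}
    queue = deque([seed])  # popleft() gives FIFO order, no await needed
    while queue and len(visited) < max_units:
        cur = queue.popleft()
        for nb in adj.get(cur, []):
            if nb not in visited and len(visited) < max_units:
                visited.add(nb)
                queue.append(nb)
    return visited


# Example: bfs_units("a", {"a": ["b", "c"], "b": ["d"]}, 3) -> {"a", "b", "c"}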
graphgen/models/partitioner/leiden_partitioner.py

Lines changed: 5 additions & 9 deletions
@@ -13,7 +13,7 @@ class LeidenPartitioner(BasePartitioner):
     Leiden partitioner that partitions the graph into communities using the Leiden algorithm.
     """
 
-    async def partition(
+    def partition(
         self,
         g: BaseGraphStorage,
         max_size: int = 20,
@@ -37,12 +37,10 @@ async def partition(
         nodes = g.get_all_nodes()  # List[Tuple[str, dict]]
         edges = g.get_all_edges()  # List[Tuple[str, str, dict]]
 
-        node2cid: Dict[str, int] = await self._run_leiden(
-            nodes, edges, use_lcc, random_seed
-        )
+        node2cid: Dict[str, int] = self._run_leiden(nodes, edges, use_lcc, random_seed)
 
         if max_size is not None and max_size > 0:
-            node2cid = await self._split_communities(node2cid, max_size)
+            node2cid = self._split_communities(node2cid, max_size)
 
         cid2nodes: Dict[int, List[str]] = defaultdict(list)
         for n, cid in node2cid.items():
@@ -58,7 +56,7 @@ async def partition(
         return communities
 
     @staticmethod
-    async def _run_leiden(
+    def _run_leiden(
         nodes: List[Tuple[str, dict]],
         edges: List[Tuple[str, str, dict]],
         use_lcc: bool = False,
@@ -92,9 +90,7 @@ async def _run_leiden(
         return node2cid
 
     @staticmethod
-    async def _split_communities(
-        node2cid: Dict[str, int], max_size: int
-    ) -> Dict[str, int]:
+    def _split_communities(node2cid: Dict[str, int], max_size: int) -> Dict[str, int]:
         """
         Split communities larger than max_size into smaller sub-communities.
         """
Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-from .partition_kg import partition_kg
+from .partition_service import PartitionService
