Skip to content

Commit 3e3b0ff

Browse files
Merge pull request #59 from open-sciencelab/partitioner
refactor: Partitioner & Generator
2 parents 8eba85a + fed4baa commit 3e3b0ff

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

63 files changed

+1775
-1478
lines changed

.pylintrc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -308,7 +308,7 @@ max-public-methods=20
308308
max-returns=6
309309

310310
# Maximum number of statements in function / method body.
311-
max-statements=50
311+
max-statements=60
312312

313313
# Minimum number of public methods for a class (see R0903).
314314
min-public-methods=2

graphgen/bases/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
1+
from .base_generator import BaseGenerator
12
from .base_kg_builder import BaseKGBuilder
23
from .base_llm_client import BaseLLMClient
4+
from .base_partitioner import BasePartitioner
35
from .base_reader import BaseReader
46
from .base_splitter import BaseSplitter
57
from .base_storage import (

graphgen/bases/base_generator.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
from abc import ABC, abstractmethod
2+
from dataclasses import dataclass
3+
from typing import Any
4+
5+
from graphgen.bases.base_llm_client import BaseLLMClient
6+
7+
8+
@dataclass
9+
class BaseGenerator(ABC):
10+
"""
11+
Generate QAs based on given prompts.
12+
"""
13+
14+
llm_client: BaseLLMClient
15+
16+
@staticmethod
17+
@abstractmethod
18+
def build_prompt(
19+
batch: tuple[list[tuple[str, dict]], list[tuple[Any, Any, dict]]]
20+
) -> str:
21+
"""Build prompt for LLM based on the given batch"""
22+
23+
@staticmethod
24+
@abstractmethod
25+
def parse_response(response: str) -> Any:
26+
"""Parse the LLM response and return the generated QAs"""
27+
28+
async def generate(
29+
self,
30+
batch: tuple[
31+
list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]]
32+
],
33+
) -> dict[str, Any]:
34+
"""
35+
Generate QAs based on a given batch.
36+
:param batch
37+
:return: QA pairs
38+
"""
39+
result = {}
40+
prompt = self.build_prompt(batch)
41+
response = await self.llm_client.generate_answer(prompt)
42+
qa_pairs = self.parse_response(response) # generate one or more QA pairs
43+
result.update(qa_pairs)
44+
return result
45+
46+
@staticmethod
47+
def format_generation_results(
48+
results: list[dict], output_data_format: str
49+
) -> list[dict[str, Any]]:
50+
if output_data_format == "Alpaca":
51+
results = [
52+
{
53+
"instruction": v["question"],
54+
"input": "",
55+
"output": v["answer"],
56+
}
57+
for item in results
58+
for k, v in item.items()
59+
]
60+
elif output_data_format == "Sharegpt":
61+
results = [
62+
{
63+
"conversations": [
64+
{"from": "human", "value": v["question"]},
65+
{"from": "gpt", "value": v["answer"]},
66+
]
67+
}
68+
for item in results
69+
for k, v in item.items()
70+
]
71+
elif output_data_format == "ChatML":
72+
results = [
73+
{
74+
"messages": [
75+
{"role": "user", "content": v["question"]},
76+
{"role": "assistant", "content": v["answer"]},
77+
]
78+
}
79+
for item in results
80+
for k, v in item.items()
81+
]
82+
else:
83+
raise ValueError(f"Unknown output data format: {output_data_format}")
84+
return results

graphgen/bases/base_partitioner.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
from abc import ABC, abstractmethod
2+
from dataclasses import dataclass
3+
from typing import Any, List
4+
5+
from graphgen.bases.base_storage import BaseGraphStorage
6+
from graphgen.bases.datatypes import Community
7+
8+
9+
@dataclass
class BasePartitioner(ABC):
    """Split a knowledge graph into communities and prepare generation batches."""

    @abstractmethod
    async def partition(
        self,
        g: BaseGraphStorage,
        **kwargs: Any,
    ) -> List[Community]:
        """
        Graph -> Communities
        :param g: Graph storage instance
        :param kwargs: Additional parameters for partitioning
        :return: List of communities
        """

    @staticmethod
    async def community2batch(
        communities: List[Community], g: BaseGraphStorage
    ) -> list[
        tuple[
            list[tuple[str, dict]], list[tuple[Any, Any, dict] | tuple[Any, Any, Any]]
        ]
    ]:
        """
        Convert communities to batches of nodes and edges.
        :param communities: communities to materialize
        :param g: Graph storage instance
        :return: List of batches, each batch is a tuple of (nodes, edges)
        """
        batches = []
        for community in communities:
            node_payloads = []
            for node_id in community.nodes:
                node_payload = await g.get_node(node_id)
                if node_payload:
                    node_payloads.append((node_id, node_payload))

            edge_payloads = []
            for u, v in community.edges:
                # The graph may store the edge in either orientation; keep the
                # first orientation that yields data (reverse tried only when
                # the forward lookup comes back empty).
                for src, dst in ((u, v), (v, u)):
                    edge_payload = await g.get_edge(src, dst)
                    if edge_payload:
                        edge_payloads.append((src, dst, edge_payload))
                        break

            batches.append((node_payloads, edge_payloads))
        return batches

    @staticmethod
    def _build_adjacency_list(
        nodes: List[tuple[str, dict]], edges: List[tuple[str, str, dict]]
    ) -> tuple[dict[str, List[str]], set[tuple[str, str]]]:
        """
        Build adjacency list and edge set from nodes and edges.
        :param nodes: (node_id, data) pairs
        :param edges: edge tuples whose first two items are the endpoints
        :return: adjacency list, edge set (both directions recorded)
        """
        neighbors: dict[str, List[str]] = {node_id: [] for node_id, _ in nodes}
        seen_pairs: set[tuple[str, str]] = set()
        for edge in edges:
            src, dst = edge[0], edge[1]
            neighbors[src].append(dst)
            neighbors[dst].append(src)
            seen_pairs.update(((src, dst), (dst, src)))
        return neighbors, seen_pairs

graphgen/bases/base_storage.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -78,7 +78,7 @@ async def get_node(self, node_id: str) -> Union[dict, None]:
7878
async def update_node(self, node_id: str, node_data: dict[str, str]):
7979
raise NotImplementedError
8080

81-
async def get_all_nodes(self) -> Union[list[dict], None]:
81+
async def get_all_nodes(self) -> Union[list[tuple[str, dict]], None]:
8282
raise NotImplementedError
8383

8484
async def get_edge(
@@ -91,7 +91,7 @@ async def update_edge(
9191
):
9292
raise NotImplementedError
9393

94-
async def get_all_edges(self) -> Union[list[dict], None]:
94+
async def get_all_edges(self) -> Union[list[tuple[str, str, dict]], None]:
9595
raise NotImplementedError
9696

9797
async def get_node_edges(

graphgen/bases/datatypes.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,11 @@ class Token:
3030
@property
3131
def logprob(self) -> float:
3232
return math.log(self.prob)
33+
34+
35+
@dataclass
class Community:
    """A sub-graph produced by a graph partitioner: member nodes plus edges."""

    # Unique identifier of the community (int or str depending on the algorithm).
    id: Union[int, str]
    # Ids of the member nodes.
    nodes: List[str] = field(default_factory=list)
    # Edges inside the community; presumably (source, target) pairs — confirm
    # against the partitioner that fills this in.
    edges: List[tuple] = field(default_factory=list)
    # Free-form extra information attached by the partitioner.
    metadata: dict = field(default_factory=dict)

graphgen/configs/aggregated_config.yaml

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,10 @@ quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
1313
partition: # graph partition configuration
1414
method: ece # ece is a custom partition method based on comprehension loss
1515
method_params:
16-
bidirectional: true # whether to traverse the graph in both directions
17-
edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
18-
expand_method: max_width # expand method, support: max_width, max_depth
19-
isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
20-
max_depth: 5 # maximum depth for graph traversal
21-
max_extra_edges: 20 # max edges per direction (if expand_method="max_width")
22-
max_tokens: 256 # restricts input length (if expand_method="max_tokens")
23-
loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
16+
max_units_per_community: 20 # max nodes and edges per community
17+
min_units_per_community: 5 # min nodes and edges per community
18+
max_tokens_per_community: 10240 # max tokens per community
19+
unit_sampling: max_loss # unit (node/edge) sampling strategy, support: random, max_loss, min_loss
2420
generate:
2521
mode: aggregated # atomic, aggregated, multi_hop, cot
2622
data_format: ChatML # Alpaca, Sharegpt, ChatML

graphgen/configs/atomic_config.yaml

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,16 +11,9 @@ quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
1111
quiz_samples: 2 # number of quiz samples to generate
1212
re_judge: false # whether to re-judge the existing quiz samples
1313
partition: # graph partition configuration
14-
method: ece # ece is a custom partition method based on comprehension loss
14+
method: dfs # partition method, support: dfs, bfs, ece, leiden
1515
method_params:
16-
bidirectional: true # whether to traverse the graph in both directions
17-
edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
18-
expand_method: max_width # expand method, support: max_width, max_depth
19-
isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
20-
max_depth: 3 # maximum depth for graph traversal
21-
max_extra_edges: 5 # max edges per direction (if expand_method="max_width")
22-
max_tokens: 256 # restricts input length (if expand_method="max_tokens")
23-
loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
16+
max_units_per_community: 1 # atomic partition, one node or edge per community
2417
generate:
2518
mode: atomic # atomic, aggregated, multi_hop, cot
2619
data_format: Alpaca # Alpaca, Sharegpt, ChatML

graphgen/configs/cot_config.yaml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,11 @@ search: # web search configuration
99
quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
1010
enabled: false
1111
partition: # graph partition configuration
12-
method: leiden # leiden is a community detection algorithm
12+
method: leiden # leiden is a community detection algorithm used for graph partitioning
1313
method_params:
1414
max_size: 20 # Maximum size of communities
15-
use_lcc: false
16-
random_seed: 42
15+
use_lcc: false # whether to use the largest connected component
16+
random_seed: 42 # random seed for partitioning
1717
generate:
1818
mode: cot # atomic, aggregated, multi_hop, cot
1919
data_format: Sharegpt # Alpaca, Sharegpt, ChatML

graphgen/configs/multi_hop_config.yaml

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,10 @@ quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
1313
partition: # graph partition configuration
1414
method: ece # ece is a custom partition method based on comprehension loss
1515
method_params:
16-
bidirectional: true # whether to traverse the graph in both directions
17-
edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
18-
expand_method: max_width # expand method, support: max_width, max_depth
19-
isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
20-
max_depth: 1 # maximum depth for graph traversal
21-
max_extra_edges: 2 # max edges per direction (if expand_method="max_width")
22-
max_tokens: 256 # restricts input length (if expand_method="max_tokens")
23-
loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
16+
max_units_per_community: 3 # max nodes and edges per community, for multi-hop, we recommend setting it to 3
17+
min_units_per_community: 3 # min nodes and edges per community, for multi-hop, we recommend setting it to 3
18+
max_tokens_per_community: 10240 # max tokens per community
19+
unit_sampling: random # unit (node/edge) sampling strategy, support: random, max_loss, min_loss
2420
generate:
2521
mode: multi_hop # strategy for generating multi-hop QA pairs
2622
data_format: ChatML # Alpaca, Sharegpt, ChatML

0 commit comments

Comments
 (0)