refactor: refactor generate pipeline

ChenZiHong-Gavin · ChenZiHong-Gavin · commit 9434fd383f90 · 2025-09-30T14:03:32.000+08:00
diff --git a/.env.example b/.env.example
@@ -1,3 +1,4 @@
+TOKENIZER_MODEL=
 SYNTHESIZER_MODEL=
 SYNTHESIZER_BASE_URL=
 SYNTHESIZER_API_KEY=
diff --git a/graphgen/configs/aggregated_config.yaml b/graphgen/configs/aggregated_config.yaml
@@ -6,19 +6,21 @@ split:
 search: # web search configuration
   enabled: false # whether to enable web search
   search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
-output_data_type: aggregated # atomic, aggregated, multi_hop, cot
-output_data_format: ChatML # Alpaca, Sharegpt, ChatML
-tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
-quiz_and_judge_strategy: # quiz and test whether the LLM masters the knowledge points
+quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
   enabled: true
   quiz_samples: 2 # number of quiz samples to generate
   re_judge: false # whether to re-judge the existing quiz samples
-traverse_strategy: # strategy for clustering sub-graphs using comprehension loss
-  bidirectional: true # whether to traverse the graph in both directions
-  edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
-  expand_method: max_width # expand method, support: max_width, max_depth
-  isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
-  max_depth: 5 # maximum depth for graph traversal
-  max_extra_edges: 20 # max edges per direction (if expand_method="max_width")
-  max_tokens: 256 # restricts input length (if expand_method="max_tokens")
-  loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
+partition: # graph partition configuration
+  method: ece # ece is a custom partition method based on comprehension loss
+  ece_params:
+    bidirectional: true # whether to traverse the graph in both directions
+    edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
+    expand_method: max_width # expand method, support: max_width, max_tokens
+    isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
+    max_depth: 5 # maximum depth for graph traversal
+    max_extra_edges: 20 # max edges per direction (if expand_method="max_width")
+    max_tokens: 256 # restricts input length (if expand_method="max_tokens")
+    loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
+generate:
+  mode: aggregated # atomic, aggregated, multi_hop, cot
+  data_format: ChatML # Alpaca, Sharegpt, ChatML
diff --git a/graphgen/configs/atomic_config.yaml b/graphgen/configs/atomic_config.yaml
@@ -6,19 +6,21 @@ split:
 search: # web search configuration
   enabled: false # whether to enable web search
   search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
-output_data_type: atomic # atomic, aggregated, multi_hop, cot
-output_data_format: Alpaca # Alpaca, Sharegpt, ChatML
-tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
-quiz_and_judge_strategy: # quiz and test whether the LLM masters the knowledge points
+quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
   enabled: true
   quiz_samples: 2 # number of quiz samples to generate
   re_judge: false # whether to re-judge the existing quiz samples
-traverse_strategy: # strategy for clustering sub-graphs using comprehension loss
-  bidirectional: true # whether to traverse the graph in both directions
-  edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
-  expand_method: max_width # expand method, support: max_width, max_depth
-  isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
-  max_depth: 3 # maximum depth for graph traversal
-  max_extra_edges: 5 # max edges per direction (if expand_method="max_width")
-  max_tokens: 256 # restricts input length (if expand_method="max_tokens")
-  loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
+partition: # graph partition configuration
+  method: ece # ece is a custom partition method based on comprehension loss
+  ece_params:
+    bidirectional: true # whether to traverse the graph in both directions
+    edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
+    expand_method: max_width # expand method, support: max_width, max_tokens
+    isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
+    max_depth: 3 # maximum depth for graph traversal
+    max_extra_edges: 5 # max edges per direction (if expand_method="max_width")
+    max_tokens: 256 # restricts input length (if expand_method="max_tokens")
+    loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
+generate:
+  mode: atomic # atomic, aggregated, multi_hop, cot
+  data_format: Alpaca # Alpaca, Sharegpt, ChatML
diff --git a/graphgen/configs/cot_config.yaml b/graphgen/configs/cot_config.yaml
@@ -6,11 +6,14 @@ split:
 search: # web search configuration
   enabled: false # whether to enable web search
   search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
-output_data_type: cot # atomic, aggregated, multi_hop, cot
-output_data_format: Sharegpt # Alpaca, Sharegpt, ChatML
-tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
-method_params:
-  method: leiden
-  max_size: 20 # Maximum size of communities
-  use_lcc: false
-  random_seed: 42
+quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
+  enabled: false
+partition: # graph partition configuration
+  method: leiden # leiden is a community detection algorithm
+  leiden_params:
+    max_size: 20 # Maximum size of communities
+    use_lcc: false
+    random_seed: 42
+generate:
+  mode: cot # atomic, aggregated, multi_hop, cot
+  data_format: Sharegpt # Alpaca, Sharegpt, ChatML
diff --git a/graphgen/configs/multi_hop_config.yaml b/graphgen/configs/multi_hop_config.yaml
@@ -6,19 +6,21 @@ split:
 search: # web search configuration
   enabled: false # whether to enable web search
   search_types: ["google"] # search engine types, support: google, bing, uniprot, wikipedia
-output_data_type: multi_hop # atomic, aggregated, multi_hop, cot
-output_data_format: ChatML # Alpaca, Sharegpt, ChatML
-tokenizer: cl100k_base # tokenizer for counting tokens, support tiktoken tokenizer names and local tokenizer path
-quiz_and_judge_strategy: # quiz and test whether the LLM masters the knowledge points
+quiz_and_judge: # quiz and test whether the LLM masters the knowledge points
   enabled: false
   quiz_samples: 2 # number of quiz samples to generate
   re_judge: false # whether to re-judge the existing quiz samples
-traverse_strategy: # strategy for clustering sub-graphs using comprehension loss
-  bidirectional: true # whether to traverse the graph in both directions
-  edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
-  expand_method: max_width # expand method, support: max_width, max_depth
-  isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
-  max_depth: 1 # maximum depth for graph traversal
-  max_extra_edges: 2 # max edges per direction (if expand_method="max_width")
-  max_tokens: 256 # restricts input length (if expand_method="max_tokens")
-  loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
+partition: # graph partition configuration
+  method: ece # ece is a custom partition method based on comprehension loss
+  ece_params:
+    bidirectional: true # whether to traverse the graph in both directions
+    edge_sampling: max_loss # edge sampling strategy, support: random, max_loss, min_loss
+    expand_method: max_width # expand method, support: max_width, max_tokens
+    isolated_node_strategy: ignore # strategy for isolated nodes, support: ignore, add
+    max_depth: 1 # maximum depth for graph traversal
+    max_extra_edges: 2 # max edges per direction (if expand_method="max_width")
+    max_tokens: 256 # restricts input length (if expand_method="max_tokens")
+    loss_strategy: only_edge # defines loss computation focus, support: only_edge, both
+generate:
+  mode: multi_hop # generation mode, support: atomic, aggregated, multi_hop, cot
+  data_format: ChatML # Alpaca, Sharegpt, ChatML
diff --git a/graphgen/generate.py b/graphgen/generate.py
@@ -6,8 +6,8 @@
 import yaml
 from dotenv import load_dotenv
 
-from .graphgen import GraphGen
-from .utils import logger, set_logger
+from graphgen.graphgen import GraphGen
+from graphgen.utils import logger, set_logger
 
 sys_path = os.path.abspath(os.path.dirname(__file__))
 
@@ -50,12 +50,10 @@ def main():
     with open(args.config_file, "r", encoding="utf-8") as f:
         config = yaml.load(f, Loader=yaml.FullLoader)
 
-    output_data_type = config["output_data_type"]
+    mode = config["generate"]["mode"]
     unique_id = int(time.time())
 
-    output_path = os.path.join(
-        working_dir, "data", "graphgen", f"{unique_id}_{output_data_type}"
-    )
+    output_path = os.path.join(working_dir, "data", "graphgen", f"{unique_id}_{mode}")
     set_working_dir(output_path)
 
     set_logger(
@@ -65,35 +63,35 @@ def main():
     logger.info(
         "GraphGen with unique ID %s logging to %s",
         unique_id,
-        os.path.join(
-            working_dir, "logs", f"{unique_id}_graphgen_{output_data_type}.log"
-        ),
+        os.path.join(working_dir, f"{unique_id}.log"),
     )
 
-    graph_gen = GraphGen(working_dir=working_dir, unique_id=unique_id, config=config)
+    graph_gen = GraphGen(working_dir=working_dir, output_path=output_path)
 
-    graph_gen.insert()
+    graph_gen.insert(read_config=config["read"], split_config=config["split"])
 
-    if config["search"]["enabled"]:
-        graph_gen.search()
+    graph_gen.search(search_config=config["search"])
 
     # Use pipeline according to the output data type
-    if output_data_type in ["atomic", "aggregated", "multi_hop"]:
-        if "quiz_and_judge_strategy" in config and config[
-            "quiz_and_judge_strategy"
-        ].get("enabled", False):
-            graph_gen.quiz()
-            graph_gen.judge()
+    if mode in ["atomic", "aggregated", "multi_hop"]:
+        logger.info("Generation mode set to '%s'. Start generation.", mode)
+        if "quiz_and_judge" in config:
+            graph_gen.quiz_and_judge(quiz_and_judge_config=config["quiz_and_judge"])
         else:
             logger.warning(
                 "Quiz and Judge strategy is disabled. Edge sampling falls back to random."
             )
-            graph_gen.traverse_strategy.edge_sampling = "random"
-        graph_gen.traverse()
-    elif output_data_type == "cot":
-        graph_gen.generate_reasoning(method_params=config["method_params"])
+            # TODO: actually switch edge sampling to "random" here; the old
+            # graph_gen.traverse_strategy.edge_sampling attribute was removed.
+    elif mode == "cot":
+        logger.info("Generation mode set to 'cot'. Start generation.")
     else:
-        raise ValueError(f"Unsupported output data type: {output_data_type}")
+        raise ValueError(f"Unsupported output data type: {mode}")
+
+    graph_gen.generate(
+        partition_config=config["partition"],
+        generate_config=config["generate"],
+    )
 
     save_config(os.path.join(output_path, "config.yaml"), config)
     logger.info("GraphGen completed successfully. Data saved to %s", output_path)
diff --git a/graphgen/graphgen.py b/graphgen/graphgen.py
@@ -1,7 +1,7 @@
 import asyncio
 import os
 import time
-from dataclasses import dataclass, field
+from dataclasses import dataclass
 from typing import Dict, cast
 
 import gradio as gr
@@ -14,7 +14,6 @@
     NetworkXStorage,
     OpenAIClient,
     Tokenizer,
-    TraverseStrategy,
 )
 from graphgen.operators import (
     chunk_documents,
@@ -40,30 +39,24 @@
 
 @dataclass
 class GraphGen:
-    unique_id: int = int(time.time())
     working_dir: str = os.path.join(sys_path, "cache")
-    config: Dict = field(default_factory=dict)
+    output_path: str = os.path.join(
+        working_dir, "data", "graphgen", str(int(time.time()))
+    )
 
     # llm
     tokenizer_instance: Tokenizer = None
     synthesizer_llm_client: OpenAIClient = None
     trainee_llm_client: OpenAIClient = None
 
-    # search
-    search_config: dict = field(
-        default_factory=lambda: {"enabled": False, "search_types": ["wikipedia"]}
-    )
-
-    # traversal
-    traverse_strategy: TraverseStrategy = None
-
     # webui
     progress_bar: gr.Progress = None
 
     def __post_init__(self):
         self.tokenizer_instance: Tokenizer = Tokenizer(
-            model_name=self.config["tokenizer"]
+            model_name=os.getenv("TOKENIZER_MODEL")
         )
+
         self.synthesizer_llm_client: OpenAIClient = OpenAIClient(
             model_name=os.getenv("SYNTHESIZER_MODEL"),
             api_key=os.getenv("SYNTHESIZER_API_KEY"),
@@ -76,12 +69,6 @@ def __post_init__(self):
             base_url=os.getenv("TRAINEE_BASE_URL"),
             tokenizer=self.tokenizer_instance,
         )
-        self.search_config = self.config["search"]
-
-        if "traverse_strategy" in self.config:
-            self.traverse_strategy = TraverseStrategy(
-                **self.config["traverse_strategy"]
-            )
 
         self.full_docs_storage: JsonKVStorage = JsonKVStorage(
             self.working_dir, namespace="full_docs"
@@ -99,24 +86,17 @@ def __post_init__(self):
             self.working_dir, namespace="rephrase"
         )
         self.qa_storage: JsonListStorage = JsonListStorage(
-            os.path.join(
-                self.working_dir,
-                "data",
-                "graphgen",
-                f"{self.unique_id}_{self.config['output_data_type']}",
-            ),
+            self.working_dir,
             namespace="qa",
         )
 
     @async_to_sync_method
-    async def insert(self):
+    async def insert(self, read_config: Dict, split_config: Dict):
         """
         insert chunks into the graph
         """
-        input_file = self.config["read"]["input_file"]
-
         # Step 1: Read files
-        data = read_files(input_file)
+        data = read_files(read_config["input_file"])
         if len(data) == 0:
             logger.warning("No data to process")
             return
@@ -141,8 +121,8 @@ async def insert(self):
 
         inserting_chunks = await chunk_documents(
             new_docs,
-            self.config["split"]["chunk_size"],
-            self.config["split"]["chunk_overlap"],
+            split_config["chunk_size"],
+            split_config["chunk_overlap"],
             self.tokenizer_instance,
             self.progress_bar,
         )
@@ -178,6 +158,7 @@ async def insert(self):
             return
 
         await self._insert_done()
+        return _add_entities_and_relations
 
     async def _insert_done(self):
         tasks = []
@@ -193,14 +174,12 @@ async def _insert_done(self):
         await asyncio.gather(*tasks)
 
     @async_to_sync_method
-    async def search(self):
+    async def search(self, search_config: Dict):
         logger.info(
-            "Search is %s", "enabled" if self.search_config["enabled"] else "disabled"
+            "Search is %s", "enabled" if search_config["enabled"] else "disabled"
         )
-        if self.search_config["enabled"]:
-            logger.info(
-                "[Search] %s ...", ", ".join(self.search_config["search_types"])
-            )
+        if search_config["enabled"]:
+            logger.info("[Search] %s ...", ", ".join(search_config["search_types"]))
             all_nodes = await self.graph_storage.get_all_nodes()
             all_nodes_names = [node[0] for node in all_nodes]
             new_search_entities = await self.full_docs_storage.filter_keys(
@@ -210,7 +189,7 @@ async def search(self):
                 "[Search] Found %d entities to search", len(new_search_entities)
             )
             _add_search_data = await search_all(
-                search_types=self.search_config["search_types"],
+                search_types=search_config["search_types"],
                 search_entities=new_search_entities,
             )
             if _add_search_data:
@@ -230,27 +209,37 @@ async def search(self):
                 await self.insert()
 
     @async_to_sync_method
-    async def quiz(self):
-        max_samples = self.config["quiz_and_judge_strategy"]["quiz_samples"]
+    async def quiz_and_judge(self, quiz_and_judge_config: Dict):
+        if quiz_and_judge_config is None or not quiz_and_judge_config.get(
+            "enabled", False
+        ):
+            logger.warning("Quiz and Judge is not used in this pipeline.")
+            return
+        max_samples = quiz_and_judge_config["quiz_samples"]
         await quiz(
             self.synthesizer_llm_client,
             self.graph_storage,
             self.rephrase_storage,
             max_samples,
         )
-        await self.rephrase_storage.index_done_callback()
 
-    @async_to_sync_method
-    async def judge(self):
-        re_judge = self.config["quiz_and_judge_strategy"]["re_judge"]
+        # TODO: assert trainee_llm_client is valid before judge
+        re_judge = quiz_and_judge_config["re_judge"]
         _update_relations = await judge_statement(
             self.trainee_llm_client,
             self.graph_storage,
             self.rephrase_storage,
             re_judge,
         )
+        await self.rephrase_storage.index_done_callback()
         await _update_relations.index_done_callback()
 
+    @async_to_sync_method
+    async def generate(self, partition_config: Dict, generate_config: Dict):
+        # Step 1: partition the graph
+        # TODO: implement graph partitioning, e.g. Partitioner().partition(self.graph_storage)
+        pass
+
     @async_to_sync_method
     async def traverse(self):
         output_data_type = self.config["output_data_type"]
diff --git a/graphgen/models/tokenizer/__init__.py b/graphgen/models/tokenizer/__init__.py

Original file line number	Diff line number	Diff line change
`@@ -1,3 +1,4 @@`
	`1`	`+TOKENIZER_MODEL=`
`1`	`2`	`SYNTHESIZER_MODEL=`
`2`	`3`	`SYNTHESIZER_BASE_URL=`
`3`	`4`	`SYNTHESIZER_API_KEY=`