Commit c4ea24f

feat(graphgen): support multiple rephrasing strategies
1 parent 7d42c56 commit c4ea24f

File tree: 8 files changed (+278, -67 lines)

generate.py: 2 additions, 2 deletions

@@ -12,7 +12,7 @@
 sys_path = os.path.abspath(os.path.dirname(__file__))
 unique_id = int(time.time())
 set_logger(os.path.join(sys_path, "cache", "logs", f"graphgen_{unique_id}.log"), if_stream=False)
-config_path = os.path.join(sys_path, "cache", "configs", f"graphgen_{unique_id}.yaml")
+config_path = os.path.join(sys_path, "cache", "data", "graphgen", str(unique_id), f"config-{unique_id}.yaml")

 load_dotenv()

@@ -71,7 +71,7 @@ def save_config(global_config):

     graph_gen.insert(data, config['data_type'])

-    graph_gen.quiz(max_samples=2)
+    graph_gen.quiz(max_samples=config['quiz_samples'])

     graph_gen.judge(re_judge=False)
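Note: the quiz sample count is no longer hard-coded to 2; generate.py now reads it from the YAML config, whose path also moves under cache/data/graphgen/<unique_id>/. A minimal sketch of a config carrying the two keys this diff actually references (data_type and quiz_samples); any real GraphGen config likely carries more fields:

    # Hypothetical config sketch; only 'data_type' and 'quiz_samples' appear in this diff.
    import yaml

    example_config = "data_type: raw\nquiz_samples: 2\n"

    config = yaml.safe_load(example_config)
    assert config["quiz_samples"] == 2   # the value forwarded to graph_gen.quiz(max_samples=...)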

graphgen/graphgen.py: 12 additions, 12 deletions

@@ -36,16 +36,16 @@ class GraphGen:
         working_dir, namespace="rephrase"
     )
     qa_storage: JsonKVStorage = JsonKVStorage(
-        os.path.join(working_dir, "data", "graphgen"), namespace=f"qa-{unique_id}"
+        os.path.join(working_dir, "data", "graphgen", str(unique_id)), namespace=f"qa-{unique_id}"
     )

     # text chunking
     chunk_size: int = 1024
     chunk_overlap_size: int = 100

     # llm
-    teacher_llm_client: OpenAIModel = None
-    student_llm_client: OpenAIModel = None
+    synthesizer_llm_client: OpenAIModel = None
+    training_llm_client: OpenAIModel = None
     tokenizer_instance: Tokenizer = None

     # web search
@@ -73,7 +73,7 @@ async def async_split_chunks(self, data: Union[List[list], List[dict]], data_typ
         if len(new_docs) == 0:
             logger.warning("All docs are already in the storage")
             return {}
-        logger.info(f"[New Docs] inserting {len(new_docs)} docs")
+        logger.info("[New Docs] inserting %d docs", len(new_docs))
         for doc_key, doc in tqdm_async(
             new_docs.items(), desc="Chunking documents", unit="doc"
         ):
@@ -127,14 +127,14 @@ async def async_insert(self, data: Union[List[list], List[dict]], data_type: str

         inserting_chunks = await self.async_split_chunks(data, data_type)

-        if not len(inserting_chunks):
+        if len(inserting_chunks) == 0:
             logger.warning("All chunks are already in the storage")
             return
-        logger.info(f"[New Chunks] inserting {len(inserting_chunks)} chunks")
+        logger.info("[New Chunks] inserting %d chunks", len(inserting_chunks))

         logger.info("[Entity and Relation Extraction]...")
         _add_entities_and_relations = await extract_kg(
-            llm_client=self.teacher_llm_client,
+            llm_client=self.synthesizer_llm_client,
             kg_instance=self.graph_storage,
             tokenizer_instance=self.tokenizer_instance,
             chunks=[Chunk(id=k, content=v['content']) for k, v in inserting_chunks.items()]
@@ -147,7 +147,7 @@ async def async_insert(self, data: Union[List[list], List[dict]], data_type: str
         if self.if_web_search:
             logger.info("[Wiki Search]...")
             _add_wiki_data = await search_wikipedia(
-                llm_client= self.teacher_llm_client,
+                llm_client= self.synthesizer_llm_client,
                 wiki_search_client=self.wiki_client,
                 knowledge_graph_instance=_add_entities_and_relations
             )
@@ -169,15 +169,15 @@ def quiz(self, max_samples=1):
         loop.run_until_complete(self.async_quiz(max_samples))

     async def async_quiz(self, max_samples=1):
-        await quiz(self.teacher_llm_client, self.graph_storage, self.rephrase_storage, max_samples)
+        await quiz(self.synthesizer_llm_client, self.graph_storage, self.rephrase_storage, max_samples)
         await self.rephrase_storage.index_done_callback()

     def judge(self, re_judge=False):
         loop = create_event_loop()
         loop.run_until_complete(self.async_judge(re_judge))

     async def async_judge(self, re_judge=False):
-        _update_relations = await judge_statement(self.student_llm_client, self.graph_storage,
+        _update_relations = await judge_statement(self.training_llm_client, self.graph_storage,
                                                   self.rephrase_storage, re_judge)
         await _update_relations.index_done_callback()

@@ -186,7 +186,7 @@ def traverse(self):
         loop.run_until_complete(self.async_traverse())

     async def async_traverse(self):
-        results = await traverse_graph_by_edge(self.teacher_llm_client, self.tokenizer_instance,
-                                               self.graph_storage, self.traverse_strategy)
+        results = await traverse_graph_by_edge(self.synthesizer_llm_client, self.tokenizer_instance,
+                                               self.graph_storage, self.traverse_strategy, self.text_chunks_storage)
         await self.qa_storage.upsert(results)
         await self.qa_storage.index_done_callback()
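Note: the teacher/student naming becomes synthesizer/training throughout. The synthesizer model does the generation-side work (insert, quiz, traverse) while the training model, i.e. the student being trained, only scores statements in judge. A wiring sketch; the OpenAIModel constructor arguments and the data shape are assumptions, only the GraphGen field and method names come from this diff:

    # Hypothetical sketch; constructor arguments are assumed, field names are from the diff.
    synthesizer = OpenAIModel(model_name="big-synthesizer-model")   # builds KG, rephrases, writes QA
    trainee = OpenAIModel(model_name="small-student-model")         # model under training; scores quiz statements

    graph_gen = GraphGen(
        synthesizer_llm_client=synthesizer,
        training_llm_client=trainee,
    )
    data = [{"content": "..."}]     # shape assumed from Union[List[list], List[dict]]
    graph_gen.insert(data, "raw")   # synthesizer_llm_client
    graph_gen.quiz(max_samples=2)   # synthesizer_llm_client
    graph_gen.judge()               # training_llm_client
    graph_gen.traverse()            # synthesizer_llm_client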

graphgen/operators/traverse_graph.py: 119 additions, 37 deletions

@@ -1,7 +1,7 @@
 import asyncio
 from tqdm.asyncio import tqdm as tqdm_async

-from models import OpenAIModel, NetworkXStorage, TraverseStrategy, Tokenizer
+from models import OpenAIModel, NetworkXStorage, TraverseStrategy, Tokenizer, JsonKVStorage
 from templates import ANSWER_REPHRASING_PROMPT, QUESTION_GENERATION_PROMPT
 from utils import detect_main_language, compute_content_hash, logger
 from graphgen.operators.split_graph import get_batches_with_strategy
@@ -49,6 +49,46 @@ async def handle_node(node: dict) -> dict:
     await graph_storage.index_done_callback()
     return new_edges, new_nodes

+async def _construct_rephrasing_prompt(_process_nodes: list,
+                                       _process_edges: list,
+                                       _difficulty: str,
+                                       text_chunks_storage: JsonKVStorage,
+                                       add_context: bool = False
+                                       ) -> str:
+    entities = [
+        f"{_process_node['node_id']}: {_process_node['description']}" for _process_node in _process_nodes
+    ]
+    relations = [
+        f"{_process_edge[0]} -- {_process_edge[1]}: {_process_edge[2]['description']}"
+        for _process_edge in _process_edges
+    ]
+
+    entities_str = "\n".join([f"{index + 1}. {entity}" for index, entity in enumerate(entities)])
+    relations_str = "\n".join([f"{index + 1}. {relation}" for index, relation in enumerate(relations)])
+    language = "Chinese" if detect_main_language(entities_str + relations_str) == "zh" else "English"

+    if add_context:
+        original_ids = ([node['source_id'].split('<SEP>')[0] for node in _process_nodes] +
+                        [edge[2]['source_id'].split('<SEP>')[0] for edge in _process_edges])

+        original_ids = list(set(original_ids))
+        original_text = await text_chunks_storage.get_by_ids(original_ids)
+        original_text = "\n".join([f"{index + 1}. {text['content']}" for index, text in enumerate(original_text)])

+        prompt = ANSWER_REPHRASING_PROMPT[_difficulty][language]['CONTEXT_TEMPLATE'].format(
+            language=language,
+            original_text=original_text,
+            entities=entities_str,
+            relationships=relations_str
+        )
+        return prompt

+    prompt = ANSWER_REPHRASING_PROMPT[_difficulty][language]['TEMPLATE'].format(
+        language=language,
+        entities=entities_str,
+        relationships=relations_str
+    )
+    return prompt

 def get_loss_tercile(losses: list) -> (float, float):
     losses = sorted(losses)
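Note: _construct_rephrasing_prompt lifts the prompt assembly out of _process_nodes_and_edges (see the later hunk) and adds an add_context mode: the first <SEP>-separated source_id of every node and edge is looked up in text_chunks_storage, and the CONTEXT_TEMPLATE variant is used so the rephrasing prompt quotes original passages instead of only KG descriptions. A standalone sketch of the numbered entity/relation strings the helper builds, with toy data in place of real KG output:

    # Toy data standing in for NetworkXStorage nodes and edges.
    nodes = [{"node_id": "Alan Turing", "description": "British mathematician and logician."}]
    edges = [("Alan Turing", "Enigma", {"description": "Turing helped break the Enigma cipher."})]

    entities_str = "\n".join(f"{i + 1}. {n['node_id']}: {n['description']}" for i, n in enumerate(nodes))
    relations_str = "\n".join(f"{i + 1}. {s} -- {t}: {d['description']}" for i, (s, t, d) in enumerate(edges))

    print(entities_str)   # 1. Alan Turing: British mathematician and logician.
    print(relations_str)  # 1. Alan Turing -- Enigma: Turing helped break the Enigma cipher.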
@@ -61,11 +101,39 @@ def get_average_loss(batch: tuple) -> float:
     return sum(edge[2]['loss'] for edge in batch[1]) + sum(node['loss'] for node in batch[0]) / \
            (len(batch[0]) + len(batch[1]))

+def _post_process_synthetic_data(data):
+    block = data.split("\n\n")
+    qas = []
+    for line in block:
+        if "Question:" in line and "Answer:" in line:
+            question = line.split("Question:")[1].split("Answer:")[0].strip()
+            answer = line.split("Answer:")[1].strip()
+            qas.append({
+                "question": question,
+                "answer": answer
+            })
+        elif "问题:" in line and "答案:" in line:
+            question = line.split("问题:")[1].split("答案:")[0].strip()
+            answer = line.split("答案:")[1].strip()
+            qas.append({
+                "question": question,
+                "answer": answer
+            })
+        elif "问题:" in line and "回答:" in line:
+            question = line.split("问题:")[1].split("回答:")[0].strip()
+            answer = line.split("回答:")[1].strip()
+            qas.append({
+                "question": question,
+                "answer": answer
+            })
+    return qas

 async def traverse_graph_by_edge(
     llm_client: OpenAIModel,
     tokenizer: Tokenizer,
     graph_storage: NetworkXStorage,
     traverse_strategy: TraverseStrategy,
+    text_chunks_storage: JsonKVStorage,
     max_concurrent: int = 1000
 ) -> dict:
     """
@@ -75,6 +143,7 @@ async def traverse_graph_by_edge(
     :param tokenizer
     :param graph_storage
     :param traverse_strategy
+    :param text_chunks_storage
     :param max_concurrent
     :return: question and answer
     """
@@ -84,26 +153,15 @@
     async def _process_nodes_and_edges(
         _process_nodes: list,
         _process_edges: list,
-        _difficulty: str
+        _difficulty: str,
     ) -> str:
-        entities = [
-            f"{_process_node['node_id']}: {_process_node['description']}" for _process_node in _process_nodes
-        ]
-        relations = [
-            f"{_process_edge[0]} -- {_process_edge[1]}: {_process_edge[2]['description']}"
-            for _process_edge in _process_edges
-        ]
-
-        entities_str = "\n".join([f"{index + 1}. {entity}" for index, entity in enumerate(entities)])
-        relations_str = "\n".join([f"{index + 1}. {relation}" for index, relation in enumerate(relations)])
-
-        language = "Chinese" if detect_main_language(entities_str + relations_str) == "zh" else "English"
-        prompt = ANSWER_REPHRASING_PROMPT[_difficulty][language]['TEMPLATE'].format(
-            language=language,
-            entities=entities_str,
-            relationships=relations_str
+        prompt = await _construct_rephrasing_prompt(
+            _process_nodes,
+            _process_edges,
+            _difficulty,
+            text_chunks_storage,
+            add_context = False
         )
-
         context = await llm_client.generate_answer(prompt)

         # post-process the context
@@ -115,7 +173,8 @@ async def _process_nodes_and_edges(
         return context

     async def _process_single_batch(
-        _process_batch: tuple
+        _process_batch: tuple,
+        question_type: str = "single"
     ) -> dict:
         async with semaphore:
             context = await _process_nodes_and_edges(
@@ -125,32 +184,55 @@
             )

             language = "Chinese" if detect_main_language(context) == "zh" else "English"
-            question = await llm_client.generate_answer(
-                QUESTION_GENERATION_PROMPT[language]['TEMPLATE'].format(
-                    answer=context
-                )
-            )
-
-            if question.startswith("Question:"):
-                question = question[len("Question:"):].strip()
-            elif question.startswith("问题:"):
-                question = question[len("问题:"):].strip()
-
             pre_length = sum(node['length'] for node in _process_batch[0]) \
                          + sum(edge[2]['length'] for edge in _process_batch[1])

             logger.info("%d nodes and %d edges processed", len(_process_batch[0]), len(_process_batch[1]))
             logger.info("Pre-length: %s", pre_length)
-            logger.info("Question: %s Answer: %s", question, context)

-            return {
-                compute_content_hash(context): {
-                    "question": question,
-                    "answer": context,
+            if question_type == "single":
+                question = await llm_client.generate_answer(
+                    QUESTION_GENERATION_PROMPT[language]['SINGLE_TEMPLATE'].format(
+                        answer=context
+                    )
+                )
+                if question.startswith("Question:"):
+                    question = question[len("Question:"):].strip()
+                elif question.startswith("问题:"):
+                    question = question[len("问题:"):].strip()

+                return {
+                    compute_content_hash(context): {
+                        "question": question,
+                        "answer": context,
+                        "loss": get_average_loss(_process_batch),
+                        "difficulty": _process_batch[2],
+                    }
+                }

+            content = await llm_client.generate_answer(
+                QUESTION_GENERATION_PROMPT[language]['MULTI_TEMPLATE'].format(
+                    doc=context
+                )
+            )
+            qas = _post_process_synthetic_data(content)

+            if len(qas) == 0:
+                print(content)
+                logger.error("Error occurred while processing batch, question or answer is None")
+                return {}

+            final_results = {}
+            for qa in qas:
+                logger.info("Question: %s", qa['question'])
+                logger.info("Answer: %s", qa['answer'])
+                final_results[compute_content_hash(qa['question'])] = {
+                    "question": qa['question'],
+                    "answer": qa['answer'],
                     "loss": get_average_loss(_process_batch),
                     "difficulty": _process_batch[2],
                 }
-            }
+            return final_results

     results = {}
     edges = list(await graph_storage.get_all_edges())
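Note: _process_single_batch now branches on question_type. The "single" path keeps the old flow, one question per rephrased answer via SINGLE_TEMPLATE, keyed by the hash of the answer; the multi path asks MULTI_TEMPLATE for several QA pairs per context and keys each by the hash of its question. The extra traverse_graph_by_edge parameter ripples up to GraphGen.async_traverse; a sketch of the updated call, with argument order taken from the new signature:

    # Mirrors GraphGen.async_traverse after this commit; gg stands in for a
    # configured GraphGen instance (an assumption about the call site).
    async def run_traverse(gg):
        results = await traverse_graph_by_edge(
            gg.synthesizer_llm_client,
            gg.tokenizer_instance,
            gg.graph_storage,
            gg.traverse_strategy,
            gg.text_chunks_storage,   # new argument introduced by this commit
        )
        await gg.qa_storage.upsert(results)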

models/storage/json_storage.py: 1 addition, 1 deletion

@@ -23,7 +23,7 @@ async def index_done_callback(self):
    async def get_by_id(self, id):
        return self._data.get(id, None)

-   async def get_by_ids(self, ids, fields=None):
+   async def get_by_ids(self, ids, fields=None) -> list:
        if fields is None:
            return [self._data.get(id, None) for id in ids]
        return [
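Note: only the return annotation changes here, but the contract matters for the new add_context path: get_by_ids keeps a None placeholder for every id it cannot find, and _construct_rephrasing_prompt then formats text['content'] for each element, so an unresolved chunk id would raise a TypeError there. A minimal illustration of the placeholder behavior with a toy storage dict:

    # Toy stand-in for JsonKVStorage._data.
    _data = {"chunk-1": {"content": "original passage"}}
    ids = ["chunk-1", "chunk-missing"]
    texts = [_data.get(i, None) for i in ids]   # what get_by_ids returns when fields is None
    # texts[1] is None, so formatting text['content'] over all elements would fail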

scripts/generate.sh: 1 addition, 3 deletions

@@ -1,3 +1 @@
-python3 generate.py --input_file resources/examples/raw_demo.jsonl \
-                    --data_type raw \
-# --web_search
+python3 generate.py --config_file configs/graphgen_config.yaml

scripts/judge.sh: 1 addition, 0 deletions

@@ -0,0 +1 @@
+python3 evaluate.py --output cache/output/new_graph.graphml \
