
Commit c4093df

Refactor/refactor generators (#137)

* fix: change cache_dir in read operator to working_dir
* refactor: use xml format prompt in Generators
* feat: change temperature & max_token in vllmwrapper
* Update graphgen/models/generator/vqa_generator.py

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent 27733a4 commit c4093df

12 files changed: +203 / -130 lines changed

graphgen/models/generator/aggregated_generator.py

Lines changed: 21 additions & 16 deletions
@@ -1,4 +1,5 @@
-from typing import Any
+import re
+from typing import Any, Optional
 
 from graphgen.bases import BaseGenerator
 from graphgen.templates import AGGREGATED_GENERATION_PROMPT
@@ -56,19 +57,21 @@ def build_prompt(
         return prompt
 
     @staticmethod
-    def parse_rephrased_text(response: str) -> str:
+    def parse_rephrased_text(response: str) -> Optional[str]:
         """
         Parse the rephrased text from the response.
         :param response:
         :return: rephrased text
         """
-        if "Rephrased Text:" in response:
-            rephrased_text = response.split("Rephrased Text:")[1].strip()
-        elif "重述文本:" in response:
-            rephrased_text = response.split("重述文本:")[1].strip()
+        rephrased_match = re.search(
+            r"<rephrased_text>(.*?)</rephrased_text>", response, re.DOTALL
+        )
+        if rephrased_match:
+            rephrased_text = rephrased_match.group(1).strip()
         else:
-            rephrased_text = response.strip()
-        return rephrased_text.strip('"')
+            logger.warning("Failed to parse rephrased text from response: %s", response)
+            return None
+        return rephrased_text.strip('"').strip("'")
 
     @staticmethod
     def _build_prompt_for_question_generation(answer: str) -> str:
@@ -85,15 +88,13 @@ def _build_prompt_for_question_generation(answer: str) -> str:
 
     @staticmethod
     def parse_response(response: str) -> dict:
-        if response.startswith("Question:"):
-            question = response[len("Question:") :].strip()
-        elif response.startswith("问题:"):
-            question = response[len("问题:") :].strip()
+        question_match = re.search(r"<question>(.*?)</question>", response, re.DOTALL)
+        if question_match:
+            question = question_match.group(1).strip()
         else:
-            question = response.strip()
-        return {
-            "question": question,
-        }
+            logger.warning("Failed to parse question from response: %s", response)
+            return {"question": ""}
+        return {"question": question.strip('"').strip("'")}
 
     async def generate(
         self,
@@ -110,9 +111,13 @@ async def generate(
         rephrasing_prompt = self.build_prompt(batch)
         response = await self.llm_client.generate_answer(rephrasing_prompt)
         context = self.parse_rephrased_text(response)
+        if not context:
+            return result
         question_generation_prompt = self._build_prompt_for_question_generation(context)
         response = await self.llm_client.generate_answer(question_generation_prompt)
         question = self.parse_response(response)["question"]
+        if not question:
+            return result
         logger.debug("Question: %s", question)
         logger.debug("Answer: %s", context)
         qa_pairs = {
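The practical effect of the tag-based parsing is easiest to see in isolation. Below is a minimal, self-contained sketch that mirrors the refactored `parse_rephrased_text` (the sample responses are hypothetical, and the logger call is dropped for brevity):

```python
import re
from typing import Optional

def parse_rephrased_text(response: str) -> Optional[str]:
    # Mirrors the refactored parser: pull the <rephrased_text> block out of
    # the LLM response, or return None so the caller can skip the batch.
    match = re.search(r"<rephrased_text>(.*?)</rephrased_text>", response, re.DOTALL)
    if not match:
        return None
    return match.group(1).strip().strip('"').strip("'")

# Tagged output parses; the old "Rephrased Text:" prefix no longer does.
print(parse_rephrased_text("<rephrased_text>Cells are the basic unit of life.</rephrased_text>"))
print(parse_rephrased_text("Rephrased Text: untagged output"))  # None
```

Returning `None` instead of falling back to `response.strip()` means a malformed response now aborts the batch early in `generate`, rather than propagating raw model output as the answer.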

graphgen/models/generator/atomic_generator.py

Lines changed: 10 additions & 8 deletions
@@ -1,3 +1,4 @@
+import re
 from typing import Any
 
 from graphgen.bases import BaseGenerator
@@ -29,17 +30,18 @@ def parse_response(response: str) -> dict:
         :param response:
         :return:
         """
-        if "Question:" in response and "Answer:" in response:
-            question = response.split("Question:")[1].split("Answer:")[0].strip()
-            answer = response.split("Answer:")[1].strip()
-        elif "问题:" in response and "答案:" in response:
-            question = response.split("问题:")[1].split("答案:")[0].strip()
-            answer = response.split("答案:")[1].strip()
+        question_match = re.search(r"<question>(.*?)</question>", response, re.DOTALL)
+        answer_match = re.search(r"<answer>(.*?)</answer>", response, re.DOTALL)
+
+        if question_match and answer_match:
+            question = question_match.group(1).strip()
+            answer = answer_match.group(1).strip()
         else:
             logger.warning("Failed to parse response: %s", response)
             return {}
-        question = question.strip('"')
-        answer = answer.strip('"')
+
+        question = question.strip('"').strip("'")
+        answer = answer.strip('"').strip("'")
         logger.debug("Question: %s", question)
         logger.debug("Answer: %s", answer)
         return {
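A self-contained sketch of the same pattern for question/answer pairs (hypothetical sample input; both tags must match or the parser signals failure with an empty dict):

```python
import re

def parse_qa(response: str) -> dict:
    # Mirrors the refactored parser: require both tags, otherwise return {}
    # so the caller can log a warning and drop the response.
    q = re.search(r"<question>(.*?)</question>", response, re.DOTALL)
    a = re.search(r"<answer>(.*?)</answer>", response, re.DOTALL)
    if not (q and a):
        return {}
    return {
        "question": q.group(1).strip().strip('"').strip("'"),
        "answer": a.group(1).strip().strip('"').strip("'"),
    }

print(parse_qa("<question>What is DNA?</question>\n<answer>Deoxyribonucleic acid.</answer>"))
print(parse_qa("Question: untagged"))  # {} (the warning path in the real code)
```

Note that the regex approach also collapses the two language-specific branches ("Question:" vs. "问题:") into one code path, since the tags are language-neutral.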

graphgen/models/generator/cot_generator.py

Lines changed: 20 additions & 13 deletions
@@ -1,3 +1,4 @@
+import re
 from typing import Any
 
 from graphgen.bases import BaseGenerator
@@ -67,22 +68,26 @@ def build_prompt_for_cot_generation(
 
     @staticmethod
    def parse_response(response: str) -> dict:
-        if "Question:" in response and "Reasoning-Path Design:" in response:
-            question = (
-                response.split("Question:")[1]
-                .split("Reasoning-Path Design:")[0]
-                .strip()
-            )
-            reasoning_path = response.split("Reasoning-Path Design:")[1].strip()
-        elif "问题:" in response and "推理路径设计:" in response:
-            question = response.split("问题:")[1].split("推理路径设计:")[0].strip()
-            reasoning_path = response.split("推理路径设计:")[1].strip()
+        """
+        Parse CoT template from response.
+        :param response:
+        :return: dict with question and reasoning_path
+        """
+        question_match = re.search(r"<question>(.*?)</question>", response, re.DOTALL)
+        reasoning_path_match = re.search(
+            r"<reasoning_path>(.*?)</reasoning_path>", response, re.DOTALL
+        )
+
+        if question_match and reasoning_path_match:
+            question = question_match.group(1).strip()
+            reasoning_path = reasoning_path_match.group(1).strip()
         else:
-            logger.warning("Failed to parse CoT template: %s", response)
+            logger.warning("Failed to parse response: %s", response)
             return {}
 
-        question = question.strip('"')
-        reasoning_path = reasoning_path.strip('"')
+        question = question.strip('"').strip("'")
+        reasoning_path = reasoning_path.strip('"').strip("'")
+
         logger.debug("CoT Question: %s", question)
         logger.debug("CoT Reasoning Path: %s", reasoning_path)
         return {
@@ -105,6 +110,8 @@ async def generate(
         prompt = self.build_prompt(batch)
         response = await self.llm_client.generate_answer(prompt)
         response = self.parse_response(response)
+        if not response:
+            return result
         question, reasoning_path = response["question"], response["reasoning_path"]
         prompt = self.build_prompt_for_cot_generation(batch, question, reasoning_path)
         cot_answer = await self.llm_client.generate_answer(prompt)
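The CoT parser follows the same tag-extraction pattern sketched after the atomic generator above, with `<reasoning_path>` in place of `<answer>`. The added early return in `generate` matters here: previously a parse failure returned `{}` and the subsequent `response["question"]` lookup raised a `KeyError`; now the batch is skipped cleanly.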

graphgen/models/generator/multi_hop_generator.py

Lines changed: 10 additions & 8 deletions
@@ -1,3 +1,4 @@
+import re
 from typing import Any
 
 from graphgen.bases import BaseGenerator
@@ -32,17 +33,18 @@ def build_prompt(
 
     @staticmethod
     def parse_response(response: str) -> dict:
-        if "Question:" in response and "Answer:" in response:
-            question = response.split("Question:")[1].split("Answer:")[0].strip()
-            answer = response.split("Answer:")[1].strip()
-        elif "问题:" in response and "答案:" in response:
-            question = response.split("问题:")[1].split("答案:")[0].strip()
-            answer = response.split("答案:")[1].strip()
+        question_match = re.search(r"<question>(.*?)</question>", response, re.DOTALL)
+        answer_match = re.search(r"<answer>(.*?)</answer>", response, re.DOTALL)
+
+        if question_match and answer_match:
+            question = question_match.group(1).strip()
+            answer = answer_match.group(1).strip()
         else:
             logger.warning("Failed to parse response: %s", response)
             return {}
-        question = question.strip('"')
-        answer = answer.strip('"')
+
+        question = question.strip('"').strip("'")
+        answer = answer.strip('"').strip("'")
         logger.debug("Question: %s", question)
         logger.debug("Answer: %s", answer)
         return {
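The multi-hop change is identical in shape to the atomic generator's: the same `<question>`/`<answer>` extraction replaces the bilingual prefix splitting, so the sketch after the atomic diff applies here as well; only the prompt template differs.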

graphgen/models/generator/vqa_generator.py

Lines changed: 16 additions & 19 deletions
@@ -1,3 +1,4 @@
+import re
 from typing import Any
 
 from graphgen.bases import BaseGenerator
@@ -38,25 +39,21 @@ def parse_response(response: str) -> Any:
         :return: QA pairs
         """
         qa_pairs = {}
-        qa_list = response.strip().split("\n\n")
-        for qa in qa_list:
-            if "Question:" in qa and "Answer:" in qa:
-                question = qa.split("Question:")[1].split("Answer:")[0].strip()
-                answer = qa.split("Answer:")[1].strip()
-            elif "问题:" in qa and "答案:" in qa:
-                question = qa.split("问题:")[1].split("答案:")[0].strip()
-                answer = qa.split("答案:")[1].strip()
-            else:
-                logger.error("Failed to parse QA pair: %s", qa)
-                continue
-            question = question.strip('"')
-            answer = answer.strip('"')
-            logger.debug("Question: %s", question)
-            logger.debug("Answer: %s", answer)
-            qa_pairs[compute_content_hash(question)] = {
-                "question": question,
-                "answer": answer,
-            }
+        pattern = r"<question>(.*?)</question>\s*<answer>(.*?)</answer>"
+        matches = re.findall(pattern, response, re.DOTALL)
+
+        if matches:
+            for question, answer in matches:
+                question = question.strip().strip('"').strip("'")
+                answer = answer.strip().strip('"').strip("'")
+                logger.debug("Question: %s", question)
+                logger.debug("Answer: %s", answer)
+                qa_pairs[compute_content_hash(question)] = {
+                    "question": question,
+                    "answer": answer,
+                }
+        else:
+            logger.warning("Error parsing the response %s", response)
         return qa_pairs
 
     async def generate(
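Because VQA responses can contain several QA pairs, this parser uses `re.findall` with a combined pattern rather than two `re.search` calls. A minimal sketch with a hypothetical two-pair response:

```python
import re

response = """<question>What is shown in the image?</question>
<answer>A plant cell under a microscope.</answer>

<question>Which organelle is highlighted?</question>
<answer>The chloroplast.</answer>"""

# Each <question> is paired with the <answer> that immediately follows it;
# re.DOTALL lets the captures span line breaks.
pattern = r"<question>(.*?)</question>\s*<answer>(.*?)</answer>"
for question, answer in re.findall(pattern, response, re.DOTALL):
    print(question.strip(), "->", answer.strip())
```

Compared with the old `split("\n\n")` loop, pairing is now driven by the tags themselves, so extra blank lines or stray prose between pairs no longer break parsing.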

graphgen/models/llm/local/vllm_wrapper.py

Lines changed: 7 additions & 5 deletions
@@ -16,7 +16,7 @@ def __init__(
         model: str,
         tensor_parallel_size: int = 1,
         gpu_memory_utilization: float = 0.9,
-        temperature: float = 0.0,
+        temperature: float = 0.6,
         top_p: float = 1.0,
         topk: int = 5,
         **kwargs: Any,
@@ -66,7 +66,7 @@ async def generate_answer(
         sp = self.SamplingParams(
             temperature=self.temperature if self.temperature > 0 else 1.0,
             top_p=self.top_p if self.temperature > 0 else 1.0,
-            max_tokens=extra.get("max_new_tokens", 512),
+            max_tokens=extra.get("max_new_tokens", 2048),
         )
 
         result_generator = self.engine.generate(full_prompt, sp, request_id=request_id)
@@ -82,7 +82,7 @@ async def generate_answer(
 
     async def generate_topk_per_token(
         self, text: str, history: Optional[List[str]] = None, **extra: Any
-    ) -> List[Token]:
+    ) -> List[Token]:
         full_prompt = self._build_inputs(text, history)
         request_id = f"graphgen_topk_{uuid.uuid4()}"
 
@@ -110,7 +110,9 @@ async def generate_topk_per_token(
 
         candidate_tokens = []
         for _, logprob_obj in top_logprobs.items():
-            tok_str = logprob_obj.decoded_token.strip() if logprob_obj.decoded_token else ""
+            tok_str = (
+                logprob_obj.decoded_token.strip() if logprob_obj.decoded_token else ""
+            )
             prob = float(math.exp(logprob_obj.logprob))
             candidate_tokens.append(Token(tok_str, prob))
 
@@ -120,7 +122,7 @@ async def generate_topk_per_token(
             main_token = Token(
                 text=candidate_tokens[0].text,
                 prob=candidate_tokens[0].prob,
-                top_candidates=candidate_tokens
+                top_candidates=candidate_tokens,
            )
             return [main_token]
         return []
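Note the interaction with the existing fallback visible in `generate_answer`: a non-positive temperature is mapped to 1.0 before reaching vLLM, so the old default of 0.0 effectively sampled at temperature 1.0, and the new 0.6 default is what actually takes effect now. A minimal sketch of the resulting sampling configuration (assumes `vllm` is installed; `SamplingParams` is vLLM's sampling-configuration class):

```python
from vllm import SamplingParams

# With the new defaults, generation is mildly stochastic and may run up to
# 2048 tokens unless the caller passes max_new_tokens explicitly.
sp = SamplingParams(temperature=0.6, top_p=1.0, max_tokens=2048)
print(sp)
```

The higher `max_tokens` ceiling matters for the XML-tagged outputs above, which tend to be longer than the old 512-token budget allowed.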

graphgen/operators/generate/generate_service.py

Lines changed: 3 additions & 0 deletions
@@ -61,6 +61,9 @@ def generate(self, items: list[dict]) -> list[dict]:
             unit="batch",
         )
 
+        # Filter out empty results
+        results = [res for res in results if res]
+
         results = self.generator.format_generation_results(
             results, output_data_format=self.data_format
         )
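This guard pairs with the generator changes above: parsers now return empty results for malformed responses, and the service drops those entries before `format_generation_results` runs instead of passing empty payloads downstream.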

graphgen/templates/generation/aggregated_generation.py

Lines changed: 33 additions & 9 deletions
@@ -132,6 +132,8 @@
 - Logical consistency throughout
 - Clear cause-and-effect relationships
 
+**Attention: Please directly provide the rephrased text without any additional content or analysis.**
+
 ################
 -ENTITIES-
 ################
@@ -175,6 +177,8 @@
 - 整体逻辑一致性
 - 清晰的因果关系
 
+**注意: 请你直接给出重述文本,不要输出任何额外的内容,也不要进行任何分析。**
+
 ################
 -实体-
 ################
@@ -191,32 +195,52 @@
 ################
 请在下方直接输出连贯的重述文本,不要输出任何额外的内容。
 
+输出格式:
+<rephrased_text>rephrased_text_here</rephrased_text>
+
 重述文本:
 """
 
 REQUIREMENT_EN = """
 ################
 Please directly output the coherent rephrased text below, without any additional content.
 
+Output format:
+<rephrased_text>rephrased_text_here</rephrased_text>
+
 Rephrased Text:
 """
 
 QUESTION_GENERATION_EN: str = """The answer to a question is provided. Please generate a question that corresponds to the answer.
 
-################
-Answer:
-{answer}
-################
+The answer for which a question needs to be generated is as follows:
+<answer>{answer}</answer>
+
+Please note the following requirements:
+1. Only output one question text without any additional explanations or analysis.
+2. Do not repeat the content of the answer or any fragments of it.
+3. The question must be independently understandable and fully match the answer.
+
+Output format:
+<question>question_text</question>
+
 Question:
 """
 
 QUESTION_GENERATION_ZH: str = """下面提供了一个问题的答案,请生成一个与答案对应的问题。
 
-################
-答案:
-{answer}
-################
-问题:
+需要生成问题的答案如下:
+<answer>{answer}</answer>
+
+请注意下列要求:
+1. 仅输出一个问题文本,不得包含任何额外说明或分析
+2. 不得重复答案内容或其中任何片段
+3. 问题必须可独立理解且与答案完全匹配
+
+输出格式:
+<question>question_text</question>
+
+问题:
 """
 
 AGGREGATED_GENERATION_PROMPT = {
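Taken together, the templates now declare the exact tags the parsers expect. A minimal end-to-end sketch (the template here is abbreviated from `QUESTION_GENERATION_EN` above, and the model reply is hypothetical):

```python
import re

# Abbreviated copy of the template; the full string lives in
# graphgen/templates/generation/aggregated_generation.py.
QUESTION_GENERATION_EN = """The answer to a question is provided. Please generate a question that corresponds to the answer.

The answer for which a question needs to be generated is as follows:
<answer>{answer}</answer>

Output format:
<question>question_text</question>

Question:
"""

prompt = QUESTION_GENERATION_EN.format(answer="Paris is the capital of France.")

# A compliant reply can be parsed deterministically, with no language-specific
# "Question:" / "问题:" prefix handling:
reply = "<question>Which city is the capital of France?</question>"
match = re.search(r"<question>(.*?)</question>", reply, re.DOTALL)
print(match.group(1))  # Which city is the capital of France?
```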
