Commit 7ae7955

feat(graphgen): multi-sample when judging
1 parent ca191f3 commit 7ae7955

9 files changed: +143, -55 lines

baselines/Genie/genie.py

Lines changed: 16 additions & 10 deletions
@@ -1,17 +1,20 @@
 # https://arxiv.org/pdf/2401.14367
+
 import os
 import json
 import argparse
 import asyncio
-
+from typing import List
 from dataclasses import dataclass
+from tqdm.asyncio import tqdm as tqdm_async
 from dotenv import load_dotenv
+
 from models import OpenAIModel
-from typing import List
 from utils import create_event_loop, compute_content_hash
-from tqdm.asyncio import tqdm as tqdm_async
 
-PROMPT_TEMPLATE = '''Instruction: Given the next [document], create a [question] and [answer] pair that are grounded in the main point of the document, don't add any additional information that is not in the document. The [question] is by an information-seeking user and the [answer] is provided by a helping AI Agent.
+PROMPT_TEMPLATE = '''Instruction: Given the next [document], create a [question] and [answer] pair that are grounded \
+in the main point of the document, don't add any additional information that is not in the document. The [question] is \
+by an information-seeking user and the [answer] is provided by a helping AI Agent.
 
 [document]: Scrumptious Sweet Co. factory ...
 
@@ -23,13 +26,16 @@
 
 ### Response:
 [question]: What is the plot of the show Schitt's Creek?
-[answer]: The show Schitt's Creek is about a wealthy family who loses their fortune and is forced to rebuild their lives in a small town. The show follows the family as they adjust to their new life in the town and learn to appreciate the simple things in life.
+[answer]: The show Schitt's Creek is about a wealthy family who loses their fortune and is forced to rebuild their \
+lives in a small town. The show follows the family as they adjust to their new life in the town and learn to \
+appreciate the simple things in life.
 
 [document]: 2016's countdown broke several Hottest 100 records ...
 
 ### Response:
 [question]: What was the most popular song on the 2016 Hottest 100?
-[answer]: The most popular song on the 2016 Hottest 100 was "Never Be Like You" by Flume. This was the first time that an electronic dance music producer topped the countdown.
+[answer]: The most popular song on the 2016 Hottest 100 was "Never Be Like You" by Flume. This was the first time that \
+an electronic dance music producer topped the countdown.
 
 [document]: In Greek mythology, Persephone ...
 
@@ -79,7 +85,7 @@ async def process_chunk(content: str):
                 'question': question,
                 'answer': answer
             }
-        except Exception as e:
+        except Exception as e: # pylint: disable=broad-except
            print(f"Error: {e}")
    return final_results
 
@@ -112,15 +118,15 @@ async def process_chunk(content: str):
    genie = Genie(llm_client=llm_client)
 
    if args.data_type == 'raw':
-        with open(args.input_file, "r") as f:
+        with open(args.input_file, "r", encoding='utf-8') as f:
            data = [json.loads(line) for line in f]
        data = [[chunk] for chunk in data]
    elif args.data_type == 'chunked':
-        with open(args.input_file, "r") as f:
+        with open(args.input_file, "r", encoding='utf-8') as f:
            data = json.load(f)
 
    results = genie.generate(data)
 
    # Save results
-    with open(args.output_file, "w") as f:
+    with open(args.output_file, "w", encoding='utf-8') as f:
        json.dump(results, f, indent=4, ensure_ascii=False)

generate.py

Lines changed: 1 addition & 1 deletion
@@ -73,7 +73,7 @@
 
     graph_gen.insert(data, args.data_type)
 
-    graph_gen.judge(re_judge=False)
+    graph_gen.judge(re_judge=True, max_samples=3)
 
     graph_gen.traverse()
     with open(os.path.join(sys_path, "cache", "configs", f"graphgen_{unique_id}.yaml"), "w", encoding='utf-8') as f:
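Note on what max_samples=3 implies, based on the sampling loop in the judging file further down: each edge contributes the original description plus max_samples - 1 paraphrases (ground truth "yes") and max_samples negations (ground truth "no"), before de-duplication. A quick count, plain arithmetic for illustration only:

max_samples = 3
n_yes = 1 + (max_samples - 1)   # original description + extra paraphrases
n_no = max_samples              # one negated rephrasing per sample
total = n_yes + n_no            # up to 6 judged statements per edge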

graphgen/graphgen.py

Lines changed: 11 additions & 5 deletions
@@ -32,6 +32,9 @@ class GraphGen:
     graph_storage: NetworkXStorage = NetworkXStorage(
         working_dir, namespace="graph"
     )
+    rephrase_storage: JsonKVStorage = JsonKVStorage(
+        working_dir, namespace="rephrase"
+    )
     qa_storage: JsonKVStorage = JsonKVStorage(
         os.path.join(working_dir, "data", "graphgen"), namespace=f"qa-{unique_id}"
     )
@@ -159,19 +162,22 @@ async def _insert_done(self):
             tasks.append(cast(StorageNameSpace, storage_instance).index_done_callback())
         await asyncio.gather(*tasks)
 
-    def judge(self, re_judge=False):
+    def judge(self, re_judge=False, max_samples=1):
         loop = create_event_loop()
-        loop.run_until_complete(self.async_judge(re_judge))
+        loop.run_until_complete(self.async_judge(re_judge, max_samples))
 
-    async def async_judge(self, re_judge=False):
-        _update_relations = await judge_relations(self.teacher_llm_client, self.student_llm_client, self.graph_storage, re_judge)
+    async def async_judge(self, re_judge=False, max_samples=1):
+        _update_relations = await judge_relations(self.teacher_llm_client, self.student_llm_client,
+                                                  self.graph_storage, self.rephrase_storage, re_judge, max_samples)
         await _update_relations.index_done_callback()
+        await self.rephrase_storage.index_done_callback()
 
     def traverse(self):
         loop = create_event_loop()
         loop.run_until_complete(self.async_traverse())
 
     async def async_traverse(self):
-        results = await traverse_graph_by_edge(self.teacher_llm_client, self.tokenizer_instance, self.graph_storage, self.traverse_strategy)
+        results = await traverse_graph_by_edge(self.teacher_llm_client, self.tokenizer_instance,
+                                               self.graph_storage, self.traverse_strategy)
         await self.qa_storage.upsert(results)
         await self.qa_storage.index_done_callback()
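The new rephrase_storage gives the judging pass a persistent cache: sampled rephrasings are keyed by the original edge description, so a later run with re_judge=True reuses them instead of re-querying the teacher model. A minimal sketch of the pattern, with method names taken from this diff; sample_rephrasings is a hypothetical stand-in for the teacher-model sampling loop:

async def judge_with_cache(rephrase_storage, description: str):
    # Reuse previously sampled rephrasings when re-judging (pattern sketch).
    cached = await rephrase_storage.get_by_id(description)
    if not cached:
        cached = await sample_rephrasings(description)    # hypothetical helper
        await rephrase_storage.upsert({description: cached})
    return cached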
Lines changed: 44 additions & 25 deletions
@@ -1,24 +1,28 @@
+import math
 import asyncio
-from models import NetworkXStorage
-from utils import logger, yes_no_loss, detect_main_language
-from templates import ANTI_DESCRIPTION_REPHRASING_PROMPT, STATEMENT_JUDGEMENT_PROMPT
-from models import OpenAIModel
 from tqdm.asyncio import tqdm as tqdm_async
+from models import NetworkXStorage, OpenAIModel, JsonKVStorage
+from utils import logger, yes_no_loss_entropy, detect_main_language
+from templates import DESCRIPTION_REPHRASING_PROMPT, STATEMENT_JUDGEMENT_PROMPT
 
 
 async def judge_relations(
         teacher_llm_client: OpenAIModel,
         student_llm_client: OpenAIModel,
         graph_storage: NetworkXStorage,
+        rephrase_storage: JsonKVStorage,
         re_judge: bool = False,
+        max_samples: int = 1,
         max_concurrent: int = 1000) -> NetworkXStorage:
     """
     Get all edges and judge them
 
     :param teacher_llm_client: generate statements
     :param student_llm_client: judge the statements to get comprehension loss
     :param graph_storage: graph storage instance
+    :param rephrase_storage: rephrase storage instance
     :param re_judge: re-judge the relations
+    :param max_samples: max samples for each edge
     :param max_concurrent: max concurrent
     :return:
     """
@@ -38,33 +42,48 @@ async def _judge_single_relation(
            return source_id, target_id, edge_data
 
        description = edge_data["description"]
-
        language = "English" if detect_main_language(description) == "en" else "Chinese"
 
        try:
-            anti_description = await teacher_llm_client.generate_answer(
-                ANTI_DESCRIPTION_REPHRASING_PROMPT[language]['TEMPLATE'].format(input_sentence=description)
-            )
-
-            judgement = await student_llm_client.generate_topk_per_token(
-                STATEMENT_JUDGEMENT_PROMPT['TEMPLATE'].format(statement=description)
-            )
-            anti_judgement = await student_llm_client.generate_topk_per_token(
-                STATEMENT_JUDGEMENT_PROMPT['TEMPLATE'].format(statement=anti_description)
-            )
-
-            loss = yes_no_loss(
-                [judgement[0].top_candidates, anti_judgement[0].top_candidates],
-                ['yes', 'no']
-            )
-
-            logger.info(f"Edge {source_id} -> {target_id} description: {description} loss: {loss}")
+            # If the description is already in rephrase_storage, reuse it directly
+            descriptions = await rephrase_storage.get_by_id(description)
+            if not descriptions:
+                # Sample multiple times and average
+                descriptions = [(description, 'yes')]
+                for i in range(max_samples):
+                    if i > 0:
+                        new_description = await teacher_llm_client.generate_answer(
+                            DESCRIPTION_REPHRASING_PROMPT[language]['TEMPLATE'].format(input_sentence=description),
+                            temperature=1
+                        )
+                        descriptions.append((new_description, 'yes'))
+                    new_anti_description = await teacher_llm_client.generate_answer(
+                        DESCRIPTION_REPHRASING_PROMPT[language]['ANTI_TEMPLATE'].format(input_sentence=description),
+                        temperature=1
+                    )
+                    descriptions.append((new_anti_description, 'no'))
+
+                descriptions = list(set(descriptions))
+
+            await rephrase_storage.upsert({description: descriptions})
+
+            judgements = []
+            gts = [gt for _, gt in descriptions]
+            for description, gt in descriptions:
+                judgement = await student_llm_client.generate_topk_per_token(
+                    STATEMENT_JUDGEMENT_PROMPT['TEMPLATE'].format(statement=description)
+                )
+                judgements.append(judgement[0].top_candidates)
+
+            loss = yes_no_loss_entropy(judgements, gts)
+
+            logger.info("Edge %s -> %s description: %s loss: %s", source_id, target_id, description, loss)
 
            edge_data["loss"] = loss
-        except Exception as e:
+        except Exception as e: # pylint: disable=broad-except
            logger.error(f"Error in judging relation {source_id} -> {target_id}: {e}")
            logger.info("Use default loss 0.1")
-            edge_data["loss"] = 0.1
+            edge_data["loss"] = -math.log(0.1)
 
        await graph_storage.update_edge(source_id, target_id, edge_data)
        return source_id, target_id, edge_data
@@ -77,6 +96,6 @@ async def _judge_single_relation(
            total=len(edges),
            desc="Judging relations"
    ):
-        results.append(await result)
+        results.append(await result)
 
    return graph_storage
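The net effect: instead of one statement/negation pair per edge, the student model now judges a whole set of sampled statements, and the edge loss is aggregated over them. That is also why the fallback changes from 0.1 to -math.log(0.1): the default now sits on the same negative-log-probability scale as the computed loss. A minimal standalone sketch of that aggregation, assuming the entropy loss is the mean -log probability assigned to the correct yes/no answer; the names here are illustrative, not the repo's API:

import math

def edge_loss(p_yes_per_statement, ground_truths):
    # Mean -log p(correct answer) over all sampled statements for one edge.
    losses = []
    for p_yes, gt in zip(p_yes_per_statement, ground_truths):
        p_correct = p_yes if gt == "yes" else 1.0 - p_yes
        losses.append(-math.log(max(p_correct, 1e-9)))  # clamp to avoid log(0)
    return sum(losses) / len(losses)

# max_samples=3: original + 2 paraphrases ("yes") and 3 negations ("no")
print(edge_loss([0.9, 0.8, 0.85, 0.2, 0.1, 0.3], ["yes"] * 3 + ["no"] * 3))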

models/llm/openai_model.py

Lines changed: 5 additions & 1 deletion
@@ -71,6 +71,9 @@ async def generate_topk_per_token(self, text: str, history: Optional[List[str]]
         kwargs["logprobs"] = True
         kwargs["top_logprobs"] = self.topk_per_token
 
+        # Limit max_tokens to 2 to avoid long completions
+        kwargs["max_tokens"] = 2
+
         completion = await self.client.chat.completions.create(
             model=self.model_name,
             **kwargs
@@ -85,8 +88,9 @@
         wait=wait_exponential(multiplier=1, min=4, max=10),
         retry=retry_if_exception_type((RateLimitError, APIConnectionError, APITimeoutError)),
     )
-    async def generate_answer(self, text: str, history: Optional[List[str]] = None) -> str:
+    async def generate_answer(self, text: str, history: Optional[List[str]] = None, temperature: int = 0) -> str:
         kwargs = self._pre_generate(text, history)
+        kwargs["temperature"] = temperature
 
         completion = await self.client.chat.completions.create(
             model=self.model_name,
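Two small but load-bearing changes here: capping generate_topk_per_token at two tokens keeps the judgement completion to essentially the bare yes/no token whose top-k logprobs feed the loss, and the new temperature parameter lets the judging code request diverse paraphrases (temperature=1) while answer generation stays deterministic by default (temperature=0). A sketch of the OpenAI-style logprob plumbing this relies on, to run inside an async function; the model name is a placeholder:

import math

async def p_yes_no(client, prompt: str) -> dict:
    completion = await client.chat.completions.create(
        model="<judge-model>",      # placeholder, not from the repo
        messages=[{"role": "user", "content": prompt}],
        logprobs=True,
        top_logprobs=5,
        max_tokens=2,               # room for just the bare "yes"/"no" token
    )
    # Top alternatives for the first generated token, as probabilities
    first = completion.choices[0].logprobs.content[0]
    return {c.token: math.exp(c.logprob) for c in first.top_logprobs}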

templates/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,7 +1,7 @@
 from .kg_extraction import KG_EXTRACTION_PROMPT
 from .kg_summarization import KG_SUMMARIZATION_PROMPT
 from .search_judgement import SEARCH_JUDGEMENT_PROMPT
-from .anti_description_rephrasing import ANTI_DESCRIPTION_REPHRASING_PROMPT
+from .description_rephrasing import DESCRIPTION_REPHRASING_PROMPT
 from .statement_judgement import STATEMENT_JUDGEMENT_PROMPT
 from .answer_rephrasing import ANSWER_REPHRASING_PROMPT
 from .question_generation import QUESTION_GENERATION_PROMPT
Lines changed: 58 additions & 3 deletions
@@ -1,4 +1,4 @@
-TEMPLATE_EN: str = """-Goal-
+ANTI_TEMPLATE_EN: str = """-Goal-
 Transform the input sentence into its opposite meaning while:
 
 1. Preserving most of the original sentence structure
@@ -25,7 +25,7 @@
 Output:
 """
 
-TEMPLATE_ZH: str = """-目标-
+ANTI_TEMPLATE_ZH: str = """-目标-
 将输入句子转换为相反含义的句子,同时:
 
 1. 保留大部分原始句子结构
@@ -52,11 +52,66 @@
 输出:
 """
 
-ANTI_DESCRIPTION_REPHRASING_PROMPT= {
+TEMPLATE_ZH: str = """-目标-
+将输入句子转换为相同含义的句子,同时:
+
+1. 保留大部分原始句子结构
+2. 仅更改影响核心含义的关键词
+3. 保持相同的语气和风格
+4. 输出句子应该流畅且语法正确
+
+################
+-示例-
+################
+输入:
+明亮的阳光让每个人都感到充满活力和快乐。
+
+输出:
+明媚的阳光让每个人都感受到活力与快乐。
+
+################
+-真实数据-
+################
+输入:
+{input_sentence}
+################
+输出:
+"""
+
+TEMPLATE_EN: str = """-Goal-
+Transform the input sentence into a sentence with the same meaning while:
+
+1. Preserving most of the original sentence structure
+2. Changing only key words that affect the core meaning
+3. Maintaining the same tone and style
+4. The output sentence should be fluent and grammatically correct
+
+################
+-Examples-
+################
+Input:
+The bright sunshine made everyone feel energetic and happy.
+
+Output:
+The bright sunshine made everyone feel energetic and joyful.
+
+################
+-Real Data-
+################
+Input:
+{input_sentence}
+################
+Output:
+"""
+
+
+DESCRIPTION_REPHRASING_PROMPT= {
     "English": {
+        "ANTI_TEMPLATE": ANTI_TEMPLATE_EN,
         "TEMPLATE": TEMPLATE_EN
     },
     "Chinese": {
+        "ANTI_TEMPLATE": ANTI_TEMPLATE_ZH,
         "TEMPLATE": TEMPLATE_ZH
     }
 }
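The rename reflects the new structure: one dict now exposes both a meaning-preserving TEMPLATE and a meaning-inverting ANTI_TEMPLATE per language. Usage, as in the judging code above; the input sentence is the example taken from the template itself:

sentence = "The bright sunshine made everyone feel energetic and happy."
paraphrase_prompt = DESCRIPTION_REPHRASING_PROMPT["English"]["TEMPLATE"].format(
    input_sentence=sentence)       # ground truth stays "yes"
negation_prompt = DESCRIPTION_REPHRASING_PROMPT["English"]["ANTI_TEMPLATE"].format(
    input_sentence=sentence)       # ground truth flips to "no"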

utils/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -5,5 +5,5 @@
                     load_json, write_json)
 from .hash import compute_content_hash, compute_args_hash
 from .detect_lang import detect_main_language, detect_if_chinese
-from .calculate_confidence import yes_no_loss
+from .calculate_confidence import yes_no_loss_entropy
 from .help_nltk import NLTKHelper

utils/calculate_confidence.py

Lines changed: 6 additions & 8 deletions
@@ -10,30 +10,29 @@ def preprocess_tokens(tokens: List[Token]) -> List[Token]:
 def joint_probability(tokens: List[Token]) -> float:
     """Calculate joint probability of a list of tokens."""
     tokens = preprocess_tokens(tokens)
-    logprob_sum = sum([x.logprob for x in tokens])
+    logprob_sum = sum(x.logprob for x in tokens)
     return math.exp(logprob_sum / len(tokens))
 
 def min_prob(tokens: List[Token]) -> float:
     """Calculate the minimum probability of a list of tokens."""
     tokens = preprocess_tokens(tokens)
-    return min([x.prob for x in tokens])
+    return min(x.prob for x in tokens)
 
 def average_prob(tokens: List[Token]) -> float:
     """Calculate the average probability of a list of tokens."""
     tokens = preprocess_tokens(tokens)
-    return sum([x.prob for x in tokens]) / len(tokens)
+    return sum(x.prob for x in tokens) / len(tokens)
 
 def average_confidence(tokens: List[Token]) -> float:
     """Calculate the average confidence of a list of tokens."""
     tokens = preprocess_tokens(tokens)
-    confidence = [x.prob / sum([y.prob for y in x.top_candidates[:5]]) for x in tokens]
+    confidence = [x.prob / sum(y.prob for y in x.top_candidates[:5]) for x in tokens]
     return sum(confidence) / len(tokens)
 
 def yes_no_loss(tokens_list: List[List[Token]], ground_truth: List[str]) -> float:
     """Calculate the loss for yes/no question."""
     losses = []
-    for i in range(len(tokens_list)):
-        tokens = tokens_list[i]
+    for i, tokens in enumerate(tokens_list):
         token = tokens[0]
         assert token.text in ["yes", "no"]
         if token.text == ground_truth[i]:
@@ -45,8 +44,7 @@ def yes_no_loss(tokens_list: List[List[Token]], ground_truth: List[str]) -> float:
 def yes_no_loss_entropy(tokens_list: List[List[Token]], ground_truth: List[str]) -> float:
     """Calculate the loss for yes/no question using entropy."""
     losses = []
-    for i in range(len(tokens_list)):
-        tokens = tokens_list[i]
+    for i, tokens in enumerate(tokens_list):
         token = tokens[0]
         assert token.text in ["yes", "no"]
         if token.text == ground_truth[i]:
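The hunk cuts off mid-function, so the branch bodies of yes_no_loss_entropy are not shown. A hypothetical completion, consistent with what is visible and with the -math.log(0.1) fallback in the judging code; Token is the type imported at the top of the module, and the real implementation may differ:

import math

def yes_no_loss_entropy(tokens_list, ground_truth):
    """Mean -log p(correct answer): low when the student is confidently right."""
    losses = []
    for i, tokens in enumerate(tokens_list):
        token = tokens[0]
        assert token.text in ["yes", "no"]
        if token.text == ground_truth[i]:
            losses.append(-math.log(token.prob))                  # right answer
        else:
            losses.append(-math.log(max(1 - token.prob, 1e-9)))   # wrong answer
    return sum(losses) / len(losses)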
