fix(graphgen): accelerate quizzing

ChenZiHong-Gavin · ChenZiHong-Gavin · commit 1636a28f68f7 · 2025-01-14T01:27:27.000+08:00
diff --git a/generate.py b/generate.py
@@ -75,7 +75,7 @@
 
     graph_gen.quiz(max_samples=3)
 
-    graph_gen.judge(re_judge=True)
+    graph_gen.judge(re_judge=False)
 
     graph_gen.traverse()
     with open(os.path.join(sys_path, "cache", "configs", f"graphgen_{unique_id}.yaml"), "w", encoding='utf-8') as f:
diff --git a/graphgen/operators/quiz_relations.py b/graphgen/operators/quiz_relations.py
@@ -42,22 +42,37 @@ async def _quiz_single_relation(
                 if not descriptions:
                     # 多次采样，取平均
                     descriptions = [(description, 'yes')]
+
+                    new_description_tasks = []
+                    new_anti_description_tasks = []
                     for i in range(max_samples):
                         if i > 0:
-                            new_description = await teacher_llm_client.generate_answer(
-                                DESCRIPTION_REPHRASING_PROMPT[language]['TEMPLATE'].format(input_sentence=description),
+                            new_description_tasks.append(
+                                teacher_llm_client.generate_answer(
+                                    DESCRIPTION_REPHRASING_PROMPT[language]['TEMPLATE'].format(
+                                        input_sentence=description),
+                                    temperature=1
+                                )
+                            )
+                        new_anti_description_tasks.append(
+                            teacher_llm_client.generate_answer(
+                                DESCRIPTION_REPHRASING_PROMPT[language]['ANTI_TEMPLATE'].format(
+                                    input_sentence=description),
                                 temperature=1
                             )
-                            descriptions.append((new_description, 'yes'))
-                        new_anti_description = await teacher_llm_client.generate_answer(
-                            DESCRIPTION_REPHRASING_PROMPT[language]['ANTI_TEMPLATE'].format(input_sentence=description),
-                            temperature=1
                         )
+
+                    new_descriptions = await asyncio.gather(*new_description_tasks)
+                    new_anti_descriptions = await asyncio.gather(*new_anti_description_tasks)
+
+                    for new_description in new_descriptions:
+                        descriptions.append((new_description, 'yes'))
+                    for new_anti_description in new_anti_descriptions:
                         descriptions.append((new_anti_description, 'no'))
 
                     descriptions = list(set(descriptions))
             except Exception as e: # pylint: disable=broad-except
-                logger.error(f"Error when quizzing edge {source_id} -> {target_id}: {e}")
+                logger.error("Error when quizzing edge %s -> %s: %s", source_id, target_id, e)
                 descriptions = [(description, 'yes')]
 
             await rephrase_storage.upsert({description: descriptions})
diff --git a/models/llm/tokenizer.py b/models/llm/tokenizer.py
@@ -1,11 +1,12 @@
-import tiktoken
 from dataclasses import dataclass
 from typing import List
+import tiktoken
 
 try:
     from transformers import AutoTokenizer
     TRANSFORMERS_AVAILABLE = True
 except ImportError:
+    AutoTokenizer = None
     TRANSFORMERS_AVAILABLE = False
 
 
@@ -18,11 +19,11 @@ def get_tokenizer(tokenizer_name: str = "cl100k_base"):
     """
     if tokenizer_name in tiktoken.list_encoding_names():
         return tiktoken.get_encoding(tokenizer_name)
-    elif TRANSFORMERS_AVAILABLE:
+    if TRANSFORMERS_AVAILABLE:
         try:
             return AutoTokenizer.from_pretrained(tokenizer_name)
         except Exception as e:
-            raise ValueError(f"Failed to load tokenizer from Hugging Face: {e}")
+            raise ValueError(f"Failed to load tokenizer from Hugging Face: {e}") from e
     else:
         raise ValueError("Hugging Face Transformers is not available, please install it first.")
 
diff --git a/models/llm/topk_token_model.py b/models/llm/topk_token_model.py
@@ -20,10 +20,10 @@ class TopkTokenModel:
     do_sample: bool = False
     temperature: float = 0
     max_tokens: int = 10240
-    repetition_penalty: float = 1.0
+    repetition_penalty: float = 1.05
     num_beams: int = 1
     topk: int = 50
-    topp: float = 0.1
+    topp: float = 0.95
 
     topk_per_token: int = 5  # number of topk tokens to generate for each token
 
diff --git a/models/strategy/travserse_strategy.py b/models/strategy/travserse_strategy.py
@@ -18,7 +18,7 @@ class TraverseStrategy(BaseStrategy):
     # 同一层中选边的策略（如果是双向拓展，同一层指的是两边连接的边的集合）
     edge_sampling: str = "max_loss" # "max_loss" or "min_loss" or "random"
     # 孤立节点的处理策略
-    isolated_node_strategy: str = "add" # "add" or "ignore"
+    isolated_node_strategy: str = "ignore" # "add" or "ignore"
     # 难度顺序 ["easy", "medium", "hard"], ["hard", "medium", "easy"], ["medium", "medium", "medium"]
     difficulty_order: list = field(default_factory=lambda: ["easy", "medium", "hard"])