feat(templates): specify 3 levels of rephrasing difficulty

ChenZiHong-Gavin · ChenZiHong-Gavin · commit bbb66ad53f8b · 2025-01-13T21:09:36.000+08:00
diff --git a/graphgen/graphgen.py b/graphgen/graphgen.py
@@ -177,8 +177,8 @@ def judge(self, re_judge=False):
         loop.run_until_complete(self.async_judge(re_judge))
 
     async def async_judge(self, re_judge=False):
-        _update_relations = await judge_relations(self.teacher_llm_client, self.student_llm_client,
-                                                  self.graph_storage, self.rephrase_storage, re_judge)
+        _update_relations = await judge_relations(self.student_llm_client, self.graph_storage,
+                                                  self.rephrase_storage, re_judge)
         await _update_relations.index_done_callback()
 
     def traverse(self):
diff --git a/graphgen/operators/judge_relations.py b/graphgen/operators/judge_relations.py
@@ -2,12 +2,11 @@
 import asyncio
 from tqdm.asyncio import tqdm as tqdm_async
 from models import NetworkXStorage, OpenAIModel, JsonKVStorage
-from utils import logger, yes_no_loss_entropy, detect_main_language
-from templates import DESCRIPTION_REPHRASING_PROMPT, STATEMENT_JUDGEMENT_PROMPT
+from utils import logger, yes_no_loss_entropy
+from templates import STATEMENT_JUDGEMENT_PROMPT
 
 
 async def judge_relations(
-        teacher_llm_client: OpenAIModel,
         student_llm_client: OpenAIModel,
         graph_storage: NetworkXStorage,
         rephrase_storage: JsonKVStorage,
@@ -16,7 +15,6 @@ async def judge_relations(
     """
     Get all edges and judge them
 
-    :param teacher_llm_client: generate statements
     :param student_llm_client: judge the statements to get comprehension loss
     :param graph_storage: graph storage instance
     :param rephrase_storage: rephrase storage instance
diff --git a/graphgen/operators/split_graph.py b/graphgen/operators/split_graph.py
@@ -1,10 +1,9 @@
-import asyncio
 import random
-
 from collections import defaultdict
-from models import NetworkXStorage, TraverseStrategy
 from tqdm.asyncio import tqdm as tqdm_async
-from utils import logger, create_event_loop
+from utils import logger
+
+from models import NetworkXStorage, TraverseStrategy
 
 
 async def _get_node_info(
@@ -103,7 +102,7 @@ def _get_level_n_edges_by_max_tokens(
 ) -> list:
     """
     Get level n edges for an edge.
-    n is decided by max_depth in traverse_strategy
+    n is decided by max_depth in traverse_strategy.
 
     :param edge_adj_list
     :param node_dict
diff --git a/graphgen/operators/traverse_graph.py b/graphgen/operators/traverse_graph.py
@@ -49,6 +49,14 @@ async def handle_node(node: dict) -> dict:
     await graph_storage.index_done_callback()
     return new_edges, new_nodes
 
+
+def get_loss_tercile(losses: list) -> tuple:
+    """Return the values at the 1/3 and 2/3 positions of the sorted losses."""
+    if not losses:
+        return 0.0, 0.0  # nothing to rank; indexing an empty list would raise IndexError
+    ranked = sorted(losses)
+    return ranked[int(len(ranked) * (1 / 3))], ranked[int(len(ranked) * (2 / 3))]
+
 async def traverse_graph_by_edge(
     llm_client: OpenAIModel,
     tokenizer: Tokenizer,
@@ -72,6 +80,7 @@ async def traverse_graph_by_edge(
     async def _process_nodes_and_edges(
             _process_nodes: list,
             _process_edges: list,
+            _difficulty: str
     ) -> str:
         entities = [
             f"{_process_node['node_id']}: {_process_node['description']}" for _process_node in _process_nodes
@@ -85,7 +94,7 @@ async def _process_nodes_and_edges(
         relations_str = "\n".join([f"{index + 1}. {relation}" for index, relation in enumerate(relations)])
 
         language = "Chinese" if detect_main_language(entities_str + relations_str) == "zh" else "English"
-        prompt = ANSWER_REPHRASING_PROMPT[language]['TEMPLATE'].format(
+        prompt = ANSWER_REPHRASING_PROMPT[_difficulty][language]['TEMPLATE'].format(
             language=language,
             entities=entities_str,
             relationships=relations_str
@@ -105,9 +114,12 @@ async def _process_single_batch(
         _process_batch: tuple
     ) -> dict:
         async with semaphore:
+            losses = [(edge[0], edge[1], edge[2]['loss']) for edge in _process_batch[1]]
+
             context = await _process_nodes_and_edges(
                 _process_batch[0],
                 _process_batch[1],
+                _process_batch[2]
             )
 
             language = "Chinese" if detect_main_language(context) == "zh" else "English"
@@ -125,8 +137,6 @@ async def _process_single_batch(
             pre_length = sum(node['length'] for node in _process_batch[0]) \
                          + sum(edge[2]['length'] for edge in _process_batch[1])
 
-            losses = [(edge[0], edge[1], edge[2]['loss']) for edge in _process_batch[1]]
-
             logger.info("%d nodes and %d edges processed", len(_process_batch[0]), len(_process_batch[1]))
             logger.info("Pre-length: %s", pre_length)
             logger.info("Question: %s Answer: %s", question, context)
@@ -135,7 +145,8 @@ async def _process_single_batch(
                 compute_content_hash(context): {
                     "question": question,
                     "answer": context,
-                    "losses": losses
+                    "losses": losses,
+                    "difficulty": _process_batch[2],
                 }
             }
 
@@ -152,6 +163,29 @@ async def _process_single_batch(
         traverse_strategy
     )
 
+    losses = []
+    for batch in processing_batches:
+        if len(batch[1]) == 0:
+            continue
+        loss = sum(edge[2]['loss'] for edge in batch[1]) / len(batch[1])
+        losses.append(loss)
+    q1, q2 = get_loss_tercile(losses)
+
+    for i, batch in enumerate(processing_batches):
+        if len(batch[1]) == 0:
+            processing_batches[i] = (batch[0], batch[1], "easy")
+            continue
+        loss = sum(edge[2]['loss'] for edge in batch[1]) / len(batch[1])
+        if loss < q1:
+            # easy
+            processing_batches[i] = (batch[0], batch[1], "easy")
+        elif loss < q2:
+            # medium
+            processing_batches[i] = (batch[0], batch[1], "medium")
+        else:
+            # hard
+            processing_batches[i] = (batch[0], batch[1], "hard")
+
     for result in tqdm_async(asyncio.as_completed(
         [_process_single_batch(batch) for batch in processing_batches]
     ), total=len(processing_batches), desc="Processing batches"):
diff --git a/templates/answer_rephrasing.py b/templates/answer_rephrasing.py
@@ -43,10 +43,6 @@
 ################
 {relationships}
 
-################
-Please directly output the rephrased text below, without any additional content.
-
-Rephrased Text:
 """
 
 TEMPLATE_ZH: str = """---角色---
@@ -92,18 +88,92 @@
 ################
 {relationships}
 
+"""
+
+EASY_REQUIREMENT_EN = """
+---Requirements---
+- Requires a concise and straightforward summary, focusing on core meaning.
+- Uses simple language, avoiding complex sentence structures.
+- Does not need excessive details or examples; just the basic concepts and relationships.
+
+################
+Please directly output the rephrased text below, without any additional content.
+
+Rephrased Text:
+"""
+
+EASY_REQUIREMENT_ZH = """
+---要求---
+- 要求简洁明了，主要传达核心意思。
+- 使用简单的语言，避免复杂的句子结构。
+- 不需要过多的细节或示例，只需基本概念和关系。
+
+################
+请在下方直接输出重述文本，不要输出任何额外的内容。
+
+重述文本:
+"""
+
+MEDIUM_REQUIREMENT_ZH = """
 ################
 请在下方直接输出重述文本，不要输出任何额外的内容。
 
 重述文本:
 """
 
 
+MEDIUM_REQUIREMENT_EN = """
+################
+Please directly output the rephrased text below, without any additional content.
+
+Rephrased Text:
+"""
+
+HARD_REQUIREMENT_EN = """
+---Requirements---
+- Requires an in-depth exploration of complex relationships and nuances.
+- Includes detailed background information, emphasizing logical consistency and complexity.
+
+################
+Please directly output the rephrased text below, without any additional content.
+
+Rephrased Text:
+"""
+
+HARD_REQUIREMENT_ZH = """
+---要求---
+- 需要深入探讨复杂的关系和细微差别。
+- 包括详细的背景信息，强调逻辑一致性和复杂性。
+
+################
+请在下方直接输出重述文本，不要输出任何额外的内容。
+
+重述文本:
+"""
+
 ANSWER_REPHRASING_PROMPT= {
-    "English": {
-        "TEMPLATE": TEMPLATE_EN
+    "easy": {
+        "English": {
+            "TEMPLATE": TEMPLATE_EN + EASY_REQUIREMENT_EN
+        },
+        "Chinese": {
+            "TEMPLATE": TEMPLATE_ZH + EASY_REQUIREMENT_ZH
+        }
+    },
+    "medium": {
+        "English": {
+            "TEMPLATE": TEMPLATE_EN + MEDIUM_REQUIREMENT_EN
+        },
+        "Chinese": {
+            "TEMPLATE": TEMPLATE_ZH + MEDIUM_REQUIREMENT_ZH
+        }
     },
-    "Chinese": {
-        "TEMPLATE": TEMPLATE_ZH
+    "hard": {
+        "English": {
+            "TEMPLATE": TEMPLATE_EN + HARD_REQUIREMENT_EN
+        },
+        "Chinese": {
+            "TEMPLATE": TEMPLATE_ZH + HARD_REQUIREMENT_ZH
+        }
     }
 }