
Commit cac7942

refactor(baselines): refactor some logic
1 parent 74bfcce · commit cac7942

5 files changed: +53 additions, −123 deletions


baselines/EntiGraph/entigraph.py

Lines changed: 40 additions & 28 deletions
```diff
@@ -3,15 +3,15 @@
 
 import os
 import json
+import random
 import asyncio
 import argparse
 from hashlib import md5
-
-from .inference.devapi import gptqa
-from .tasks.baseline_task import BaselineTask
-import random
 from tqdm.asyncio import tqdm as tqdm_async
 
+from baselines.EntiGraph.inference.devapi import gptqa
+from baselines.EntiGraph.tasks.baseline_task import BaselineTask
+
 
 def compute_content_hash(content, prefix: str = ""):
     return prefix + md5(content.encode()).hexdigest()
@@ -37,7 +37,7 @@ async def generate_entities(document_content: str,
             response = json.loads(completion)
             can_read_entities = response['entities']
             return response
-        except Exception as e:
+        except Exception as e:  # pylint: disable=broad-except
             print(f"Failed to generate entities: {str(e)}")
             max_tries -= 1
 
@@ -101,17 +101,21 @@ async def generate_synthetic_data_for_document(input_file, data_type):
 
     async def generate_document_entities(doc):
         async with semaphore:
-            entities = await generate_entities(
-                doc.text,
-                task.openai_system_generate_entities,
-                model_name)
-            if not entities:
+            try:
+                entities = await generate_entities(
+                    doc.text,
+                    task.openai_system_generate_entities,
+                    model_name)
+                if not entities:
+                    return None
+                return {
+                    'document': doc.text,
+                    'entities': entities['entities'],
+                    'summary': entities['summary']
+                }
+            except Exception as e:  # pylint: disable=broad-except
+                print(f"Error: {e}")
                 return None
-            return {
-                'document': doc.text,
-                'entities': entities['entities'],
-                'summary': entities['summary']
-            }
 
     entities_list = []
     for result in tqdm_async(
@@ -128,31 +132,38 @@ async def generate_document_entities(doc):
     for doc in entities_list:
         entities = doc['entities']
         temp = []
-        for i in range(len(entities)):
+        for i, entity_i in enumerate(entities):
             for j in range(i + 1, len(entities)):
-                pair = (doc['document'], entities[i], entities[j])
+                entity_j = entities[j]
+                pair = (doc['document'], entity_i, entity_j)
                 temp.append(pair)
 
-        # There are too many pairs; they would produce lots of junk data and add compute cost, so limit each document to 10 random picks
+        # Computing all possible entity combinations is impractical, so we randomly sample 10 pairs
         pair_list.extend(random.sample(temp, min(len(temp), 10)))
 
 
     async def process_two_entity_relations(pair):
         async with semaphore:
-            document, entity1, entity2 = pair
-            response = await generate_two_entity_relations(
-                document, entity1, entity2,
-                task.openai_system_generate_two_entity_relations,
-                model_name)
-            return response
+            try:
+                document, entity1, entity2 = pair
+                response = await generate_two_entity_relations(
+                    document, entity1, entity2,
+                    task.openai_system_generate_two_entity_relations,
+                    model_name)
+                return response
+            except Exception as e:  # pylint: disable=broad-except
+                print(f"Error: {e}")
+                return None
 
     corpus= []
     for result in tqdm_async(
         asyncio.as_completed([process_two_entity_relations(pair) for pair in pair_list]),
         total=len(pair_list),
         desc="Generating two entity relations"
     ):
-        corpus.append(await result)
+        result = await result
+        if result:
+            corpus.append(result)
 
     # triple_list = []
     # for doc in entities_list:
@@ -196,8 +207,9 @@ async def generate_qa_sft(content):
     ):
         try:
             result = await result
-            qa_sft_results.update(_post_process_synthetic_data(result))
-        except Exception as e:
+            if result:
+                qa_sft_results.update(_post_process_synthetic_data(result))
+        except Exception as e:  # pylint: disable=broad-except
             print(f"Error: {e}")
 
     return qa_sft_results
@@ -225,5 +237,5 @@ async def generate_qa_sft(content):
     results = loop.run_until_complete(generate_synthetic_data_for_document(args.input_file, args.data_type))
 
     # Save results
-    with open(args.output_file, "w") as f:
+    with open(args.output_file, "w", encoding='utf-8') as f:
         json.dump(results, f, indent=4, ensure_ascii=False)
```
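The pair loop above enumerates every unordered entity pair, which grows as n(n−1)/2 for n entities, so the `random.sample(temp, min(len(temp), 10))` cap bounds the number of relation-generation calls per document. An equivalent form of that sampling step using `itertools.combinations` (a sketch for illustration, not the committed code):

```python
import random
from itertools import combinations

def sample_entity_pairs(document: str, entities: list[str], k: int = 10):
    """Build all unordered entity pairs for one document, then keep at most k."""
    pairs = [(document, a, b) for a, b in combinations(entities, 2)]
    return random.sample(pairs, min(len(pairs), k))

# Example: 5 entities yield 10 pairs; the cap keeps per-document cost bounded.
print(len(sample_entity_pairs("doc", ["A", "B", "C", "D", "E"], k=10)))
```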

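The common thread across these hunks is error isolation: each semaphore-bounded coroutine catches its own exception and returns None, and every consumer filters out the Nones rather than letting one failed API call abort the whole `asyncio.as_completed` loop. A minimal self-contained sketch of that pattern (the flaky task below is simulated, not taken from the repo):

```python
import asyncio
import random

semaphore = asyncio.Semaphore(4)

async def safe_task(i: int):
    """Bounded worker that swallows its own errors and returns None instead."""
    async with semaphore:
        try:
            if random.random() < 0.3:  # simulate a flaky API call
                raise RuntimeError(f"task {i} failed")
            return i * i
        except Exception as e:  # mirrors the broad-except in the diff
            print(f"Error: {e}")
            return None

async def main():
    results = []
    for fut in asyncio.as_completed([safe_task(i) for i in range(10)]):
        result = await fut
        if result is not None:  # drop failures, as the refactored loops do
            results.append(result)
    print(sorted(results))

asyncio.run(main())
```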
baselines/EntiGraph/entigraph_utils/io_utils.py

Lines changed: 0 additions & 85 deletions
This file was deleted.

baselines/EntiGraph/entigraph_utils/prompt_utils.py

Lines changed: 2 additions & 0 deletions
```diff
@@ -1,3 +1,5 @@
+# pylint: disable=C0301
+
 QUALITY_FEW_SHOT_COT_PROMPT = """## Example 1
 ### Question
 In the context of "Les Misérables", written by Victor Hugo in 1862, what is the main setting of the novel? There is only one correct choice.
```
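For reference, C0301 is pylint's line-too-long check; disabling it module-wide is a common choice for prompt files, since few-shot strings like the one above cannot be wrapped without changing the prompt text itself.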

baselines/EntiGraph/inference/devapi.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -1,6 +1,6 @@
-from openai import AsyncOpenAI
-import dotenv
 import os
+import dotenv
+from openai import AsyncOpenAI
 
 dotenv.load_dotenv()
 
```
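This change only reorders the imports into the conventional standard-library-first grouping (PEP 8). For context, a minimal sketch of how a module like this typically wires its client; the explicit environment-variable handling below is an assumption, not taken from the diff:

```python
import os

import dotenv
from openai import AsyncOpenAI

dotenv.load_dotenv()  # copy variables from a local .env file into os.environ

# AsyncOpenAI() reads OPENAI_API_KEY from the environment by default;
# passing it explicitly is equivalent, just more visible.
client = AsyncOpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
```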
baselines/EntiGraph/tasks/baseline_task.py

Lines changed: 9 additions & 8 deletions
```diff
@@ -1,8 +1,9 @@
 # Rewrite from https://github.com/ZitongYang/Synthetic_Continued_Pretraining/blob/main/tasks/quality.py
+
 import json
 from hashlib import md5
 
-from .task_abc import Document, Task
+from baselines.EntiGraph.tasks.task_abc import Document, Task
 from baselines.EntiGraph.entigraph_utils.prompt_utils import (
     OPENAI_API_SYSTEM_QUALITY_GENERATE_ENTITIES,
     OPENAI_API_SYSTEM_QUALITY_GENERATE_TWO_ENTITY_RELATIONS,
@@ -16,14 +17,19 @@ class BaselineTask(Task):
     openai_system_quality_qa_sft = OPENAI_API_SYSTEM_QUALITY_QA_SFT
     llama_cot_prompt = QUALITY_FEW_SHOT_COT_PROMPT
 
+    def __init__(self, input_file: str, data_type: str):
+        self._data = self._load_split(input_file, data_type)
+        self._create_documents()
+        self._dedup()
+
     @staticmethod
     def _load_split(input_file: str, data_type: str):
         if data_type == 'raw':
-            with open(input_file, "r") as f:
+            with open(input_file, "r", encoding='utf-8') as f:
                 data = [json.loads(line) for line in f]
                 data = [[chunk] for chunk in data]
         elif data_type == 'chunked':
-            with open(input_file, "r") as f:
+            with open(input_file, "r", encoding='utf-8') as f:
                 data = json.load(f)
 
         documents = []
@@ -49,11 +55,6 @@ def _dedup(self):
         self.documents = list(deuped_documents.values())
 
 
-    def __init__(self, input_file: str, data_type: str):
-        self._data = self._load_split(input_file, data_type)
-        self._create_documents()
-        self._dedup()
-
     def performance_stats(self):
         pass
 
```
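Moving `__init__` above the static helpers puts construction at the top of the class, matching how it is consumed. A hedged usage sketch (the input path and flag values are invented for illustration):

```python
from baselines.EntiGraph.tasks.baseline_task import BaselineTask

# 'raw' expects a JSON Lines file, one chunk per line (see _load_split above).
task = BaselineTask(input_file="corpus.jsonl", data_type="raw")

# __init__ loads, builds, and deduplicates documents in one pass,
# so the instance is ready to use immediately.
print(len(task.documents))  # _dedup populates self.documents
```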
