Commit 7cc8212

Feature/improve evaluation prompts (#147)

* refine: improve accuracy evaluation prompt definitions
* feat: add English prompts for consistency evaluation
* fix: fix too long prompt line
* fix: change hardcoded language to auto-detect
* fix: fix too long prompt line

1 parent afaafff

File tree

3 files changed: +228 −46 lines

graphgen/models/evaluator/kg/consistency_evaluator.py (20 additions, 12 deletions)

@@ -6,12 +6,9 @@
 from graphgen.bases import BaseGraphStorage, BaseKVStorage, BaseLLMWrapper
 from graphgen.bases.datatypes import Chunk
 from graphgen.templates.evaluation.kg.consistency_evaluation import (
-    ENTITY_DESCRIPTION_CONFLICT_PROMPT,
-    ENTITY_EXTRACTION_PROMPT,
-    ENTITY_TYPE_CONFLICT_PROMPT,
-    RELATION_CONFLICT_PROMPT,
+    CONSISTENCY_EVALUATION_PROMPT,
 )
-from graphgen.utils import logger
+from graphgen.utils import detect_main_language, logger

@@ -194,7 +191,9 @@ def _extract_entity_from_chunk(
         # Clean entity_id: remove surrounding quotes if present
         clean_entity_id = self._clean_entity_id(entity_id)

-        prompt = ENTITY_EXTRACTION_PROMPT.format(
+        # Detect language and get appropriate prompt
+        lang = detect_main_language(chunk.content)
+        prompt = CONSISTENCY_EVALUATION_PROMPT[lang]["ENTITY_EXTRACTION"].format(
             entity_name=clean_entity_id,
             chunk_content=chunk.content[:2000]
             if chunk.content

@@ -270,8 +269,11 @@ def _check_entity_type_consistency(
             if entity_type
         ]

-        prompt = ENTITY_TYPE_CONFLICT_PROMPT.format(
-            entity_name=entity_id, type_extractions="\n".join(type_list)
+        # Detect language from type extraction text
+        type_text = "\n".join(type_list)
+        lang = detect_main_language(type_text)
+        prompt = CONSISTENCY_EVALUATION_PROMPT[lang]["ENTITY_TYPE_CONFLICT"].format(
+            entity_name=entity_id, type_extractions=type_text
         )

         response = asyncio.run(self.llm_client.generate_answer(prompt))

@@ -313,8 +315,11 @@ def _check_entity_description_consistency(
             for chunk_id, description in valid_descriptions.items()
         ]

-        prompt = ENTITY_DESCRIPTION_CONFLICT_PROMPT.format(
-            entity_name=entity_id, descriptions="\n".join(desc_list)
+        # Detect language from description text
+        desc_text = "\n".join(desc_list)
+        lang = detect_main_language(desc_text)
+        prompt = CONSISTENCY_EVALUATION_PROMPT[lang]["ENTITY_DESCRIPTION_CONFLICT"].format(
+            entity_name=entity_id, descriptions=desc_text
         )

         response = asyncio.run(self.llm_client.generate_answer(prompt))

@@ -351,10 +356,13 @@ def _check_relation_consistency(
             if relation
         ]

-        prompt = RELATION_CONFLICT_PROMPT.format(
+        # Detect language from relation description text
+        rel_text = "\n".join(rel_list)
+        lang = detect_main_language(rel_text)
+        prompt = CONSISTENCY_EVALUATION_PROMPT[lang]["RELATION_CONFLICT"].format(
             source_entity=src_id,
             target_entity=dst_id,
-            relation_descriptions="\n".join(rel_list),
+            relation_descriptions=rel_text,
         )

         response = asyncio.run(self.llm_client.generate_answer(prompt))
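The auto-detect pattern introduced in the diff above can be sketched in isolation. The CJK-ratio heuristic below is a hypothetical stand-in for graphgen's `detect_main_language` (whose real implementation is not shown in this commit); the only contract taken from the diff is that the function returns one of the keys ("zh"/"en") used to index CONSISTENCY_EVALUATION_PROMPT. The toy prompt dict here is likewise illustrative, not the real templates.

```python
# Sketch of the language-keyed prompt lookup. The detect function is a
# hypothetical stand-in; only the "zh"/"en" key contract comes from the diff.
PROMPTS = {
    "zh": {"ENTITY_EXTRACTION": "从文本块中提取实体 {entity_name}"},
    "en": {"ENTITY_EXTRACTION": "Extract entity {entity_name} from the text block"},
}

def detect_main_language(text: str) -> str:
    """Return "zh" if CJK characters dominate the text, else "en" (assumed contract)."""
    if not text:
        return "en"
    cjk = sum(1 for ch in text if "\u4e00" <= ch <= "\u9fff")
    return "zh" if cjk / len(text) > 0.5 else "en"

lang = detect_main_language("蛋白质A与蛋白质B相互作用")
prompt = PROMPTS[lang]["ENTITY_EXTRACTION"].format(entity_name="蛋白质A")
```

Detecting the language from the same text that is interpolated into the prompt (chunk content, joined descriptions, joined relation texts) is what lets each LLM call receive instructions in the language of its evidence.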

graphgen/templates/evaluation/kg/accuracy_evaluation.py (76 additions, 28 deletions)

@@ -1,15 +1,27 @@
 ENTITY_EVALUATION_PROMPT_ZH = """你是一个知识图谱质量评估专家。你的任务是从给定的文本块和提取的实体列表,评估实体提取的质量。
 
 评估维度:
-1. ACCURACY (准确性, 权重: 40%): 提取的实体是否正确,是否有误提取或错误识别
-2. COMPLETENESS (完整性, 权重: 40%): 是否遗漏了文本中的重要实体
-3. PRECISION (精确性, 权重: 20%): 提取的实体是否精确,命名是否准确
+1. ACCURACY (准确性, 权重: 40%): 提取的实体是否真实存在于文本中,是否存在误提取(False Positive)
+   - 检查:实体是否在文本中实际出现,是否将非实体文本误识别为实体
+   - 示例:文本提到"蛋白质A",但提取了文本中不存在的"蛋白质B" → 准确性低
+   - 示例:将"研究显示"这样的非实体短语提取为实体 → 准确性低
+
+2. COMPLETENESS (完整性, 权重: 40%): 是否遗漏了文本中的重要实体(Recall)
+   - 检查:文本中的重要实体是否都被提取,是否存在遗漏(False Negative)
+   - 示例:文本提到5个重要蛋白质,但只提取了3个 → 完整性低
+   - 示例:所有关键实体都被提取 → 完整性高
+
+3. PRECISION (精确性, 权重: 20%): 提取的实体命名是否精确、边界是否准确、类型是否正确
+   - 检查:实体名称是否完整准确,边界是否正确,实体类型分类是否正确
+   - 示例:应提取"人类胰岛素受体蛋白",但只提取了"胰岛素" → 精确性低(边界不准确)
+   - 示例:应分类为"蛋白质",但分类为"基因" → 精确性低(类型错误)
+   - 示例:应提取"COVID-19",但提取了"冠状病毒" → 精确性低(命名不够精确)
 
 评分标准(每个维度 0-1 分):
-- EXCELLENT (0.8-1.0): 高质量提取
-- GOOD (0.6-0.79): 良好质量,有少量问题
-- ACCEPTABLE (0.4-0.59): 可接受,有明显问题
-- POOR (0.0-0.39): 质量差,需要改进
+- EXCELLENT (0.8-1.0): 高质量提取,错误率 < 20%
+- GOOD (0.6-0.79): 良好质量,有少量问题,错误率 20-40%
+- ACCEPTABLE (0.4-0.59): 可接受,有明显问题,错误率 40-60%
+- POOR (0.0-0.39): 质量差,需要改进,错误率 > 60%
 
 综合评分 = 0.4 × Accuracy + 0.4 × Completeness + 0.2 × Precision

@@ -38,15 +50,27 @@
 Your task is to evaluate the quality of entity extraction from a given text block and extracted entity list.
 
 Evaluation Dimensions:
-1. ACCURACY (Weight: 40%): Whether the extracted entities are correct, and if there are any false extractions or misidentifications
-2. COMPLETENESS (Weight: 40%): Whether important entities from the text are missing
-3. PRECISION (Weight: 20%): Whether the extracted entities are precise and accurately named
+1. ACCURACY (Weight: 40%): Whether the extracted entities actually exist in the text, and if there are any false extractions (False Positives)
+   - Check: Do entities actually appear in the text? Are non-entity phrases incorrectly identified as entities?
+   - Example: Text mentions "Protein A", but "Protein B" (not in text) is extracted → Low accuracy
+   - Example: Phrases like "research shows" are extracted as entities → Low accuracy
+
+2. COMPLETENESS (Weight: 40%): Whether important entities from the text are missing (Recall, False Negatives)
+   - Check: Are all important entities from the text extracted? Are there any omissions?
+   - Example: Text mentions 5 important proteins, but only 3 are extracted → Low completeness
+   - Example: All key entities are extracted → High completeness
+
+3. PRECISION (Weight: 20%): Whether extracted entities are precisely named, have correct boundaries, and correct types
+   - Check: Are entity names complete and accurate? Are boundaries correct? Are entity types correctly classified?
+   - Example: Should extract "Human Insulin Receptor Protein", but only "Insulin" is extracted → Low precision (incorrect boundary)
+   - Example: Should be classified as "Protein", but classified as "Gene" → Low precision (incorrect type)
+   - Example: Should extract "COVID-19", but "Coronavirus" is extracted → Low precision (naming not precise enough)
 
 Scoring Criteria (0-1 scale for each dimension):
-- EXCELLENT (0.8-1.0): High-quality extraction
-- GOOD (0.6-0.79): Good quality with minor issues
-- ACCEPTABLE (0.4-0.59): Acceptable with noticeable issues
-- POOR (0.0-0.39): Poor quality, needs improvement
+- EXCELLENT (0.8-1.0): High-quality extraction, error rate < 20%
+- GOOD (0.6-0.79): Good quality with minor issues, error rate 20-40%
+- ACCEPTABLE (0.4-0.59): Acceptable with noticeable issues, error rate 40-60%
+- POOR (0.0-0.39): Poor quality, needs improvement, error rate > 60%
 
 Overall Score = 0.4 × Accuracy + 0.4 × Completeness + 0.2 × Precision

@@ -74,15 +98,27 @@
 RELATION_EVALUATION_PROMPT_ZH = """你是一个知识图谱质量评估专家。你的任务是从给定的文本块和提取的关系列表,评估关系抽取的质量。
 
 评估维度:
-1. ACCURACY (准确性, 权重: 40%): 提取的关系是否正确,关系描述是否准确
-2. COMPLETENESS (完整性, 权重: 40%): 是否遗漏了文本中的重要关系
-3. PRECISION (精确性, 权重: 20%): 关系描述是否精确,是否过于宽泛
+1. ACCURACY (准确性, 权重: 40%): 提取的关系是否真实存在于文本中,是否存在误提取(False Positive)
+   - 检查:关系是否在文本中实际表达,是否将不存在的关系误识别为关系
+   - 示例:文本中A和B没有关系,但提取了"A-作用于->B" → 准确性低
+   - 示例:将文本中的并列关系误识别为因果关系 → 准确性低
+
+2. COMPLETENESS (完整性, 权重: 40%): 是否遗漏了文本中的重要关系(Recall)
+   - 检查:文本中表达的重要关系是否都被提取,是否存在遗漏(False Negative)
+   - 示例:文本明确表达了5个关系,但只提取了3个 → 完整性低
+   - 示例:所有关键关系都被提取 → 完整性高
+
+3. PRECISION (精确性, 权重: 20%): 关系描述是否精确,关系类型是否正确,是否过于宽泛
+   - 检查:关系类型是否准确,关系描述是否具体,是否使用了过于宽泛的关系类型
+   - 示例:应提取"抑制"关系,但提取了"影响"关系 → 精确性低(类型不够精确)
+   - 示例:应提取"直接结合",但提取了"相关" → 精确性低(描述过于宽泛)
+   - 示例:关系方向是否正确(如"A激活B" vs "B被A激活")→ 精确性检查
 
 评分标准(每个维度 0-1 分):
-- EXCELLENT (0.8-1.0): 高质量提取
-- GOOD (0.6-0.79): 良好质量,有少量问题
-- ACCEPTABLE (0.4-0.59): 可接受,有明显问题
-- POOR (0.0-0.39): 质量差,需要改进
+- EXCELLENT (0.8-1.0): 高质量提取,错误率 < 20%
+- GOOD (0.6-0.79): 良好质量,有少量问题,错误率 20-40%
+- ACCEPTABLE (0.4-0.59): 可接受,有明显问题,错误率 40-60%
+- POOR (0.0-0.39): 质量差,需要改进,错误率 > 60%
 
 综合评分 = 0.4 × Accuracy + 0.4 × Completeness + 0.2 × Precision

@@ -111,15 +147,27 @@
 Your task is to evaluate the quality of relation extraction from a given text block and extracted relation list.
 
 Evaluation Dimensions:
-1. ACCURACY (Weight: 40%): Whether the extracted relations are correct and the relation descriptions are accurate
-2. COMPLETENESS (Weight: 40%): Whether important relations from the text are missing
-3. PRECISION (Weight: 20%): Whether the relation descriptions are precise and not overly broad
+1. ACCURACY (Weight: 40%): Whether the extracted relations actually exist in the text, and if there are any false extractions (False Positives)
+   - Check: Do relations actually appear in the text? Are non-existent relations incorrectly identified?
+   - Example: Text shows no relation between A and B, but "A-acts_on->B" is extracted → Low accuracy
+   - Example: A parallel relationship in text is misidentified as a causal relationship → Low accuracy
+
+2. COMPLETENESS (Weight: 40%): Whether important relations from the text are missing (Recall, False Negatives)
+   - Check: Are all important relations expressed in the text extracted? Are there any omissions?
+   - Example: Text explicitly expresses 5 relations, but only 3 are extracted → Low completeness
+   - Example: All key relations are extracted → High completeness
+
+3. PRECISION (Weight: 20%): Whether relation descriptions are precise, relation types are correct, and not overly broad
+   - Check: Are relation types accurate? Are relation descriptions specific? Are overly broad relation types used?
+   - Example: Should extract "inhibits" relation, but "affects" is extracted → Low precision (type not precise enough)
+   - Example: Should extract "directly binds", but "related" is extracted → Low precision (description too broad)
+   - Example: Is relation direction correct (e.g., "A activates B" vs "B is activated by A") → Precision check
 
 Scoring Criteria (0-1 scale for each dimension):
-- EXCELLENT (0.8-1.0): High-quality extraction
-- GOOD (0.6-0.79): Good quality with minor issues
-- ACCEPTABLE (0.4-0.59): Acceptable with noticeable issues
-- POOR (0.0-0.39): Poor quality, needs improvement
+- EXCELLENT (0.8-1.0): High-quality extraction, error rate < 20%
+- GOOD (0.6-0.79): Good quality with minor issues, error rate 20-40%
+- ACCEPTABLE (0.4-0.59): Acceptable with noticeable issues, error rate 40-60%
+- POOR (0.0-0.39): Poor quality, needs improvement, error rate > 60%
 
 Overall Score = 0.4 × Accuracy + 0.4 × Completeness + 0.2 × Precision
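The weighting and score banding shared by all four prompts above reduce to a few lines. The helper names below are illustrative; only the 0.4/0.4/0.2 weights and the EXCELLENT/GOOD/ACCEPTABLE/POOR bands come from the prompt text.

```python
# Illustrative helpers; the weights and bands are taken verbatim from the
# accuracy-evaluation prompts (Overall = 0.4*Accuracy + 0.4*Completeness + 0.2*Precision).
def overall_score(accuracy: float, completeness: float, precision: float) -> float:
    return 0.4 * accuracy + 0.4 * completeness + 0.2 * precision

def grade(score: float) -> str:
    # Bands from the prompts: 0.8-1.0, 0.6-0.79, 0.4-0.59, 0.0-0.39
    if score >= 0.8:
        return "EXCELLENT"
    if score >= 0.6:
        return "GOOD"
    if score >= 0.4:
        return "ACCEPTABLE"
    return "POOR"

s = overall_score(0.9, 0.7, 0.5)  # 0.36 + 0.28 + 0.10 = 0.74
```

Note that accuracy and completeness dominate by design: a highly precise extraction that misses half the entities still lands in a low band.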

graphgen/templates/evaluation/kg/consistency_evaluation.py (132 additions, 6 deletions)

@@ -1,4 +1,4 @@
-ENTITY_TYPE_CONFLICT_PROMPT = """你是一个知识图谱一致性评估专家。你的任务是判断同一个实体在不同文本块中被提取为不同的类型,是否存在语义冲突。
+ENTITY_TYPE_CONFLICT_PROMPT_ZH = """你是一个知识图谱一致性评估专家。你的任务是判断同一个实体在不同文本块中被提取为不同的类型,是否存在语义冲突。
 
 实体名称:{entity_name}
 

@@ -21,7 +21,38 @@
 }}
 """
 
-ENTITY_DESCRIPTION_CONFLICT_PROMPT = """你是一个知识图谱一致性评估专家。你的任务是判断同一个实体在不同文本块中的描述是否存在语义冲突。
+ENTITY_TYPE_CONFLICT_PROMPT_EN = (
+    """You are a Knowledge Graph Consistency Assessment Expert. """
+    """Your task is to determine whether there are semantic conflicts """
+    """when the same entity is extracted as different types in different text blocks.
+
+Entity Name: {entity_name}
+
+Type extraction results from different text blocks:
+{type_extractions}
+
+Preset entity type list (for reference):
+concept, date, location, keyword, organization, person, event, work, nature, """
+    """artificial, science, technology, mission, gene
+
+Please determine whether these types have semantic conflicts """
+    """(i.e., whether they describe the same category of things, """
+    """or if there are contradictions).
+Note: If types are just different expressions of the same concept """
+    """(such as concept and keyword), it may not be considered a serious conflict.
+
+Please return in JSON format:
+{{
+    "has_conflict": <true/false>,
+    "conflict_severity": <float between 0-1, where 0 means no conflict, 1 means severe conflict>,
+    "conflict_reasoning": "<reasoning for conflict judgment>",
+    "conflicting_types": ["<pairs of conflicting types>"],
+    "recommended_type": "<if there is a conflict, the recommended correct type (must be one of the preset types)>"
+}}
+"""
+)
+
+ENTITY_DESCRIPTION_CONFLICT_PROMPT_ZH = """你是一个知识图谱一致性评估专家。你的任务是判断同一个实体在不同文本块中的描述是否存在语义冲突。
 
 实体名称:{entity_name}
 

@@ -40,7 +71,32 @@
 }}
 """
 
-RELATION_CONFLICT_PROMPT = """你是一个知识图谱一致性评估专家。你的任务是判断同一对实体在不同文本块中的关系描述是否存在语义冲突。
+ENTITY_DESCRIPTION_CONFLICT_PROMPT_EN = (
+    """You are a Knowledge Graph Consistency Assessment Expert. """
+    """Your task is to determine whether there are semantic conflicts """
+    """in the descriptions of the same entity across different text blocks.
+
+Entity Name: {entity_name}
+
+Descriptions from different text blocks:
+{descriptions}
+
+Please determine whether these descriptions have semantic conflicts """
+    """(i.e., whether they describe the same entity, """
+    """or if there is contradictory information).
+
+Please return in JSON format:
+{{
+    "has_conflict": <true/false>,
+    "conflict_severity": <float between 0-1>,
+    "conflict_reasoning": "<reasoning for conflict judgment>",
+    "conflicting_descriptions": ["<pairs of conflicting descriptions>"],
+    "conflict_details": "<specific conflict content>"
+}}
+"""
+)
+
+RELATION_CONFLICT_PROMPT_ZH = """你是一个知识图谱一致性评估专家。你的任务是判断同一对实体在不同文本块中的关系描述是否存在语义冲突。
 
 实体对:{source_entity} -> {target_entity}
 

@@ -58,7 +114,29 @@
 }}
 """
 
-ENTITY_EXTRACTION_PROMPT = """从以下文本块中提取指定实体的类型和描述。
+RELATION_CONFLICT_PROMPT_EN = (
+    """You are a Knowledge Graph Consistency Assessment Expert. """
+    """Your task is to determine whether there are semantic conflicts """
+    """in the relation descriptions of the same entity pair across different text blocks.
+
+Entity Pair: {source_entity} -> {target_entity}
+
+Relation descriptions from different text blocks:
+{relation_descriptions}
+
+Please determine whether these relation descriptions have semantic conflicts.
+
+Please return in JSON format:
+{{
+    "has_conflict": <true/false>,
+    "conflict_severity": <float between 0-1>,
+    "conflict_reasoning": "<reasoning for conflict judgment>",
+    "conflicting_relations": ["<pairs of conflicting relation descriptions>"]
+}}
+"""
+)
+
+ENTITY_EXTRACTION_PROMPT_ZH = """从以下文本块中提取指定实体的类型和描述。
 
 **重要**:你只需要提取指定的实体,不要提取其他实体。
 

@@ -96,7 +174,55 @@
 }}
 """
 
+ENTITY_EXTRACTION_PROMPT_EN = """Extract the type and description of the specified entity from the following text block.
+
+**Important**: You should only extract the specified entity, do not extract other entities.
+
+Entity Name: {entity_name}
+
+Text Block:
+{chunk_content}
+
+Please find and extract the following information for **this entity only** (entity name: {entity_name}) from the text block:
+
+1. entity_type: Entity type, must be one of the following preset types (lowercase):
+   - concept: concept
+   - date: date
+   - location: location
+   - keyword: keyword
+   - organization: organization
+   - person: person
+   - event: event
+   - work: work
+   - nature: nature
+   - artificial: artificial
+   - science: science
+   - technology: technology
+   - mission: mission
+   - gene: gene
+
+   If the type cannot be determined, please use "concept" as the default value.
+
+2. description: Entity description (briefly describe the role and characteristics of this entity in the text)
+
+Please return in JSON format:
+{{
+    "entity_type": "<entity type (must be one of the preset types above)>",
+    "description": "<entity description>"
+}}
+"""
+
 CONSISTENCY_EVALUATION_PROMPT = {
-    "en": "",
-    "zh": ""
+    "zh": {
+        "ENTITY_TYPE_CONFLICT": ENTITY_TYPE_CONFLICT_PROMPT_ZH,
+        "ENTITY_DESCRIPTION_CONFLICT": ENTITY_DESCRIPTION_CONFLICT_PROMPT_ZH,
+        "RELATION_CONFLICT": RELATION_CONFLICT_PROMPT_ZH,
+        "ENTITY_EXTRACTION": ENTITY_EXTRACTION_PROMPT_ZH,
+    },
+    "en": {
+        "ENTITY_TYPE_CONFLICT": ENTITY_TYPE_CONFLICT_PROMPT_EN,
+        "ENTITY_DESCRIPTION_CONFLICT": ENTITY_DESCRIPTION_CONFLICT_PROMPT_EN,
+        "RELATION_CONFLICT": RELATION_CONFLICT_PROMPT_EN,
+        "ENTITY_EXTRACTION": ENTITY_EXTRACTION_PROMPT_EN,
+    },
 }
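One detail worth noting in the templates above: the JSON skeletons use doubled braces (`{{`, `}}`) so that `str.format` emits them as literal braces while still substituting the single-brace placeholders. A small self-contained check, using a shortened stand-in template (the real RELATION_CONFLICT_PROMPT_EN is much longer):

```python
# Shortened, hypothetical stand-in for a conflict prompt; only the
# brace-escaping behaviour of str.format is demonstrated here.
TEMPLATE = (
    "Entity Pair: {source_entity} -> {target_entity}\n"
    "Please return in JSON format:\n"
    '{{\n    "has_conflict": <true/false>\n}}'
)

# Doubled braces collapse to single literal braces; placeholders are filled.
prompt = TEMPLATE.format(source_entity="Protein A", target_entity="Protein B")
```

Without the doubling, `.format` would raise `KeyError` on the JSON keys, so any edit to these templates has to preserve the `{{`/`}}` escaping.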
