
Commit 7464aa2

actions-user and GitHub Action authored
feat: add examples in metrics (#314)
* feat: add Instruction Quality Evaluation
* feat: add examples in metrics
* 📚 Auto-update metrics documentation

Co-authored-by: GitHub Action <[email protected]>
1 parent 146d604 commit 7464aa2

11 files changed: +121 −186 lines

dingo/model/llm/instruction_quality/llm_instruction_clarity.py

Lines changed: 2 additions & 1 deletion
@@ -38,7 +38,8 @@ class LLMInstructionClarity(BaseOpenAI):
         "metric_name": "LLMInstructionClarity",
         "description": "Evaluates instruction clarity across four dimensions: self-descriptiveness, consistency, specificity, and completeness",
         "paper_source": "IFEval (Google, 2023), Self-Instruct (UW, 2023)",
-        "evaluation_results": "Returns clarity score (0-10) and detailed analysis"
+        "evaluation_results": "Returns clarity score (0-10) and detailed analysis",
+        "examples": "examples/sft/evaluate_instruction_quality.py"
     }

     prompt = """

dingo/model/llm/instruction_quality/llm_task_difficulty.py

Lines changed: 2 additions & 1 deletion
@@ -38,7 +38,8 @@ class LLMTaskDifficulty(BaseOpenAI):
         "metric_name": "LLMTaskDifficulty",
         "description": "Evaluates task difficulty across cognitive complexity, step complexity, domain knowledge, and constraint density",
         "paper_source": "OpenAI Math Problem Difficulty (2024), Google DeepMind Task Complexity (2023)",
-        "evaluation_results": "Returns difficulty level (1-10) with detailed breakdown"
+        "evaluation_results": "Returns difficulty level (1-10) with detailed breakdown",
+        "examples": "examples/sft/evaluate_instruction_quality.py"
     }

     prompt = """

dingo/model/llm/rag/llm_rag_answer_relevancy.py

Lines changed: 1 addition & 0 deletions
@@ -39,6 +39,7 @@ class LLMRAGAnswerRelevancy(BaseOpenAI):
         "description": "评估答案是否直接回答问题,检测无关和冗余信息",
         "paper_title": "RAGAS: Automated Evaluation of Retrieval Augmented Generation",
         "paper_url": "https://arxiv.org/abs/2309.15217",
+        "examples": "examples/rag/dataset_rag_eval_baseline.py",
         "source_frameworks": "Ragas"
     }

dingo/model/llm/rag/llm_rag_context_precision.py

Lines changed: 1 addition & 0 deletions
@@ -39,6 +39,7 @@ class LLMRAGContextPrecision(BaseOpenAI):
         "description": "评估检索上下文的精确度,包括相关性和排序质量",
         "paper_title": "RAGAS: Automated Evaluation of Retrieval Augmented Generation",
         "paper_url": "https://arxiv.org/abs/2309.15217",
+        "examples": "examples/rag/dataset_rag_eval_baseline.py",
         "source_frameworks": "Ragas"
     }

dingo/model/llm/rag/llm_rag_context_recall.py

Lines changed: 1 addition & 0 deletions
@@ -43,6 +43,7 @@ class LLMRAGContextRecall(BaseOpenAI):
         "description": "评估检索上下文的完整性,判断上下文是否能支持答案中的所有陈述",
         "paper_title": "RAGAS: Automated Evaluation of Retrieval Augmented Generation",
         "paper_url": "https://arxiv.org/abs/2309.15217",
+        "examples": "examples/rag/dataset_rag_eval_baseline.py",
         "source_frameworks": "Ragas + DeepEval"
     }

dingo/model/llm/rag/llm_rag_context_relevancy.py

Lines changed: 1 addition & 0 deletions
@@ -41,6 +41,7 @@ class LLMRAGContextRelevancy(BaseOpenAI):
         "description": "评估检索上下文与问题的相关性,检测噪声信息",
         "paper_title": "RAGAS: Automated Evaluation of Retrieval Augmented Generation",
         "paper_url": "https://arxiv.org/abs/2309.15217",
+        "examples": "examples/rag/dataset_rag_eval_baseline.py",
         "source_frameworks": "Ragas + DeepEval + TruLens"
     }

dingo/model/llm/rag/llm_rag_faithfulness.py

Lines changed: 1 addition & 0 deletions
@@ -39,6 +39,7 @@ class LLMRAGFaithfulness(BaseOpenAI):
         "description": "评估生成答案是否忠实于给定上下文,检测幻觉和编造信息",
         "paper_title": "RAGAS: Automated Evaluation of Retrieval Augmented Generation",
         "paper_url": "https://arxiv.org/abs/2309.15217",
+        "examples": "examples/rag/dataset_rag_eval_baseline.py",
         "source_frameworks": "Ragas + DeepEval"
     }
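
All five RAG metrics gain the same pointer to examples/rag/dataset_rag_eval_baseline.py. Below is a hedged sketch of a config that exercises them together: the metric names come from the hunks above, but the dataset path, field mapping, and model credentials are placeholders, and the authoritative wiring is the referenced baseline example.

# Hedged sketch only. The field mapping and dataset layout are guesses; see
# examples/rag/dataset_rag_eval_baseline.py (referenced by the new "examples"
# keys) for the real configuration. Import paths are assumptions.
from dingo.io import InputArgs    # assumed import path
from dingo.exec import Executor   # assumed import path

rag_config = {
    "task_name": "rag_baseline_demo",
    "input_path": "test/data/rag_samples.jsonl",    # hypothetical dataset path
    "output_path": "outputs/rag_baseline_demo/",
    "dataset": {"source": "local", "format": "jsonl"},
    "executor": {
        "max_workers": 5,
        "result_save": {"bad": True, "good": True, "all_labels": True},
    },
    "evaluator": [
        {
            # hypothetical field names; the baseline example defines the real mapping
            "fields": {"prompt": "question", "content": "answer", "context": "contexts"},
            "evals": [
                {"name": name,
                 "config": {"model": "gpt-4o", "key": "YOUR_API_KEY",
                            "api_url": "https://api.openai.com/v1"}}
                for name in (
                    "LLMRAGAnswerRelevancy",
                    "LLMRAGContextPrecision",
                    "LLMRAGContextRecall",
                    "LLMRAGContextRelevancy",
                    "LLMRAGFaithfulness",
                )
            ],
        }
    ],
}

print(Executor.exec_map["local"](InputArgs(**rag_config)).execute())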

dingo/model/llm/text_quality/llm_text_quality_v5.py

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@ class LLMTextQualityV5(BaseTextQuality):
         "paper_title": "WanJuanSiLu: A High-Quality Open-Source Webtext Dataset for Low-Resource Languages",
         "paper_url": "https://arxiv.org/abs/2501.14506",
         "paper_authors": "Yu et al., 2025",
+        "examples": "examples/llm_and_rule/llm_local.py",
         "evaluation_results": "docs/eval/prompt/redpajama_data_evaluated_by_prompt.md"
     }
     prompt = """

docs/metrics.md

Lines changed: 81 additions & 81 deletions
Large diffs are not rendered by default.

examples/sft/evaluate_instruction_quality.py

Lines changed: 2 additions & 99 deletions
@@ -110,7 +110,7 @@ def evaluate_task_difficulty():
         "executor": {
             "max_workers": 5,
             "result_save": {
-                "bad": False,  # 难度评估通常不需要保存"bad"
+                "bad": True,
                 "good": True,  # 保存所有评估结果
                 "all_labels": True
             }
@@ -172,7 +172,7 @@ def evaluate_both():
 
     input_data = {
         "task_name": "comprehensive_instruction_evaluation",
-        "input_path": "test/data/instructions.jsonl",
+        "input_path": str(Path("test/data/instructions.jsonl")),
         "output_path": "outputs/instruction_comprehensive/",
         "dataset": {
             "source": "local",
@@ -246,101 +246,6 @@ def evaluate_both():
     return summary
 
 
-def analyze_difficulty_distribution():
-    """分析任务难度分布(用于数据集平衡)"""
-    print("=" * 80)
-    print(" 任务难度分布分析")
-    print("=" * 80 + "\n")
-
-    input_data = {
-        "task_name": "difficulty_distribution_analysis",
-        "input_path": "test/data/instructions.jsonl",
-        "output_path": "outputs/difficulty_distribution/",
-        "dataset": {
-            "source": "local",
-            "format": "jsonl"
-        },
-        "executor": {
-            "max_workers": 10,
-            "result_save": {
-                "bad": False,
-                "good": True,
-                "all_labels": True
-            }
-        },
-        "evaluator": [
-            {
-                "fields": {"content": "instruction"},
-                "evals": [
-                    {
-                        "name": "LLMTaskDifficulty",
-                        "config": {
-                            "model": OPENAI_MODEL,
-                            "key": OPENAI_API_KEY,
-                            "api_url": OPENAI_BASE_URL
-                        }
-                    }
-                ]
-            }
-        ]
-    }
-
-    input_args = InputArgs(**input_data)
-    executor = Executor.exec_map["local"](input_args)
-    summary = executor.execute()
-
-    # 分析结果
-    good_list = executor.get_good_info_list()
-
-    # 统计难度分布
-    difficulty_counts = {
-        "Easy (0-3)": 0,
-        "Moderate (4-6)": 0,
-        "Hard (7-8)": 0,
-        "Expert (9-10)": 0
-    }
-
-    total_score = 0
-    for item in good_list:
-        eval_details = item.get('eval_details', {})
-        for field, details in eval_details.items():
-            for detail in details:
-                if detail.get('metric') == 'LLMTaskDifficulty':
-                    score = detail.get('score', 0)
-                    total_score += score
-
-                    if score <= 3:
-                        difficulty_counts["Easy (0-3)"] += 1
-                    elif score <= 6:
-                        difficulty_counts["Moderate (4-6)"] += 1
-                    elif score <= 8:
-                        difficulty_counts["Hard (7-8)"] += 1
-                    else:
-                        difficulty_counts["Expert (9-10)"] += 1
-
-    print("\n" + "=" * 80)
-    print(" 难度分布分析")
-    print("=" * 80)
-    print(f"总数: {len(good_list)}")
-    if good_list:
-        print(f"平均难度: {total_score / len(good_list):.2f}/10")
-    print("\n难度级别分布:")
-    for level, count in difficulty_counts.items():
-        percentage = (count / len(good_list) * 100) if good_list else 0
-        print(f"  {level}: {count} ({percentage:.1f}%)")
-
-    print("\n💡 数据集平衡建议:")
-    # 理想分布: Easy 20%, Moderate 50%, Hard 25%, Expert 5%
-    if difficulty_counts["Easy (0-3)"] / len(good_list) > 0.3:
-        print("  ⚠️ 简单任务过多,考虑增加难度或过滤部分简单任务")
-    if difficulty_counts["Moderate (4-6)"] / len(good_list) < 0.3:
-        print("  ⚠️ 中等难度任务不足,这是 SFT 的核心部分")
-    if difficulty_counts["Hard (7-8)"] / len(good_list) > 0.4:
-        print("  ⚠️ 困难任务过多,可能影响训练效率")
-
-    return summary
-
-
 if __name__ == "__main__":
     import sys
 
@@ -361,8 +266,6 @@ def analyze_difficulty_distribution():
         evaluate_instruction_clarity()
     elif mode == "difficulty":
        evaluate_task_difficulty()
-    elif mode == "distribution":
-        analyze_difficulty_distribution()
     else:
         evaluate_both()
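
The "distribution" mode and analyze_difficulty_distribution() are removed from the example script. If that analysis is still needed, a trimmed version can live outside the script; the sketch below is only a reconstruction based on the deleted code above (eval_details entries carrying "metric" and "score"), not part of this commit.

# Reconstructed helper, assuming an executor produced by the example script;
# the result-shape assumptions come from the code deleted in this commit.
from collections import Counter

def difficulty_histogram(executor):
    """Bucket LLMTaskDifficulty scores from a finished dingo run."""
    buckets = Counter()
    for item in executor.get_good_info_list():
        for details in item.get("eval_details", {}).values():
            for detail in details:
                if detail.get("metric") != "LLMTaskDifficulty":
                    continue
                score = detail.get("score", 0)
                if score <= 3:
                    buckets["Easy (0-3)"] += 1
                elif score <= 6:
                    buckets["Moderate (4-6)"] += 1
                elif score <= 8:
                    buckets["Hard (7-8)"] += 1
                else:
                    buckets["Expert (9-10)"] += 1
    return buckets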
