
Commit 7464aa2

actions-user and GitHub Action authored
feat: add examples in metrics (#314)
* feat: add Instruction Quality Evaluation
* feat: add examples in metrics
* 📚 Auto-update metrics documentation

Co-authored-by: GitHub Action <[email protected]>
1 parent 146d604 commit 7464aa2

11 files changed: +121 −186 lines

dingo/model/llm/instruction_quality/llm_instruction_clarity.py

Lines changed: 2 additions & 1 deletion
@@ -38,7 +38,8 @@ class LLMInstructionClarity(BaseOpenAI):
         "metric_name": "LLMInstructionClarity",
         "description": "Evaluates instruction clarity across four dimensions: self-descriptiveness, consistency, specificity, and completeness",
         "paper_source": "IFEval (Google, 2023), Self-Instruct (UW, 2023)",
-        "evaluation_results": "Returns clarity score (0-10) and detailed analysis"
+        "evaluation_results": "Returns clarity score (0-10) and detailed analysis",
+        "examples": "examples/sft/evaluate_instruction_quality.py"
     }

     prompt = """

dingo/model/llm/instruction_quality/llm_task_difficulty.py

Lines changed: 2 additions & 1 deletion
@@ -38,7 +38,8 @@ class LLMTaskDifficulty(BaseOpenAI):
         "metric_name": "LLMTaskDifficulty",
         "description": "Evaluates task difficulty across cognitive complexity, step complexity, domain knowledge, and constraint density",
         "paper_source": "OpenAI Math Problem Difficulty (2024), Google DeepMind Task Complexity (2023)",
-        "evaluation_results": "Returns difficulty level (1-10) with detailed breakdown"
+        "evaluation_results": "Returns difficulty level (1-10) with detailed breakdown",
+        "examples": "examples/sft/evaluate_instruction_quality.py"
     }

     prompt = """

dingo/model/llm/rag/llm_rag_answer_relevancy.py

Lines changed: 1 addition & 0 deletions
@@ -39,6 +39,7 @@ class LLMRAGAnswerRelevancy(BaseOpenAI):
         "description": "评估答案是否直接回答问题,检测无关和冗余信息",
         "paper_title": "RAGAS: Automated Evaluation of Retrieval Augmented Generation",
         "paper_url": "https://arxiv.org/abs/2309.15217",
+        "examples": "examples/rag/dataset_rag_eval_baseline.py",
         "source_frameworks": "Ragas"
     }

dingo/model/llm/rag/llm_rag_context_precision.py

Lines changed: 1 addition & 0 deletions
@@ -39,6 +39,7 @@ class LLMRAGContextPrecision(BaseOpenAI):
         "description": "评估检索上下文的精确度,包括相关性和排序质量",
         "paper_title": "RAGAS: Automated Evaluation of Retrieval Augmented Generation",
         "paper_url": "https://arxiv.org/abs/2309.15217",
+        "examples": "examples/rag/dataset_rag_eval_baseline.py",
         "source_frameworks": "Ragas"
     }

dingo/model/llm/rag/llm_rag_context_recall.py

Lines changed: 1 addition & 0 deletions
@@ -43,6 +43,7 @@ class LLMRAGContextRecall(BaseOpenAI):
         "description": "评估检索上下文的完整性,判断上下文是否能支持答案中的所有陈述",
         "paper_title": "RAGAS: Automated Evaluation of Retrieval Augmented Generation",
         "paper_url": "https://arxiv.org/abs/2309.15217",
+        "examples": "examples/rag/dataset_rag_eval_baseline.py",
         "source_frameworks": "Ragas + DeepEval"
     }

dingo/model/llm/rag/llm_rag_context_relevancy.py

Lines changed: 1 addition & 0 deletions
@@ -41,6 +41,7 @@ class LLMRAGContextRelevancy(BaseOpenAI):
         "description": "评估检索上下文与问题的相关性,检测噪声信息",
         "paper_title": "RAGAS: Automated Evaluation of Retrieval Augmented Generation",
         "paper_url": "https://arxiv.org/abs/2309.15217",
+        "examples": "examples/rag/dataset_rag_eval_baseline.py",
         "source_frameworks": "Ragas + DeepEval + TruLens"
     }

dingo/model/llm/rag/llm_rag_faithfulness.py

Lines changed: 1 addition & 0 deletions
@@ -39,6 +39,7 @@ class LLMRAGFaithfulness(BaseOpenAI):
         "description": "评估生成答案是否忠实于给定上下文,检测幻觉和编造信息",
         "paper_title": "RAGAS: Automated Evaluation of Retrieval Augmented Generation",
         "paper_url": "https://arxiv.org/abs/2309.15217",
+        "examples": "examples/rag/dataset_rag_eval_baseline.py",
         "source_frameworks": "Ragas + DeepEval"
     }
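
All five RAG metrics gain the same pointer to examples/rag/dataset_rag_eval_baseline.py. Below is a hedged sketch of a config that exercises them together: the metric names come from the hunks above, but the dataset path, field mapping, and model credentials are placeholders, and the authoritative wiring is the referenced baseline example.

# Hedged sketch only. The field mapping and dataset layout are guesses; see
# examples/rag/dataset_rag_eval_baseline.py (referenced by the new "examples"
# keys) for the real configuration. Import paths are assumptions.
from dingo.io import InputArgs    # assumed import path
from dingo.exec import Executor   # assumed import path

rag_config = {
    "task_name": "rag_baseline_demo",
    "input_path": "test/data/rag_samples.jsonl",    # hypothetical dataset path
    "output_path": "outputs/rag_baseline_demo/",
    "dataset": {"source": "local", "format": "jsonl"},
    "executor": {
        "max_workers": 5,
        "result_save": {"bad": True, "good": True, "all_labels": True},
    },
    "evaluator": [
        {
            # hypothetical field names; the baseline example defines the real mapping
            "fields": {"prompt": "question", "content": "answer", "context": "contexts"},
            "evals": [
                {"name": name,
                 "config": {"model": "gpt-4o", "key": "YOUR_API_KEY",
                            "api_url": "https://api.openai.com/v1"}}
                for name in (
                    "LLMRAGAnswerRelevancy",
                    "LLMRAGContextPrecision",
                    "LLMRAGContextRecall",
                    "LLMRAGContextRelevancy",
                    "LLMRAGFaithfulness",
                )
            ],
        }
    ],
}

print(Executor.exec_map["local"](InputArgs(**rag_config)).execute())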

dingo/model/llm/text_quality/llm_text_quality_v5.py

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@ class LLMTextQualityV5(BaseTextQuality):
         "paper_title": "WanJuanSiLu: A High-Quality Open-Source Webtext Dataset for Low-Resource Languages",
         "paper_url": "https://arxiv.org/abs/2501.14506",
         "paper_authors": "Yu et al., 2025",
+        "examples": "examples/llm_and_rule/llm_local.py",
         "evaluation_results": "docs/eval/prompt/redpajama_data_evaluated_by_prompt.md"
     }
     prompt = """

docs/metrics.md

Lines changed: 81 additions & 81 deletions
Large diffs are not rendered by default.

examples/sft/evaluate_instruction_quality.py

Lines changed: 2 additions & 99 deletions
@@ -110,7 +110,7 @@ def evaluate_task_difficulty():
         "executor": {
             "max_workers": 5,
             "result_save": {
-                "bad": False,  # 难度评估通常不需要保存"bad"
+                "bad": True,
                 "good": True,  # 保存所有评估结果
                 "all_labels": True
             }
@@ -172,7 +172,7 @@ def evaluate_both():
 
     input_data = {
         "task_name": "comprehensive_instruction_evaluation",
-        "input_path": "test/data/instructions.jsonl",
+        "input_path": str(Path("test/data/instructions.jsonl")),
         "output_path": "outputs/instruction_comprehensive/",
         "dataset": {
             "source": "local",
@@ -246,101 +246,6 @@ def evaluate_both():
     return summary
 
 
-def analyze_difficulty_distribution():
-    """分析任务难度分布(用于数据集平衡)"""
-    print("=" * 80)
-    print(" 任务难度分布分析")
-    print("=" * 80 + "\n")
-
-    input_data = {
-        "task_name": "difficulty_distribution_analysis",
-        "input_path": "test/data/instructions.jsonl",
-        "output_path": "outputs/difficulty_distribution/",
-        "dataset": {
-            "source": "local",
-            "format": "jsonl"
-        },
-        "executor": {
-            "max_workers": 10,
-            "result_save": {
-                "bad": False,
-                "good": True,
-                "all_labels": True
-            }
-        },
-        "evaluator": [
-            {
-                "fields": {"content": "instruction"},
-                "evals": [
-                    {
-                        "name": "LLMTaskDifficulty",
-                        "config": {
-                            "model": OPENAI_MODEL,
-                            "key": OPENAI_API_KEY,
-                            "api_url": OPENAI_BASE_URL
-                        }
-                    }
-                ]
-            }
-        ]
-    }
-
-    input_args = InputArgs(**input_data)
-    executor = Executor.exec_map["local"](input_args)
-    summary = executor.execute()
-
-    # 分析结果
-    good_list = executor.get_good_info_list()
-
-    # 统计难度分布
-    difficulty_counts = {
-        "Easy (0-3)": 0,
-        "Moderate (4-6)": 0,
-        "Hard (7-8)": 0,
-        "Expert (9-10)": 0
-    }
-
-    total_score = 0
-    for item in good_list:
-        eval_details = item.get('eval_details', {})
-        for field, details in eval_details.items():
-            for detail in details:
-                if detail.get('metric') == 'LLMTaskDifficulty':
-                    score = detail.get('score', 0)
-                    total_score += score
-
-                    if score <= 3:
-                        difficulty_counts["Easy (0-3)"] += 1
-                    elif score <= 6:
-                        difficulty_counts["Moderate (4-6)"] += 1
-                    elif score <= 8:
-                        difficulty_counts["Hard (7-8)"] += 1
-                    else:
-                        difficulty_counts["Expert (9-10)"] += 1
-
-    print("\n" + "=" * 80)
-    print(" 难度分布分析")
-    print("=" * 80)
-    print(f"总数: {len(good_list)}")
-    if good_list:
-        print(f"平均难度: {total_score / len(good_list):.2f}/10")
-    print("\n难度级别分布:")
-    for level, count in difficulty_counts.items():
-        percentage = (count / len(good_list) * 100) if good_list else 0
-        print(f"  {level}: {count} ({percentage:.1f}%)")
-
-    print("\n💡 数据集平衡建议:")
-    # 理想分布: Easy 20%, Moderate 50%, Hard 25%, Expert 5%
-    if difficulty_counts["Easy (0-3)"] / len(good_list) > 0.3:
-        print("  ⚠️ 简单任务过多,考虑增加难度或过滤部分简单任务")
-    if difficulty_counts["Moderate (4-6)"] / len(good_list) < 0.3:
-        print("  ⚠️ 中等难度任务不足,这是 SFT 的核心部分")
-    if difficulty_counts["Hard (7-8)"] / len(good_list) > 0.4:
-        print("  ⚠️ 困难任务过多,可能影响训练效率")
-
-    return summary
-
-
 if __name__ == "__main__":
     import sys
 
@@ -361,8 +266,6 @@ def analyze_difficulty_distribution():
         evaluate_instruction_clarity()
     elif mode == "difficulty":
        evaluate_task_difficulty()
-    elif mode == "distribution":
-        analyze_difficulty_distribution()
     else:
         evaluate_both()
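
The "distribution" mode and analyze_difficulty_distribution() are removed from the example script. If that analysis is still needed, a trimmed version can live outside the script; the sketch below is only a reconstruction based on the deleted code above (eval_details entries carrying "metric" and "score"), not part of this commit.

# Reconstructed helper, assuming an executor produced by the example script;
# the result-shape assumptions come from the code deleted in this commit.
from collections import Counter

def difficulty_histogram(executor):
    """Bucket LLMTaskDifficulty scores from a finished dingo run."""
    buckets = Counter()
    for item in executor.get_good_info_list():
        for details in item.get("eval_details", {}).values():
            for detail in details:
                if detail.get("metric") != "LLMTaskDifficulty":
                    continue
                score = detail.get("score", 0)
                if score <= 3:
                    buckets["Easy (0-3)"] += 1
                elif score <= 6:
                    buckets["Moderate (4-6)"] += 1
                elif score <= 8:
                    buckets["Hard (7-8)"] += 1
                else:
                    buckets["Expert (9-10)"] += 1
    return buckets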
