From bc1513c86ed8ab4d96d261a4fdca5c68ea32902c Mon Sep 17 00:00:00 2001 From: chupei Date: Tue, 23 Dec 2025 17:35:09 +0800 Subject: [PATCH 1/2] feat: add Instruction Quality Evaluation --- .../model/llm/instruction_quality/__init__.py | 20 + .../llm_instruction_clarity.py | 306 +++++++++++ .../llm_task_difficulty.py | 350 +++++++++++++ docs/instruction_quality_guide.md | 478 ++++++++++++++++++ docs/metrics.md | 4 +- examples/sft/evaluate_instruction_quality.py | 374 ++++++++++++++ test/data/instructions.jsonl | 10 + 7 files changed, 1541 insertions(+), 1 deletion(-) create mode 100644 dingo/model/llm/instruction_quality/__init__.py create mode 100644 dingo/model/llm/instruction_quality/llm_instruction_clarity.py create mode 100644 dingo/model/llm/instruction_quality/llm_task_difficulty.py create mode 100644 docs/instruction_quality_guide.md create mode 100644 examples/sft/evaluate_instruction_quality.py create mode 100644 test/data/instructions.jsonl diff --git a/dingo/model/llm/instruction_quality/__init__.py b/dingo/model/llm/instruction_quality/__init__.py new file mode 100644 index 00000000..936576ab --- /dev/null +++ b/dingo/model/llm/instruction_quality/__init__.py @@ -0,0 +1,20 @@ +""" +Instruction Quality Evaluation Metrics + +This module provides LLM-based evaluators for assessing instruction quality +in SFT (Supervised Fine-Tuning) datasets, specifically focusing on: + +1. Instruction Clarity - Evaluates how clear and well-defined instructions are +2. Task Difficulty - Assesses the complexity and difficulty level of tasks + +These metrics are based on recent research in instruction following and +LLM training data quality assessment. +""" + +from dingo.model.llm.instruction_quality.llm_instruction_clarity import LLMInstructionClarity +from dingo.model.llm.instruction_quality.llm_task_difficulty import LLMTaskDifficulty + +__all__ = [ + "LLMInstructionClarity", + "LLMTaskDifficulty", +] diff --git a/dingo/model/llm/instruction_quality/llm_instruction_clarity.py b/dingo/model/llm/instruction_quality/llm_instruction_clarity.py new file mode 100644 index 00000000..847ccaea --- /dev/null +++ b/dingo/model/llm/instruction_quality/llm_instruction_clarity.py @@ -0,0 +1,306 @@ +""" +Instruction Clarity Evaluator - 指令清晰度评估器 + +Based on recent research: +- IFEval: Instruction Following Evaluation (Google, 2023) +- Self-Instruct (University of Washington, 2023) +- Alpaca: A Strong, Replicable Instruction-Following Model (Stanford, 2023) + +评估维度: +1. Self-Descriptiveness: 指令是否自包含,无需额外上下文 +2. Consistency: 指令内部是否一致,无矛盾 +3. Specificity: 指令是否具体明确,避免歧义 +4. 
Completeness: 指令是否完整,包含所有必要信息 +""" + +from dingo.io.output.eval_detail import EvalDetail +from dingo.model import Model +from dingo.model.llm.base_openai import BaseOpenAI +from dingo.utils import log + + +@Model.llm_register("LLMInstructionClarity") +class LLMInstructionClarity(BaseOpenAI): + """ + LLM-based instruction clarity evaluator + + 评估指令的清晰度,包括: + - 自描述性:是否包含足够信息 + - 一致性:内部是否有矛盾 + - 具体性:是否明确具体 + - 完整性:是否包含所有必要信息 + """ + + # Metadata for documentation generation + _metric_info = { + "category": "SFT Data Assessment Metrics", + "quality_dimension": "INSTRUCTION_CLARITY", + "metric_name": "LLMInstructionClarity", + "description": "Evaluates instruction clarity across four dimensions: self-descriptiveness, consistency, specificity, and completeness", + "paper_source": "IFEval (Google, 2023), Self-Instruct (UW, 2023)", + "evaluation_results": "Returns clarity score (0-10) and detailed analysis" + } + + prompt = """ +# Role +You are an expert in evaluating instruction quality for Large Language Model training data. + +# Task +Evaluate the clarity of the given instruction across four dimensions. + +# Evaluation Dimensions + +## 1. Self-Descriptiveness (自描述性) +**Definition**: Does the instruction contain sufficient information to be understood without additional context? + +**Scoring**: +- **High (2.5)**: Complete self-contained instruction with all necessary details + - Example: "Write a Python function that takes a list of integers and returns the sum of all even numbers. Include docstring and type hints." +- **Medium (1.5)**: Mostly clear but may need minor assumptions + - Example: "Write a function to sum even numbers in a list." +- **Low (0.5)**: Requires significant external context or assumptions + - Example: "Do that thing with the numbers." + +## 2. Consistency (一致性) +**Definition**: Are all parts of the instruction aligned without contradictions? + +**Scoring**: +- **High (2.5)**: Perfectly consistent throughout + - Example: "Write a formal academic essay on climate change using APA citation style and maintain a professional tone." +- **Medium (1.5)**: Minor inconsistencies that don't fundamentally conflict + - Example: "Write a casual blog post but use academic references." +- **Low (0.5)**: Major contradictions + - Example: "Write a 500-word essay in under 100 words." + +## 3. Specificity (具体性) +**Definition**: Is the instruction concrete and unambiguous? + +**Scoring**: +- **High (2.5)**: Very specific with clear success criteria + - Example: "Generate exactly 5 creative product names for an eco-friendly water bottle. Each name should be 2-3 words and include at least one nature-related term." +- **Medium (1.5)**: Somewhat specific but allows interpretation + - Example: "Generate some creative names for a water bottle." +- **Low (0.5)**: Vague and ambiguous + - Example: "Make something cool." + +## 4. Completeness (完整性) +**Definition**: Does the instruction include all necessary information for task completion? + +**Scoring**: +- **High (2.5)**: All required elements specified (input, output, constraints, format) + - Example: "Given a JSON file with user data, extract all email addresses, validate them using regex, and output to a CSV file with columns: name, email, valid_status." +- **Medium (1.5)**: Most elements present but some details missing + - Example: "Extract email addresses from a file and validate them." +- **Low (0.5)**: Critical information missing + - Example: "Process the data." 
+ +# Scoring System +- **Total Score**: 0-10 (sum of all four dimensions, each worth 2.5 points) +- **Threshold**: Default 6.0 (instructions below this score are considered unclear) + +# Output Format +Return JSON only: +```json +{ + "score": 8.5, + "dimensions": { + "self_descriptiveness": 2.5, + "consistency": 2.0, + "specificity": 2.0, + "completeness": 2.0 + }, + "issues": [], + "strengths": ["Clear task definition", "Well-specified output format"], + "suggestions": ["Could specify tone/style more explicitly"], + "reason": "High-quality instruction with clear task definition and well-specified constraints. Minor improvement: explicitly specify the desired tone." +} +``` + +# Important Rules +1. Be strict but fair - real-world instructions aren't always perfect +2. Focus on whether the instruction enables successful task completion +3. Consider the instruction type (creative tasks may be intentionally open-ended) +4. Empty or extremely vague instructions should score 0-2 +5. Professional SFT-quality instructions should score 7+ + +# Examples + +**Example 1 - Excellent Instruction (Score: 9.5)** +Input: "Write a Python function named `calculate_discount` that takes two parameters: original_price (float) and discount_percentage (float, 0-100). Return the final price after applying the discount, rounded to 2 decimal places. Include input validation to ensure prices are positive and discounts are between 0-100. Add comprehensive docstring with examples." + +Output: +```json +{ + "score": 9.5, + "dimensions": { + "self_descriptiveness": 2.5, + "consistency": 2.5, + "specificity": 2.5, + "completeness": 2.0 + }, + "issues": [], + "strengths": [ + "Specific function name and parameters", + "Clear input/output specifications", + "Validation requirements specified", + "Format requirements (rounding) included" + ], + "suggestions": [ + "Could specify return type for type hints" + ], + "reason": "Excellent instruction with comprehensive details. Only minor omission: explicit mention of return type annotation." +} +``` + +**Example 2 - Poor Instruction (Score: 2.0)** +Input: "写个代码" + +Output: +```json +{ + "score": 2.0, + "dimensions": { + "self_descriptiveness": 0.5, + "consistency": 0.5, + "specificity": 0.5, + "completeness": 0.5 + }, + "issues": [ + "No programming language specified", + "No task description", + "No input/output specification", + "No context provided" + ], + "strengths": [], + "suggestions": [ + "Specify programming language", + "Describe what the code should do", + "Define expected inputs and outputs", + "Add any constraints or requirements" + ], + "reason": "Extremely vague instruction lacking all essential information. Impossible to determine what code is needed without additional context." +} +``` + +**Example 3 - Moderate Instruction (Score: 6.5)** +Input: "Summarize this article in a few sentences." + +Output: +```json +{ + "score": 6.5, + "dimensions": { + "self_descriptiveness": 1.5, + "consistency": 2.0, + "specificity": 1.5, + "completeness": 1.5 + }, + "issues": [ + "'Few sentences' is ambiguous (2? 3? 5?)", + "No article content provided (assumes context)", + "No specification of summary style/focus" + ], + "strengths": [ + "Clear task (summarization)", + "No internal contradictions" + ], + "suggestions": [ + "Specify exact number of sentences (e.g., '3-5 sentences')", + "Include the article content or reference", + "Optionally specify summary focus (key findings, main argument, etc.)" + ], + "reason": "Decent instruction with clear intent but lacks precision. 
Needs more specific constraints and assumes article context is available." +} +``` + +# Now evaluate this instruction: +""" + + @classmethod + def process_response(cls, response: str) -> EvalDetail: + """处理 LLM 响应并生成评估结果""" + import json + + log.info(f"LLM Response: {response}") + result = EvalDetail(metric=cls.__name__) + + try: + # 解析 JSON 响应 + # 移除可能的 markdown 代码块标记 + response = response.strip() + if response.startswith("```json"): + response = response[7:] + if response.startswith("```"): + response = response[3:] + if response.endswith("```"): + response = response[:-3] + response = response.strip() + + parsed = json.loads(response) + + # 提取分数和维度信息 + score = float(parsed.get("score", 0)) + dimensions = parsed.get("dimensions", {}) + issues = parsed.get("issues", []) + strengths = parsed.get("strengths", []) + suggestions = parsed.get("suggestions", []) + reason = parsed.get("reason", "") + + # 构建详细的 reason + detailed_reason = f"指令清晰度评分: {score}/10\n\n" + detailed_reason += "维度得分:\n" + detailed_reason += f" - 自描述性: {dimensions.get('self_descriptiveness', 0)}/2.5\n" + detailed_reason += f" - 一致性: {dimensions.get('consistency', 0)}/2.5\n" + detailed_reason += f" - 具体性: {dimensions.get('specificity', 0)}/2.5\n" + detailed_reason += f" - 完整性: {dimensions.get('completeness', 0)}/2.5\n\n" + + if strengths: + detailed_reason += "优点:\n" + for s in strengths: + detailed_reason += f" ✓ {s}\n" + detailed_reason += "\n" + + if issues: + detailed_reason += "问题:\n" + for i in issues: + detailed_reason += f" ✗ {i}\n" + detailed_reason += "\n" + + if suggestions: + detailed_reason += "改进建议:\n" + for s in suggestions: + detailed_reason += f" → {s}\n" + detailed_reason += "\n" + + detailed_reason += f"总结: {reason}" + + # 设置结果 + result.score = score + result.reason = [detailed_reason] + + # 判断是否通过(默认阈值 6.0) + threshold = 6.0 + if hasattr(cls, 'dynamic_config') and cls.dynamic_config.parameters: + threshold = cls.dynamic_config.parameters.get('threshold', 6.0) + + if score >= threshold: + result.status = False + result.label = ["QUALITY_GOOD.INSTRUCTION_CLARITY_PASS"] + else: + result.status = True + result.label = ["QUALITY_BAD.INSTRUCTION_CLARITY_FAIL"] + + except json.JSONDecodeError as e: + log.error(f"Failed to parse JSON response: {e}") + result.status = True + result.score = 0 + result.label = ["QUALITY_BAD.INSTRUCTION_CLARITY_ERROR"] + result.reason = [f"评估失败: JSON 解析错误 - {str(e)}"] + except Exception as e: + log.error(f"Error processing response: {e}") + result.status = True + result.score = 0 + result.label = ["QUALITY_BAD.INSTRUCTION_CLARITY_ERROR"] + result.reason = [f"评估失败: {str(e)}"] + + return result diff --git a/dingo/model/llm/instruction_quality/llm_task_difficulty.py b/dingo/model/llm/instruction_quality/llm_task_difficulty.py new file mode 100644 index 00000000..bbe1b959 --- /dev/null +++ b/dingo/model/llm/instruction_quality/llm_task_difficulty.py @@ -0,0 +1,350 @@ +""" +Task Difficulty Evaluator - 任务难度评估器 + +Based on recent research: +- Measuring Difficulty of Math Problems (OpenAI, 2024) +- Task Complexity in Instruction Following (Google DeepMind, 2023) +- Self-Instruct: Aligning Language Models with Self-Generated Instructions (2023) + +评估维度: +1. Cognitive Complexity: 认知复杂度 +2. Step Complexity: 步骤复杂度 +3. Domain Knowledge: 领域知识要求 +4. 
Constraint Density: 约束条件密度 +""" + +from dingo.io.output.eval_detail import EvalDetail +from dingo.model import Model +from dingo.model.llm.base_openai import BaseOpenAI +from dingo.utils import log + + +@Model.llm_register("LLMTaskDifficulty") +class LLMTaskDifficulty(BaseOpenAI): + """ + LLM-based task difficulty evaluator + + 评估任务的难度级别,包括: + - 认知复杂度:需要的推理深度 + - 步骤复杂度:任务分解的复杂程度 + - 领域知识:专业知识要求 + - 约束密度:限制条件的数量和复杂性 + """ + + # Metadata for documentation generation + _metric_info = { + "category": "SFT Data Assessment Metrics", + "quality_dimension": "TASK_DIFFICULTY", + "metric_name": "LLMTaskDifficulty", + "description": "Evaluates task difficulty across cognitive complexity, step complexity, domain knowledge, and constraint density", + "paper_source": "OpenAI Math Problem Difficulty (2024), Google DeepMind Task Complexity (2023)", + "evaluation_results": "Returns difficulty level (1-10) with detailed breakdown" + } + + prompt = """ +# Role +You are an expert in assessing task complexity and difficulty for LLM training data evaluation. + +# Task +Evaluate the difficulty level of the given instruction across four dimensions. + +# Evaluation Dimensions + +## 1. Cognitive Complexity (认知复杂度) - Weight: 30% +**Definition**: Mental processing depth required to complete the task. + +Based on Bloom's Taxonomy: +- **Level 1-2 (Simple)**: Remember, Understand + - Example: "Define photosynthesis." (Score: 1.5/3.0) + - Requires recall or basic comprehension + +- **Level 3-4 (Moderate)**: Apply, Analyze + - Example: "Compare and contrast mitosis and meiosis, explaining their biological significance." (Score: 2.0/3.0) + - Requires application of knowledge or analytical thinking + +- **Level 5-6 (Complex)**: Evaluate, Create + - Example: "Design a novel experimental protocol to test the efficacy of a new drug compound, considering ethical constraints, statistical power, and cost-effectiveness." (Score: 3.0/3.0) + - Requires synthesis, evaluation, or creation of new knowledge + +**Scoring**: 0.0-3.0 points + +## 2. Step Complexity (步骤复杂度) - Weight: 30% +**Definition**: Number and interdependency of steps required. + +**Scoring**: +- **Simple (0.5-1.0)**: Single-step task + - Example: "Translate '你好' to English." + - 1 step: direct translation + +- **Moderate (1.5-2.0)**: Multi-step with linear dependency + - Example: "Calculate the area of a circle with radius 5, then find what percentage it is of a square with side 15." + - Steps: Calculate circle area → Calculate square area → Compute percentage + +- **Complex (2.5-3.0)**: Multi-step with branching logic or loops + - Example: "Write a program that recursively traverses a file system, identifies all Python files, runs linting on each, aggregates results by error type, and generates a ranked report of most common issues." + - Steps: Recursive traversal + Conditional filtering + External tool execution + Data aggregation + Sorting + Report generation + +**Scoring**: 0.0-3.0 points + +## 3. Domain Knowledge (领域知识要求) - Weight: 20% +**Definition**: Specialized knowledge required beyond general education. + +**Scoring**: +- **General (0.5-0.7)**: Common knowledge + - Example: "Write a recipe for chocolate chip cookies." + +- **Specialized (1.0-1.5)**: Professional or technical knowledge + - Example: "Explain how OAuth 2.0 authorization code flow works with PKCE extension." 
+ +- **Expert (1.5-2.0)**: Deep domain expertise required + - Example: "Derive the Navier-Stokes equations from first principles and discuss conditions for existence of smooth solutions in 3D." + +**Scoring**: 0.0-2.0 points + +## 4. Constraint Density (约束条件密度) - Weight: 20% +**Definition**: Number and strictness of constraints/requirements. + +**Scoring**: +- **Low (0.5-0.7)**: 0-2 constraints, flexible + - Example: "Write a story about a cat." + +- **Medium (1.0-1.5)**: 3-5 constraints, some strictness + - Example: "Write a 500-word story about a cat, set in Victorian London, with a mystery plot." + +- **High (1.5-2.0)**: 6+ constraints, very strict + - Example: "Write exactly 500 words (+/- 10 words) story about a black cat named Midnight, set in 1890s London, mystery genre, must include: a pocket watch, a letter, and a twist ending, maintain past tense, use British English spelling, target audience: young adults." + +**Scoring**: 0.0-2.0 points + +# Total Difficulty Score +- **Score Range**: 0-10 (sum of weighted scores) +- **Difficulty Levels**: + - 0-3: Easy (适合快速蒸馏的简单任务) + - 4-6: Moderate (标准 SFT 任务) + - 7-8: Hard (高质量复杂任务) + - 9-10: Expert (需要专家级能力的任务) + +# Output Format +Return JSON only: +```json +{ + "difficulty_score": 7.5, + "difficulty_level": "Hard", + "dimensions": { + "cognitive_complexity": 2.5, + "step_complexity": 2.0, + "domain_knowledge": 1.5, + "constraint_density": 1.5 + }, + "estimated_time": "10-20 minutes", + "suitable_for": ["Advanced fine-tuning", "Expert model training"], + "key_challenges": [ + "Requires multi-step reasoning", + "Needs domain expertise in X", + "Multiple strict constraints" + ], + "reason": "This is a hard task requiring advanced reasoning and domain knowledge..." +} +``` + +# Important Notes +1. Consider the realistic capability of current LLMs +2. A task is only "Expert" level if it challenges even GPT-4 level models +3. Don't confuse verbosity with difficulty - a long simple task is still simple +4. Open-ended creative tasks can still be difficult if they require skill/expertise + +# Examples + +**Example 1 - Easy Task (Score: 2.5)** +Input: "将'Hello World'翻译成法语。" + +Output: +```json +{ + "difficulty_score": 2.5, + "difficulty_level": "Easy", + "dimensions": { + "cognitive_complexity": 1.0, + "step_complexity": 0.5, + "domain_knowledge": 0.5, + "constraint_density": 0.5 + }, + "estimated_time": "< 1 minute", + "suitable_for": ["Basic fine-tuning", "Quick knowledge distillation"], + "key_challenges": [], + "reason": "Simple single-step translation task requiring only basic language knowledge. No complex reasoning or constraints." +} +``` + +**Example 2 - Moderate Task (Score: 5.5)** +Input: "编写一个Python函数,接受一个整数列表,返回列表中所有质数的和。包含错误处理和单元测试。" + +Output: +```json +{ + "difficulty_score": 5.5, + "difficulty_level": "Moderate", + "dimensions": { + "cognitive_complexity": 2.0, + "step_complexity": 1.5, + "domain_knowledge": 1.0, + "constraint_density": 1.0 + }, + "estimated_time": "5-10 minutes", + "suitable_for": ["Standard SFT", "Code generation training"], + "key_challenges": [ + "Requires algorithm knowledge (prime checking)", + "Multiple components (function + error handling + tests)", + "Need to consider edge cases" + ], + "reason": "Moderate coding task requiring algorithm knowledge and multiple components. Needs understanding of prime numbers, error handling, and unit testing, but within standard programming curriculum." 
+} +``` + +**Example 3 - Hard Task (Score: 8.0)** +Input: "设计一个分布式系统架构,支持每秒10万次请求,保证99.99%可用性,具有水平扩展能力。需要包括:1)服务拆分方案 2)数据一致性策略 3)故障恢复机制 4)性能监控方案。画出架构图并详细说明每个组件的职责和交互方式。考虑CAP定理的权衡。" + +Output: +```json +{ + "difficulty_score": 8.0, + "difficulty_level": "Hard", + "dimensions": { + "cognitive_complexity": 2.5, + "step_complexity": 2.5, + "domain_knowledge": 1.5, + "constraint_density": 1.5 + }, + "estimated_time": "30-60 minutes", + "suitable_for": ["Expert model training", "Architecture knowledge evaluation"], + "key_challenges": [ + "Requires deep distributed systems knowledge", + "Multi-dimensional problem with trade-offs (CAP theorem)", + "Multiple strict requirements (throughput, availability)", + "Complex deliverables (architecture diagram + detailed explanation)", + "Need to balance multiple concerns simultaneously" + ], + "reason": "Hard system design task requiring expert-level distributed systems knowledge. Involves multiple complex constraints, trade-off analysis, and requires synthesis of knowledge across several domains (scalability, consistency, reliability). The task demands creating a comprehensive solution with multiple interdependent components." +} +``` + +**Example 4 - Expert Task (Score: 9.5)** +Input: "Prove or disprove: For any continuous function f: [0,1] → ℝ satisfying ∫₀¹ f(x)² dx < ∞, there exists a sequence of polynomials {pₙ} such that ||f - pₙ||₂ → 0 as n → ∞. Provide rigorous proof using measure theory and functional analysis. Discuss the rate of convergence and relate your findings to Weierstrass approximation theorem." + +Output: +```json +{ + "difficulty_score": 9.5, + "difficulty_level": "Expert", + "dimensions": { + "cognitive_complexity": 3.0, + "step_complexity": 2.5, + "domain_knowledge": 2.0, + "constraint_density": 2.0 + }, + "estimated_time": "1-2 hours", + "suitable_for": ["Research-level model training", "Mathematical reasoning evaluation"], + "key_challenges": [ + "Requires graduate-level mathematics", + "Need rigorous proof construction", + "Multiple advanced mathematical concepts (measure theory, functional analysis)", + "Requires connecting multiple theorems", + "Demanding formal rigor and precision" + ], + "reason": "Expert-level mathematical task requiring graduate mathematics knowledge. Demands rigorous proof construction, deep understanding of measure theory and functional analysis, and ability to connect advanced concepts. This would challenge even specialized mathematical AI systems." 
+} +``` + +# Now evaluate this instruction: +""" + + @classmethod + def process_response(cls, response: str) -> EvalDetail: + """处理 LLM 响应并生成评估结果""" + import json + + log.info(f"LLM Response: {response}") + result = EvalDetail(metric=cls.__name__) + + try: + # 解析 JSON 响应 + response = response.strip() + if response.startswith("```json"): + response = response[7:] + if response.startswith("```"): + response = response[3:] + if response.endswith("```"): + response = response[:-3] + response = response.strip() + + parsed = json.loads(response) + + # 提取信息 + difficulty_score = float(parsed.get("difficulty_score", 0)) + difficulty_level = parsed.get("difficulty_level", "Unknown") + dimensions = parsed.get("dimensions", {}) + estimated_time = parsed.get("estimated_time", "Unknown") + suitable_for = parsed.get("suitable_for", []) + key_challenges = parsed.get("key_challenges", []) + reason = parsed.get("reason", "") + + # 构建详细的 reason + detailed_reason = f"任务难度评分: {difficulty_score}/10 ({difficulty_level})\n\n" + detailed_reason += "维度得分:\n" + detailed_reason += f" - 认知复杂度: {dimensions.get('cognitive_complexity', 0)}/3.0\n" + detailed_reason += f" - 步骤复杂度: {dimensions.get('step_complexity', 0)}/3.0\n" + detailed_reason += f" - 领域知识: {dimensions.get('domain_knowledge', 0)}/2.0\n" + detailed_reason += f" - 约束密度: {dimensions.get('constraint_density', 0)}/2.0\n\n" + + detailed_reason += f"预计耗时: {estimated_time}\n\n" + + if suitable_for: + detailed_reason += "适用场景:\n" + for s in suitable_for: + detailed_reason += f" • {s}\n" + detailed_reason += "\n" + + if key_challenges: + detailed_reason += "关键挑战:\n" + for c in key_challenges: + detailed_reason += f" ⚠ {c}\n" + detailed_reason += "\n" + + detailed_reason += f"总结: {reason}" + + # 设置结果 + result.score = difficulty_score + result.reason = [detailed_reason] + + # 难度评估没有"通过/不通过"的概念,只是描述性的 + # 但为了兼容框架,我们设置一个合理的默认行为 + # 可以通过 parameters 配置 min_difficulty 和 max_difficulty + result.status = False # 默认不标记为问题 + result.label = [f"TASK_DIFFICULTY.{difficulty_level.upper()}"] + + # 如果配置了难度范围要求,进行检查 + if hasattr(cls, 'dynamic_config') and cls.dynamic_config.parameters: + min_difficulty = cls.dynamic_config.parameters.get('min_difficulty', 0) + max_difficulty = cls.dynamic_config.parameters.get('max_difficulty', 10) + + if difficulty_score < min_difficulty: + result.status = True + result.label = ["QUALITY_BAD.TASK_TOO_EASY"] + elif difficulty_score > max_difficulty: + result.status = True + result.label = ["QUALITY_BAD.TASK_TOO_HARD"] + + except json.JSONDecodeError as e: + log.error(f"Failed to parse JSON response: {e}") + result.status = True + result.score = 0 + result.label = ["QUALITY_BAD.TASK_DIFFICULTY_ERROR"] + result.reason = [f"评估失败: JSON 解析错误 - {str(e)}"] + except Exception as e: + log.error(f"Error processing response: {e}") + result.status = True + result.score = 0 + result.label = ["QUALITY_BAD.TASK_DIFFICULTY_ERROR"] + result.reason = [f"评估失败: {str(e)}"] + + return result diff --git a/docs/instruction_quality_guide.md b/docs/instruction_quality_guide.md new file mode 100644 index 00000000..eb1b08d1 --- /dev/null +++ b/docs/instruction_quality_guide.md @@ -0,0 +1,478 @@ +# Instruction Quality Evaluation Guide - 指令质量评估指南 + +## 🎯 概述 + +本指南介绍如何使用 Dingo 的指令质量评估功能,用于评估 SFT(Supervised Fine-Tuning)数据集中 query/instruction 的质量。这对于知识蒸馏、指令微调数据准备至关重要。 + +### ✨ 新增评估指标 + +基于最新研究成果,我们提供了两个核心评估指标: + +| 指标 | 评估内容 | 研究基础 | 评分范围 | +|------|---------|---------|---------| +| **Instruction Clarity
指令清晰度** | 自描述性、一致性、具体性、完整性 | IFEval (Google, 2023)<br>Self-Instruct (UW, 2023) | 0-10 |
+| **Task Difficulty<br>任务难度** | 认知复杂度、步骤复杂度、领域知识、约束密度 | Task Complexity (DeepMind, 2023)<br>OpenAI Math Problem Difficulty (2024) | 0-10 |
+
+---
+
+## 📊 指标详解
+
+### 1️⃣ Instruction Clarity(指令清晰度)
+
+**评估目标**:衡量指令是否清晰、明确、易于理解和执行。
+
+#### 评估维度(总分 10 分)
+
+| 维度 | 分值 | 评估内容 |
+|------|------|---------|
+| **Self-Descriptiveness<br>自描述性** | 2.5 | 指令是否包含足够信息,无需额外上下文 |
+| **Consistency<br>一致性** | 2.5 | 指令内部是否一致,无矛盾 |
+| **Specificity<br>具体性** | 2.5 | 指令是否具体明确,避免歧义 |
+| **Completeness<br>完整性** | 2.5 | 指令是否包含所有必要信息(输入、输出、约束、格式) |
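+
+下面给出一个最小示意(仅为示意,非正式实现,假设你已按下文「输出格式」拿到并解析了评估器返回的 JSON):四个维度分数直接加总即为 0-10 的清晰度总分,再按阈值筛选:
+
+```python
+# 最小示意:由维度得分计算清晰度总分,并按阈值判断是否通过
+# 假设 clarity_result 是解析后的评估 JSON(字段名见下文输出格式)
+clarity_result = {
+    "score": 8.5,
+    "dimensions": {
+        "self_descriptiveness": 2.5,
+        "consistency": 2.0,
+        "specificity": 2.0,
+        "completeness": 2.0,
+    },
+}
+
+total = sum(clarity_result["dimensions"].values())  # 每个维度满分 2.5,总分 0-10
+assert abs(total - clarity_result["score"]) < 1e-6
+
+THRESHOLD = 6.0  # 默认阈值,可通过 parameters.threshold 调整
+print(f"clarity={total:.1f}, pass={total >= THRESHOLD}")
+```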
+
+#### 评分标准
+
+**优秀 (8-10 分)**:
+- ✅ 自包含,无需额外说明
+- ✅ 内部完全一致
+- ✅ 非常具体,有明确成功标准
+- ✅ 包含所有必要元素
+
+**良好 (6-8 分)**:
+- ⚠️ 大部分清晰,个别细节需推断
+- ✅ 基本一致,有轻微模糊
+- ⚠️ 较具体但允许一定解释空间
+- ⚠️ 大部分信息齐全,个别细节缺失
+
+**及格 (4-6 分)**:
+- ⚠️ 需要一定上下文理解
+- ⚠️ 有一些不一致之处
+- ⚠️ 比较模糊,解释空间较大
+- ⚠️ 缺少重要信息
+
+**不合格 (0-4 分)**:
+- ❌ 严重依赖外部上下文
+- ❌ 内部矛盾
+- ❌ 过于模糊,难以理解意图
+- ❌ 关键信息缺失
+
+#### 示例
+
+**优秀示例(9.5 分)**:
+```
+编写一个 Python 函数 calculate_discount,接受参数:
+- original_price (float): 原价
+- discount_percentage (float, 0-100): 折扣百分比
+返回应用折扣后的最终价格,保留 2 位小数。
+包含输入验证:价格必须为正,折扣在 0-100 之间。
+添加详细 docstring 和使用示例。
+```
+
+**不合格示例(2.0 分)**:
+```
+写个代码
+```
+
+---
+
+### 2️⃣ Task Difficulty(任务难度)
+
+**评估目标**:衡量任务的复杂度和挑战性,用于数据集平衡和质量控制。
+
+#### 评估维度(总分 10 分)
+
+| 维度 | 权重 | 分值 | 评估内容 |
+|------|------|------|---------|
+| **Cognitive Complexity<br>认知复杂度** | 30% | 3.0 | 基于 Bloom 分类法的认知层次(记忆→理解→应用→分析→评估→创造) |
+| **Step Complexity<br>步骤复杂度** | 30% | 3.0 | 任务步骤数量及依赖关系(单步 vs 多步 vs 递归/分支) |
+| **Domain Knowledge<br>领域知识** | 20% | 2.0 | 所需专业知识程度(常识 vs 专业知识 vs 专家知识) |
+| **Constraint Density<br>约束密度** | 20% | 2.0 | 约束条件的数量和严格程度 |
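+
+同样给出一个最小示意(仅为示意,非正式实现,假设已解析评估器返回的 JSON):四个维度分数加总即为 0-10 的难度总分,并可按下文「难度级别」表映射到级别标签:
+
+```python
+# 最小示意:由维度得分计算难度总分,并映射到难度级别(阈值见下文「难度级别」表)
+difficulty_result = {
+    "dimensions": {
+        "cognitive_complexity": 2.5,  # 满分 3.0
+        "step_complexity": 2.0,       # 满分 3.0
+        "domain_knowledge": 1.5,      # 满分 2.0
+        "constraint_density": 1.5,    # 满分 2.0
+    },
+}
+
+score = sum(difficulty_result["dimensions"].values())  # 0-10
+
+
+def difficulty_level(score: float) -> str:
+    """按难度级别表把总分映射为级别标签。"""
+    if score <= 3:
+        return "Easy"
+    if score <= 6:
+        return "Moderate"
+    if score <= 8:
+        return "Hard"
+    return "Expert"
+
+
+print(score, difficulty_level(score))  # 7.5 Hard
+```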
+
+#### 难度级别
+
+| 级别 | 分数范围 | 特征 | 适用场景 |
+|------|---------|------|---------|
+| **Easy<br>简单** | 0-3 | 单步、常识、少约束 | 快速知识蒸馏、基础训练 |
+| **Moderate<br>中等** | 4-6 | 多步、专业知识、中等约束 | 标准 SFT 训练 |
+| **Hard<br>困难** | 7-8 | 复杂推理、专家知识、严格约束 | 高质量模型训练 |
+| **Expert
专家** | 9-10 | 深度推理、前沿知识、多重约束 | 专家能力评估 | + +#### 示例 + +**简单任务(2.5 分)**: +``` +将 'Hello World' 翻译成法语 +``` +- 认知:记忆级别 +- 步骤:单步 +- 知识:基础语言知识 +- 约束:无 + +**中等任务(5.5 分)**: +``` +编写 Python 函数求列表中所有质数的和,包含错误处理和单元测试 +``` +- 认知:应用+分析 +- 步骤:多步(质数判断 + 求和 + 错误处理 + 测试) +- 知识:算法基础 +- 约束:多个组件要求 + +**困难任务(8.0 分)**: +``` +设计分布式系统架构,支持 10万 QPS,99.99% 可用性。 +包括服务拆分、数据一致性、故障恢复、监控方案。 +考虑 CAP 定理权衡,画出架构图并详细说明。 +``` +- 认知:评估+创造 +- 步骤:复杂多步,相互依赖 +- 知识:深度专业知识 +- 约束:多个严格性能指标 + +**专家任务(9.5 分)**: +``` +证明或反驳:对于任意满足 ∫₀¹ f(x)² dx < ∞ 的连续函数 f: [0,1] → ℝ, +存在多项式序列 {pₙ} 使得 ||f - pₙ||₂ → 0。 +使用测度论和泛函分析提供严格证明。 +``` +- 认知:创造(构造证明) +- 步骤:高度复杂的逻辑链 +- 知识:研究生级数学 +- 约束:严格的数学证明要求 + +--- + +## 🚀 使用方法 + +### 安装 + +确保已安装 Dingo: + +```bash +pip install dingo-python +``` + +### 环境配置 + +```bash +export OPENAI_API_KEY="your-api-key" +export OPENAI_BASE_URL="https://api.deepseek.com" # 可选 +export OPENAI_MODEL="deepseek-chat" # 可选 +``` + +### 基础使用 + +#### 1. 准备数据 + +创建 JSONL 文件(`instructions.jsonl`): + +```jsonl +{"instruction": "Write a Python function to calculate factorial"} +{"instruction": "写个代码"} +{"instruction": "Design a microservices architecture..."} +``` + +#### 2. 评估指令清晰度 + +```python +from dingo.config import InputArgs +from dingo.exec import Executor +from dingo.model.llm.instruction_quality import LLMInstructionClarity + +input_data = { + "task_name": "clarity_check", + "input_path": "instructions.jsonl", + "output_path": "outputs/", + "dataset": {"source": "local", "format": "jsonl"}, + "executor": { + "max_workers": 5, + "result_save": {"bad": True, "good": True} + }, + "evaluator": [ + { + "fields": {"content": "instruction"}, + "evals": [ + { + "name": "LLMInstructionClarity", + "config": { + "model": "deepseek-chat", + "key": "your-api-key", + "api_url": "https://api.deepseek.com", + "parameters": {"threshold": 6.0} + } + } + ] + } + ] +} + +input_args = InputArgs(**input_data) +executor = Executor.exec_map["local"](input_args) +summary = executor.execute() + +print(f"清晰指令: {summary.num_good}/{summary.total}") +``` + +#### 3. 评估任务难度 + +```python +{ + "evals": [ + { + "name": "LLMTaskDifficulty", + "config": { + "model": "deepseek-chat", + "key": "your-api-key", + "api_url": "https://api.deepseek.com", + "parameters": { + "min_difficulty": 3.0, # 可选:过滤太简单的 + "max_difficulty": 8.0, # 可选:过滤太难的 + } + } + } + ] +} +``` + +#### 4. 综合评估 + +```python +{ + "evals": [ + { + "name": "LLMInstructionClarity", + "config": {...} + }, + { + "name": "LLMTaskDifficulty", + "config": {...} + } + ] +} +``` + +### 快速开始脚本 + +我们提供了完整的示例脚本: + +```bash +# 只评估清晰度 +python examples/custom/evaluate_instruction_quality.py clarity + +# 只评估难度 +python examples/custom/evaluate_instruction_quality.py difficulty + +# 综合评估(推荐) +python examples/custom/evaluate_instruction_quality.py both + +# 分析难度分布(用于数据集平衡) +python examples/custom/evaluate_instruction_quality.py distribution +``` + +--- + +## 📈 实践建议 + +### 1. SFT 数据准备流程 + +``` +原始指令 + ↓ +① 清晰度筛选 (threshold=6.0) + ↓ +清晰的指令 + ↓ +② 难度评估 + ↓ +③ 难度分布平衡 + ↓ +高质量 SFT 数据集 +``` + +### 2. 数据集质量标准 + +**优秀 SFT 数据集**: +- ✅ 95%+ 指令清晰度 ≥ 6.0 +- ✅ 难度分布合理: + - Easy (0-3): 15-20% + - Moderate (4-6): 50-60% + - Hard (7-8): 20-25% + - Expert (9-10): 5-10% + +### 3. 常见问题处理 + +**问题1: 过多简单指令** +```python +# 设置最低难度阈值 +"parameters": {"min_difficulty": 3.0} +``` + +**问题2: 指令模糊不清** +```python +# 提高清晰度要求 +"parameters": {"threshold": 7.0} +``` + +**问题3: 难度分布不均** +- 使用 `distribution` 模式分析当前分布 +- 针对性补充缺失难度级别的数据 +- 移除过多的某一难度级别数据 + +### 4. 
成本优化 + +**大规模数据(> 10万条)**: +```python +# 方案1: 先用规则快速筛选基础质量 +"evals": [ + {"name": "RuleContentNull"}, # 过滤空指令 + {"name": "RuleSpecialCharacter"}, # 过滤异常字符 +] + +# 方案2: 对筛选后的数据进行深度评估 +"evals": [ + {"name": "LLMInstructionClarity"}, + {"name": "LLMTaskDifficulty"} +] +``` + +**中等规模(1万-10万条)**: +```python +# 降低并发,避免 API 限流 +"max_workers": 5, +``` + +**小规模(< 1万条)**: +```python +# 可以更高并发 +"max_workers": 10, +``` + +--- + +## 🔬 研究基础 + +### 学术参考 + +1. **IFEval: Instruction Following Evaluation** + - Google Research, 2023 + - 提出了系统化的指令遵循评估框架 + +2. **Self-Instruct: Aligning Language Models with Self-Generated Instructions** + - University of Washington, 2023 + - 指令质量对模型性能的影响研究 + +3. **Task Complexity in Instruction Following** + - Google DeepMind, 2023 + - 任务复杂度的多维度分析框架 + +4. **Measuring Difficulty of Math Problems** + - OpenAI, 2024 + - 任务难度的量化评估方法 + +### 评估原则 + +1. **基于 Bloom 认知分类法**:从记忆到创造的六个层次 +2. **考虑实际 LLM 能力**:难度评估要符合当前模型水平 +3. **多维度综合评分**:避免单一维度的片面性 +4. **严格但公允**:现实世界的指令不会完美 + +--- + +## 📊 输出格式 + +### 清晰度评估输出 + +```json +{ + "score": 8.5, + "dimensions": { + "self_descriptiveness": 2.5, + "consistency": 2.0, + "specificity": 2.0, + "completeness": 2.0 + }, + "issues": [], + "strengths": ["Clear task definition", "Well-specified output format"], + "suggestions": ["Could specify tone/style more explicitly"], + "reason": "High-quality instruction..." +} +``` + +### 难度评估输出 + +```json +{ + "difficulty_score": 7.5, + "difficulty_level": "Hard", + "dimensions": { + "cognitive_complexity": 2.5, + "step_complexity": 2.0, + "domain_knowledge": 1.5, + "constraint_density": 1.5 + }, + "estimated_time": "10-20 minutes", + "suitable_for": ["Advanced fine-tuning"], + "key_challenges": ["Requires multi-step reasoning"], + "reason": "This is a hard task..." +} +``` + +--- + +## 💡 常见问题 + +### Q1: 如何确定清晰度阈值? + +**建议**: +- 基础训练:threshold = 5.0(宽松) +- 标准 SFT:threshold = 6.0(平衡) +- 高质量数据:threshold = 7.0(严格) + +### Q2: 难度分布应该如何设置? + +**推荐分布**: +- 知识蒸馏:Easy 30%, Moderate 50%, Hard 20% +- 通用 SFT:Easy 20%, Moderate 50%, Hard 25%, Expert 5% +- 专家训练:Moderate 30%, Hard 50%, Expert 20% + +### Q3: 评估速度慢怎么办? + +1. 降低并发数(避免限流) +2. 使用更快的 LLM(如 GPT-4o-mini) +3. 对关键数据进行抽样评估 +4. 先用规则筛选再用 LLM 深度评估 + +### Q4: 如何处理非英文指令? + +两个评估器都支持多语言(中文、英文等),LLM 会根据指令语言进行评估。 + +### Q5: 评估结果如何应用到数据筛选? + +```python +# 读取评估结果 +bad_clarity = "outputs/instruction_clarity/bad/bad.jsonl" # 不清晰的 +good_difficulty = "outputs/task_difficulty/good/good.jsonl" # 所有难度评估 + +# 根据结果筛选: +# - 移除 clarity < 6.0 的指令 +# - 平衡各难度级别的数量 +# - 优先保留 clarity ≥ 7.0 且 difficulty 在目标范围的指令 +``` + +--- + +## 📚 相关文档 + +- [RAG Evaluation Metrics Guide](rag_evaluation_metrics.md) +- [Hallucination Detection Guide](hallucination_detection_guide.md) +- [Text Quality Evaluation](../README.md#evaluation-metrics) + +--- + +## 🤝 贡献 + +如果您有改进建议或发现问题,欢迎: +- 提交 Issue +- 发起 Pull Request +- 加入我们的 Discord/WeChat 讨论 + +**Happy Evaluating! 
🎉**
diff --git a/docs/metrics.md b/docs/metrics.md
index caf4f23f..849ee6b9 100644
--- a/docs/metrics.md
+++ b/docs/metrics.md
@@ -32,6 +32,8 @@ This document provides comprehensive information about all quality metrics used
|------|--------|-------------|--------------|-------------------|
| `LLMFactCheckPublic` | LLMFactCheckPublic | Two-stage factuality evaluation pipeline from GPT-5 | [GPT-5 System Card](https://cdn.openai.com/pdf/8124a3ce-ab78-4f06-96eb-49ea29ffb52f/gpt5-system-card-aug7.pdf) (OpenAI) | N/A |
| `LLMHallucination` | LLMHallucination | Evaluates whether the response contains factual contradictions or hallucinations against provided context information | [TruthfulQA: Measuring How Models Mimic Human Falsehoods](https://arxiv.org/abs/2109.07958) (Lin et al., 2021) | N/A |
+| `LLMInstructionClarity` | LLMInstructionClarity | Evaluates instruction clarity across four dimensions: self-descriptiveness, consistency, specificity, and completeness | Internal Implementation | Returns clarity score (0-10) and detailed analysis |
+| `LLMTaskDifficulty` | LLMTaskDifficulty | Evaluates task difficulty across cognitive complexity, step complexity, domain knowledge, and constraint density | Internal Implementation | Returns difficulty level (1-10) with detailed breakdown |
| `LLMText3HHarmless` | LLMText3HHarmless | Checks if responses avoid harmful content, discriminatory language, and dangerous assistance | [Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback](https://arxiv.org/pdf/2204.05862) (Bai et al., 2022) | [📊 See Results](eval/prompt/qa_data_evaluated_by_3h.md) |
| `LLMText3HHelpful` | LLMText3HHelpful | Assesses if responses address questions directly and follow instructions appropriately | [Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback](https://arxiv.org/pdf/2204.05862) (Bai et al., 2022) | [📊 See Results](eval/prompt/qa_data_evaluated_by_3h.md) |
| `LLMText3HHonest` | LLMText3HHonest | Evaluates if responses provide accurate information without fabrication or deception | [Training a Helpful and Harmless Assistant with Reinforcement Learning from Human Feedback](https://arxiv.org/pdf/2204.05862) (Bai et al., 2022) | [📊 See Results](eval/prompt/qa_data_evaluated_by_3h.md) |
@@ -55,7 +57,7 @@ This document provides comprehensive information about all quality metrics used
| Type | Metric | Description | Paper Source | Evaluation Results |
|------|--------|-------------|--------------|-------------------|
| `QUALITY_BAD_COMPLETENESS` | RuleLineEndWithEllipsis, RuleLineEndWithTerminal, RuleSentenceNumber, RuleWordNumber | Checks whether the ratio of lines ending with ellipsis is below threshold; Checks whether the ratio of lines ending w... 
| [RedPajama: an Open Dataset for Training Large Language Models](https://github.com/togethercomputer/RedPajama-Data) (Together Computer, 2023) | [📊 See Results](eval/rule/slimpajama_data_evaluated_by_rule.md) | -| `QUALITY_BAD_EFFECTIVENESS` | RuleDoi, RuleIsbn, RuleAbnormalChar, RuleAbnormalHtml, RuleAlphaWords, RuleAudioDataFormat, RuleCharNumber, RuleColonEnd, RuleContentNull, RuleContentShort, RuleContentShortMultiLan, RuleEnterAndSpace, RuleEnterMore, RuleEnterRatioMore, RuleHtmlEntity, RuleHtmlTag, RuleInvisibleChar, RuleImageDataFormat, RuleLatexSpecialChar, RuleLineJavascriptCount, RuleLoremIpsum, RuleMeanWordLength, RuleNlpDataFormat, RuleSftDataFormat, RuleSpaceMore, RuleSpecialCharacter, RuleStopWord, RuleSymbolWordRatio, RuleVedioDataFormat, RuleOnlyUrl | Check whether the string is in the correct format of the doi; Check whether the string is in the correct format of th... | Internal Implementation | N/A | +| `QUALITY_BAD_EFFECTIVENESS` | RuleAbnormalChar, RuleAbnormalHtml, RuleAlphaWords, RuleAudioDataFormat, RuleCharNumber, RuleColonEnd, RuleContentNull, RuleContentShort, RuleContentShortMultiLan, RuleEnterAndSpace, RuleEnterMore, RuleEnterRatioMore, RuleHtmlEntity, RuleHtmlTag, RuleInvisibleChar, RuleImageDataFormat, RuleLatexSpecialChar, RuleLineJavascriptCount, RuleLoremIpsum, RuleMeanWordLength, RuleNlpDataFormat, RuleSftDataFormat, RuleSpaceMore, RuleSpecialCharacter, RuleStopWord, RuleSymbolWordRatio, RuleVedioDataFormat, RuleOnlyUrl, RuleDoi, RuleIsbn | Detects garbled text and anti-crawling characters by combining special character and invisible character detection; D... | [RedPajama: an Open Dataset for Training Large Language Models](https://github.com/togethercomputer/RedPajama-Data) (Together Computer, 2023) | [📊 See Results](eval/rule/slimpajama_data_evaluated_by_rule.md) | | `QUALITY_BAD_FLUENCY` | RuleAbnormalNumber, RuleCharSplit, RuleNoPunc, RuleWordSplit, RuleWordStuck | Checks PDF content for abnormal book page or index numbers that disrupt text flow; Checks PDF content for abnormal ch... | [RedPajama: an Open Dataset for Training Large Language Models](https://github.com/togethercomputer/RedPajama-Data) (Together Computer, 2023) | [📊 See Results](eval/rule/slimpajama_data_evaluated_by_rule.md) | | `QUALITY_BAD_RELEVANCE` | RuleHeadWordAr, RuleHeadWordCs, RuleHeadWordHu, RuleHeadWordKo, RuleHeadWordRu, RuleHeadWordSr, RuleHeadWordTh, RuleHeadWordVi, RulePatternSearch, RuleWatermark | Checks whether Arabic content contains irrelevant tail source information; Checks whether Czech content contains irre... | [RedPajama: an Open Dataset for Training Large Language Models](https://github.com/togethercomputer/RedPajama-Data) (Together Computer, 2023) | [📊 See Results](eval/rule/slimpajama_data_evaluated_by_rule.md) | | `QUALITY_BAD_SECURITY` | RuleIDCard, RuleUnsafeWords, RulePIIDetection | Checks whether content contains ID card information; Checks whether content contains unsafe words; Detects Personal I... | [RedPajama: an Open Dataset for Training Large Language Models](https://github.com/togethercomputer/RedPajama-Data) (Together Computer, 2023) | [📊 See Results](eval/rule/slimpajama_data_evaluated_by_rule.md) | diff --git a/examples/sft/evaluate_instruction_quality.py b/examples/sft/evaluate_instruction_quality.py new file mode 100644 index 00000000..c6910667 --- /dev/null +++ b/examples/sft/evaluate_instruction_quality.py @@ -0,0 +1,374 @@ +""" +SFT Instruction Quality Evaluation - 指令质量评估 + +评估 SFT 数据中 query/instruction 的质量,包括: +1. 
指令清晰度 (Instruction Clarity) +2. 任务难度 (Task Difficulty) + +基于最新研究: +- IFEval: Instruction Following Evaluation (Google, 2023) +- Self-Instruct (University of Washington, 2023) +- Task Complexity in Instruction Following (Google DeepMind, 2023) +""" +import os +from pathlib import Path + +from dingo.config import InputArgs +from dingo.exec import Executor + +# 配置 +OPENAI_MODEL = os.getenv("OPENAI_MODEL", "deepseek-chat") +OPENAI_API_KEY = os.getenv("OPENAI_API_KEY") +OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", "https://api.deepseek.com") + + +def evaluate_instruction_clarity(): + """评估指令清晰度""" + print("=" * 80) + print(" 评估指令清晰度 (Instruction Clarity)") + print("=" * 80 + "\n") + + input_data = { + "task_name": "instruction_clarity_evaluation", + "input_path": str(Path("test/data/instructions.jsonl")), # 格式: {"instruction": "你的指令"} + "output_path": "outputs/instruction_clarity/", + "dataset": { + "source": "local", + "format": "jsonl" + }, + "executor": { + "max_workers": 5, # LLM 评估建议较低并发 + "result_save": { + "bad": True, # 保存不清晰的指令 + "good": True, # 也保存清晰的指令用于分析 + "all_labels": True + } + }, + "evaluator": [ + { + "fields": { + "content": "instruction" # 将 instruction 字段映射到 content + }, + "evals": [ + { + "name": "LLMInstructionClarity", + "config": { + "model": OPENAI_MODEL, + "key": OPENAI_API_KEY, + "api_url": OPENAI_BASE_URL, + "parameters": { + "threshold": 6.0 # 清晰度阈值 (0-10) + } + } + } + ] + } + ] + } + + input_args = InputArgs(**input_data) + executor = Executor.exec_map["local"](input_args) + summary = executor.execute() + + print("\n" + "=" * 80) + print(" 评估结果") + print("=" * 80) + print(f"总数: {summary.total}") + print(f"清晰指令: {summary.num_good} ({summary.score:.1f}%)") + print(f"不清晰指令: {summary.num_bad}") + print(f"输出路径: {summary.output_path}") + + # 显示清晰度问题分布 + if summary.type_ratio: + print("\n问题类型分布:") + # type_ratio 是嵌套字典: {"instruction": {"TYPE": ratio}} + for field, ratios in summary.type_ratio.items(): + if isinstance(ratios, dict): + for issue_type, ratio in sorted(ratios.items(), key=lambda x: x[1], reverse=True): + if "CLARITY" in issue_type: + print(f" {issue_type}: {ratio * 100:.1f}%") + else: + print(f" {field}: {ratios * 100:.1f}%") + + return summary + + +def evaluate_task_difficulty(): + """评估任务难度""" + print("=" * 80) + print(" 评估任务难度 (Task Difficulty)") + print("=" * 80 + "\n") + + input_data = { + "task_name": "task_difficulty_evaluation", + "input_path": str(Path("test/data/instructions.jsonl")), + "output_path": "outputs/task_difficulty/", + "dataset": { + "source": "local", + "format": "jsonl" + }, + "executor": { + "max_workers": 5, + "result_save": { + "bad": False, # 难度评估通常不需要保存"bad" + "good": True, # 保存所有评估结果 + "all_labels": True + } + }, + "evaluator": [ + { + "fields": { + "content": "instruction" + }, + "evals": [ + { + "name": "LLMTaskDifficulty", + "config": { + "model": OPENAI_MODEL, + "key": OPENAI_API_KEY, + "api_url": OPENAI_BASE_URL, + "parameters": { + # 可选:设置期望的难度范围 + # "min_difficulty": 4.0, # 最低难度(太简单的会被标记) + # "max_difficulty": 8.0, # 最高难度(太难的会被标记) + } + } + } + ] + } + ] + } + + input_args = InputArgs(**input_data) + executor = Executor.exec_map["local"](input_args) + summary = executor.execute() + + print("\n" + "=" * 80) + print(" 评估结果") + print("=" * 80) + print(f"总数: {summary.total}") + print(f"输出路径: {summary.output_path}") + + # 显示难度级别分布 + if summary.type_ratio: + print("\n难度级别分布:") + # type_ratio 是嵌套字典: {"instruction": {"LEVEL": ratio}} + for field, ratios in summary.type_ratio.items(): + if isinstance(ratios, dict): + for level, 
ratio in sorted(ratios.items(), key=lambda x: x[1], reverse=True): + if "TASK_DIFFICULTY" in level: + print(f" {level}: {ratio * 100:.1f}%") + else: + print(f" {field}: {ratios * 100:.1f}%") + + return summary + + +def evaluate_both(): + """同时评估指令清晰度和任务难度""" + print("=" * 80) + print(" 综合指令质量评估 (Clarity + Difficulty)") + print("=" * 80 + "\n") + + input_data = { + "task_name": "comprehensive_instruction_evaluation", + "input_path": "test/data/instructions.jsonl", + "output_path": "outputs/instruction_comprehensive/", + "dataset": { + "source": "local", + "format": "jsonl" + }, + "executor": { + "max_workers": 5, + "result_save": { + "bad": True, + "good": True, + "all_labels": True + } + }, + "evaluator": [ + { + "fields": { + "content": "instruction" + }, + "evals": [ + { + "name": "LLMInstructionClarity", + "config": { + "model": OPENAI_MODEL, + "key": OPENAI_API_KEY, + "api_url": OPENAI_BASE_URL, + "parameters": {"threshold": 6.0} + } + }, + { + "name": "LLMTaskDifficulty", + "config": { + "model": OPENAI_MODEL, + "key": OPENAI_API_KEY, + "api_url": OPENAI_BASE_URL, + "parameters": { + "min_difficulty": 3.0, # 过滤太简单的任务 + "max_difficulty": 9.0, # 过滤过于困难的任务 + } + } + } + ] + } + ] + } + + input_args = InputArgs(**input_data) + executor = Executor.exec_map["local"](input_args) + summary = executor.execute() + + print("\n" + "=" * 80) + print(" 综合评估结果") + print("=" * 80) + print(f"总数: {summary.total}") + print(f"通过所有检查: {summary.num_good} ({summary.score:.1f}%)") + print(f"存在问题: {summary.num_bad}") + print(f"输出路径: {summary.output_path}") + + # 获取详细结果进行分析 + bad_list = executor.get_bad_info_list() + if bad_list: + print("\n问题分析:") + clarity_issues = sum(1 for item in bad_list + if any('CLARITY' in label for label in item.get('labels', []))) + difficulty_issues = sum(1 for item in bad_list + if any('DIFFICULTY' in label or 'TOO_EASY' in label or 'TOO_HARD' in label + for label in item.get('labels', []))) + + print(f" 清晰度问题: {clarity_issues}") + print(f" 难度问题: {difficulty_issues}") + + return summary + + +def analyze_difficulty_distribution(): + """分析任务难度分布(用于数据集平衡)""" + print("=" * 80) + print(" 任务难度分布分析") + print("=" * 80 + "\n") + + input_data = { + "task_name": "difficulty_distribution_analysis", + "input_path": "test/data/instructions.jsonl", + "output_path": "outputs/difficulty_distribution/", + "dataset": { + "source": "local", + "format": "jsonl" + }, + "executor": { + "max_workers": 10, + "result_save": { + "bad": False, + "good": True, + "all_labels": True + } + }, + "evaluator": [ + { + "fields": {"content": "instruction"}, + "evals": [ + { + "name": "LLMTaskDifficulty", + "config": { + "model": OPENAI_MODEL, + "key": OPENAI_API_KEY, + "api_url": OPENAI_BASE_URL + } + } + ] + } + ] + } + + input_args = InputArgs(**input_data) + executor = Executor.exec_map["local"](input_args) + summary = executor.execute() + + # 分析结果 + good_list = executor.get_good_info_list() + + # 统计难度分布 + difficulty_counts = { + "Easy (0-3)": 0, + "Moderate (4-6)": 0, + "Hard (7-8)": 0, + "Expert (9-10)": 0 + } + + total_score = 0 + for item in good_list: + eval_details = item.get('eval_details', {}) + for field, details in eval_details.items(): + for detail in details: + if detail.get('metric') == 'LLMTaskDifficulty': + score = detail.get('score', 0) + total_score += score + + if score <= 3: + difficulty_counts["Easy (0-3)"] += 1 + elif score <= 6: + difficulty_counts["Moderate (4-6)"] += 1 + elif score <= 8: + difficulty_counts["Hard (7-8)"] += 1 + else: + difficulty_counts["Expert (9-10)"] += 1 + + print("\n" 
+ "=" * 80) + print(" 难度分布分析") + print("=" * 80) + print(f"总数: {len(good_list)}") + if good_list: + print(f"平均难度: {total_score / len(good_list):.2f}/10") + print("\n难度级别分布:") + for level, count in difficulty_counts.items(): + percentage = (count / len(good_list) * 100) if good_list else 0 + print(f" {level}: {count} ({percentage:.1f}%)") + + print("\n💡 数据集平衡建议:") + # 理想分布: Easy 20%, Moderate 50%, Hard 25%, Expert 5% + if difficulty_counts["Easy (0-3)"] / len(good_list) > 0.3: + print(" ⚠️ 简单任务过多,考虑增加难度或过滤部分简单任务") + if difficulty_counts["Moderate (4-6)"] / len(good_list) < 0.3: + print(" ⚠️ 中等难度任务不足,这是 SFT 的核心部分") + if difficulty_counts["Hard (7-8)"] / len(good_list) > 0.4: + print(" ⚠️ 困难任务过多,可能影响训练效率") + + return summary + + +if __name__ == "__main__": + import sys + + if not OPENAI_API_KEY: + print("❌ 错误: 请设置 OPENAI_API_KEY 环境变量") + print(" export OPENAI_API_KEY='your-api-key'") + sys.exit(1) + + # 选择评估模式 + mode = sys.argv[1] if len(sys.argv) > 1 else "both" + + print(f"\n{'=' * 80}") + print(" SFT 指令质量评估系统") + print(f" 模式: {mode}") + print(f"{'=' * 80}\n") + + if mode == "clarity": + evaluate_instruction_clarity() + elif mode == "difficulty": + evaluate_task_difficulty() + elif mode == "distribution": + analyze_difficulty_distribution() + else: + evaluate_both() + + print("\n✅ 评估完成!\n") + print("💡 提示:") + print(" - 使用 'clarity' 模式只评估清晰度") + print(" - 使用 'difficulty' 模式只评估难度") + print(" - 使用 'distribution' 模式分析难度分布") + print(" - 使用 'both' 模式(默认)进行综合评估") diff --git a/test/data/instructions.jsonl b/test/data/instructions.jsonl new file mode 100644 index 00000000..2c6ca8c3 --- /dev/null +++ b/test/data/instructions.jsonl @@ -0,0 +1,10 @@ +{"instruction": "写个代码"} +{"instruction": "将'Hello World'翻译成法语。"} +{"instruction": "编写一个Python函数,接受一个整数列表,返回列表中所有质数的和。包含错误处理和单元测试。"} +{"instruction": "Write a Python function named `calculate_discount` that takes two parameters: original_price (float) and discount_percentage (float, 0-100). Return the final price after applying the discount, rounded to 2 decimal places. Include input validation to ensure prices are positive and discounts are between 0-100. Add comprehensive docstring with examples."} +{"instruction": "设计一个分布式系统架构,支持每秒10万次请求,保证99.99%可用性,具有水平扩展能力。需要包括:1)服务拆分方案 2)数据一致性策略 3)故障恢复机制 4)性能监控方案。画出架构图并详细说明每个组件的职责和交互方式。考虑CAP定理的权衡。"} +{"instruction": "Prove or disprove: For any continuous function f: [0,1] → ℝ satisfying ∫₀¹ f(x)² dx < ∞, there exists a sequence of polynomials {pₙ} such that ||f - pₙ||₂ → 0 as n → ∞. 
Provide rigorous proof using measure theory and functional analysis."} +{"instruction": "写一个关于猫的故事"} +{"instruction": "Summarize this article in a few sentences."} +{"instruction": "解释一下量子纠缠的原理"} +{"instruction": "创建一个Web应用,用户可以上传图片并进行基本的图像处理(裁剪、旋转、滤镜)。要求使用React前端和Python后端,支持批量处理,并提供API文档。"} From a8a180633c2739e1f8d0183a0081e1d9d6e70d71 Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Tue, 23 Dec 2025 09:36:44 +0000 Subject: [PATCH 2/2] =?UTF-8?q?=F0=9F=93=9A=20Auto-update=20metrics=20docu?= =?UTF-8?q?mentation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/metrics.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/metrics.md b/docs/metrics.md index 849ee6b9..1a226de7 100644 --- a/docs/metrics.md +++ b/docs/metrics.md @@ -57,7 +57,7 @@ This document provides comprehensive information about all quality metrics used | Type | Metric | Description | Paper Source | Evaluation Results | |------|--------|-------------|--------------|-------------------| | `QUALITY_BAD_COMPLETENESS` | RuleLineEndWithEllipsis, RuleLineEndWithTerminal, RuleSentenceNumber, RuleWordNumber | Checks whether the ratio of lines ending with ellipsis is below threshold; Checks whether the ratio of lines ending w... | [RedPajama: an Open Dataset for Training Large Language Models](https://github.com/togethercomputer/RedPajama-Data) (Together Computer, 2023) | [📊 See Results](eval/rule/slimpajama_data_evaluated_by_rule.md) | -| `QUALITY_BAD_EFFECTIVENESS` | RuleAbnormalChar, RuleAbnormalHtml, RuleAlphaWords, RuleAudioDataFormat, RuleCharNumber, RuleColonEnd, RuleContentNull, RuleContentShort, RuleContentShortMultiLan, RuleEnterAndSpace, RuleEnterMore, RuleEnterRatioMore, RuleHtmlEntity, RuleHtmlTag, RuleInvisibleChar, RuleImageDataFormat, RuleLatexSpecialChar, RuleLineJavascriptCount, RuleLoremIpsum, RuleMeanWordLength, RuleNlpDataFormat, RuleSftDataFormat, RuleSpaceMore, RuleSpecialCharacter, RuleStopWord, RuleSymbolWordRatio, RuleVedioDataFormat, RuleOnlyUrl, RuleDoi, RuleIsbn | Detects garbled text and anti-crawling characters by combining special character and invisible character detection; D... | [RedPajama: an Open Dataset for Training Large Language Models](https://github.com/togethercomputer/RedPajama-Data) (Together Computer, 2023) | [📊 See Results](eval/rule/slimpajama_data_evaluated_by_rule.md) | +| `QUALITY_BAD_EFFECTIVENESS` | RuleDoi, RuleIsbn, RuleAbnormalChar, RuleAbnormalHtml, RuleAlphaWords, RuleAudioDataFormat, RuleCharNumber, RuleColonEnd, RuleContentNull, RuleContentShort, RuleContentShortMultiLan, RuleEnterAndSpace, RuleEnterMore, RuleEnterRatioMore, RuleHtmlEntity, RuleHtmlTag, RuleInvisibleChar, RuleImageDataFormat, RuleLatexSpecialChar, RuleLineJavascriptCount, RuleLoremIpsum, RuleMeanWordLength, RuleNlpDataFormat, RuleSftDataFormat, RuleSpaceMore, RuleSpecialCharacter, RuleStopWord, RuleSymbolWordRatio, RuleVedioDataFormat, RuleOnlyUrl | Check whether the string is in the correct format of the doi; Check whether the string is in the correct format of th... | Internal Implementation | N/A | | `QUALITY_BAD_FLUENCY` | RuleAbnormalNumber, RuleCharSplit, RuleNoPunc, RuleWordSplit, RuleWordStuck | Checks PDF content for abnormal book page or index numbers that disrupt text flow; Checks PDF content for abnormal ch... 
| [RedPajama: an Open Dataset for Training Large Language Models](https://github.com/togethercomputer/RedPajama-Data) (Together Computer, 2023) | [📊 See Results](eval/rule/slimpajama_data_evaluated_by_rule.md) | | `QUALITY_BAD_RELEVANCE` | RuleHeadWordAr, RuleHeadWordCs, RuleHeadWordHu, RuleHeadWordKo, RuleHeadWordRu, RuleHeadWordSr, RuleHeadWordTh, RuleHeadWordVi, RulePatternSearch, RuleWatermark | Checks whether Arabic content contains irrelevant tail source information; Checks whether Czech content contains irre... | [RedPajama: an Open Dataset for Training Large Language Models](https://github.com/togethercomputer/RedPajama-Data) (Together Computer, 2023) | [📊 See Results](eval/rule/slimpajama_data_evaluated_by_rule.md) | | `QUALITY_BAD_SECURITY` | RuleIDCard, RuleUnsafeWords, RulePIIDetection | Checks whether content contains ID card information; Checks whether content contains unsafe words; Detects Personal I... | [RedPajama: an Open Dataset for Training Large Language Models](https://github.com/togethercomputer/RedPajama-Data) (Together Computer, 2023) | [📊 See Results](eval/rule/slimpajama_data_evaluated_by_rule.md) |