diff --git a/dingo/model/llm/text_quality/base_text_quality.py b/dingo/model/llm/text_quality/base_text_quality.py
new file mode 100644
index 00000000..4785ab14
--- /dev/null
+++ b/dingo/model/llm/text_quality/base_text_quality.py
@@ -0,0 +1,60 @@
+"""
+Base class for text quality evaluators with shared response processing logic.
+"""
+
+import json
+
+from dingo.io.output.eval_detail import EvalDetail
+from dingo.model.llm.base_openai import BaseOpenAI
+from dingo.model.response.response_class import ResponseScoreTypeNameReason
+
+
+class BaseTextQuality(BaseOpenAI):
+    """
+    Base class for text quality evaluators.
+    Provides shared response processing logic for LLMTextQualityV4 and V5.
+    """
+
+    @classmethod
+    def process_response(cls, response: str) -> EvalDetail:
+        """
+        Process LLM response and convert to EvalDetail.
+
+        Handles:
+        - Cleanup of markdown code blocks (```json and ```)
+        - JSON parsing
+        - Creation of EvalDetail with proper status, score, label, and reason
+
+        Args:
+            response: Raw response string from LLM
+
+        Returns:
+            EvalDetail object with evaluation results
+        """
+        # Cleanup markdown code blocks
+        if response.startswith("```json"):
+            response = response[7:]
+        elif response.startswith("```"):  # Changed to elif for safety
+            response = response[3:]
+        if response.endswith("```"):
+            response = response[:-3]
+        response = response.strip()
+
+        # Parse JSON response
+        response_json = json.loads(response)
+        response_model = ResponseScoreTypeNameReason(**response_json)
+
+        # Create EvalDetail with all required fields
+        # status = False for Good quality (no issues found)
+        # status = True for Bad quality (issues found)
+        is_good = response_model.type == "Good"
+
+        result = EvalDetail(
+            metric=cls.__name__,
+            status=not is_good,  # True if Bad (issues found), False if Good
+            score=response_model.score,
+            label=["QUALITY_GOOD"] if is_good else [f"{response_model.type}.{response_model.name}"],
+            reason=[response_model.reason]
+        )
+
+        return result
diff --git a/dingo/model/llm/text_quality/llm_text_quality_v4.py b/dingo/model/llm/text_quality/llm_text_quality_v4.py
index cd593243..69357800 100644
--- a/dingo/model/llm/text_quality/llm_text_quality_v4.py
+++ b/dingo/model/llm/text_quality/llm_text_quality_v4.py
@@ -1,13 +1,13 @@
 from dingo.model import Model
-from dingo.model.llm.base_openai import BaseOpenAI
+from dingo.model.llm.text_quality.base_text_quality import BaseTextQuality
 
 
 @Model.llm_register("LLMTextQualityV4")
-class LLMTextQualityV4(BaseOpenAI):
+class LLMTextQualityV4(BaseTextQuality):
     # Metadata for documentation generation
     _metric_info = {
         "category": "Pretrain Text Quality Assessment Metrics",
-        "metric_name": "PromptTextQualityV4",
+        "metric_name": "LLMTextQualityV4",
         "description": "Enhanced text quality evaluation covering completeness (formulas, tables, code), effectiveness (garbled text, spacing), similarity (duplicates), and security (politics, prohibited content)",
         "paper_title": "WanJuanSiLu: A High-Quality Open-Source Webtext Dataset for Low-Resource Languages",
         "paper_url": "https://arxiv.org/abs/2501.14506",
@@ -67,3 +67,4 @@ class LLMTextQualityV4(BaseOpenAI):
 
     # Input content
     """
+    # process_response method is now inherited from BaseTextQuality
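For reference, a minimal sketch of how the shared `process_response` helper is expected to behave for either subclass (it mirrors the unit tests added at the end of this patch); the reply string below is a hypothetical model output, not recorded data:

```python
import json

from dingo.model.llm.text_quality.llm_text_quality_v4 import LLMTextQualityV4

# Hypothetical model reply; plain JSON passes straight through the fence-cleanup step.
raw = json.dumps({
    "score": 0,
    "type": "Similarity",
    "name": "Error_Duplicate",
    "reason": "Same sentence repeats 6 times",
})

detail = LLMTextQualityV4.process_response(raw)
assert detail.status is True                           # issues found -> status is True
assert detail.metric == "LLMTextQualityV4"             # metric takes the subclass name
assert detail.label == ["Similarity.Error_Duplicate"]  # "<type>.<name>" label for bad quality
assert detail.reason == ["Same sentence repeats 6 times"]
```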
diff --git a/dingo/model/llm/text_quality/llm_text_quality_v5.py b/dingo/model/llm/text_quality/llm_text_quality_v5.py
new file mode 100644
index 00000000..cab5fa0a
--- /dev/null
+++ b/dingo/model/llm/text_quality/llm_text_quality_v5.py
@@ -0,0 +1,177 @@
+from dingo.model import Model
+from dingo.model.llm.text_quality.base_text_quality import BaseTextQuality
+
+
+@Model.llm_register("LLMTextQualityV5")
+class LLMTextQualityV5(BaseTextQuality):
+    # Metadata for documentation generation
+    _metric_info = {
+        "category": "Pretrain Text Quality Assessment Metrics",
+        "metric_name": "LLMTextQualityV5",
+        "description": "Impact-driven text quality evaluation for LLM pretraining, focusing on structural completeness, readability, diversity, and safety with quantitative thresholds",
+        "paper_title": "WanJuanSiLu: A High-Quality Open-Source Webtext Dataset for Low-Resource Languages",
+        "paper_url": "https://arxiv.org/abs/2501.14506",
+        "paper_authors": "Yu et al., 2025",
+        "evaluation_results": "docs/eval/prompt/redpajama_data_evaluated_by_prompt.md"
+    }
+    prompt = """
+# Role
+You are an expert in assessing pretraining data quality for large language models.
+
+# Goal
+Evaluate whether this text is suitable for LLM pretraining. Focus on issues that would negatively impact model learning, not minor imperfections.
+
+# Quality Dimensions
+
+## 1. Completeness (structural integrity)
+**Impact**: Broken structures prevent models from learning correct formatting patterns.
+
+**Check for**:
+- **Error_Formula**: Mathematical expressions with **unmatched delimiters** or **unclosed environments**
+
+  ⚠️ **Normal patterns (DO NOT flag)**:
+  - Mixing inline ($...$) and display ($$...$$) formulas
+  - Using \\begin{{align}}...\\end{{align}} within $$...$$
+  - Line breaks with \\\\ in alignment environments
+  - HTML tags: <sub>x</sub>, <sup>2</sup> for subscripts/superscripts
+  - Mixing LaTeX and HTML in web-extracted content
+
+  ✅ **Only flag when**:
+  - Delimiters unmatched: $ without closing $ (LaTeX context, not dollar signs)
+  - Environments unclosed: \\begin{{align}} without \\end{{align}}
+  - Syntax broken: \\frac{{a}}{{b missing closing }}
+  - HTML tags unclosed: <sub>text without </sub>
+
+  ⚠️ **Important**: Distinguish LaTeX $ from dollar signs ($100)
+  - Dollar sign: "$100", "$5.99" (followed by numbers) → NOT LaTeX
+  - LaTeX delimiter: "$x$", "$\\alpha$" (contains math symbols) → IS LaTeX
+  - Example: "The price is $100 and equation $x=y$ costs $50" has 4 dollar symbols but only 2 are LaTeX delimiters (and they match)
+
+  - Example (BAD): "$x^2 + y^2 is broken here $$a = b$$$"
+    (First LaTeX $ never closes, extra $ at end)
+  - Example (GOOD): "The item costs $100 and satisfies $x^2 + y^2 = z^2$ where price is $50"
+    (Dollar signs for money + proper LaTeX pair)
+  - Impact: Only flag errors that prevent >50% of mainstream parsers (pdflatex, MathJax, KaTeX, Pandoc, Jupyter) from rendering
+
+- **Error_Table**: Table structures that are malformed or unreadable
+  - Example (BAD): Misaligned columns, missing headers, or garbled HTML tags
+  - Impact: Models cannot learn proper table representation
+
+- **Error_Code**: Code blocks with formatting corruption
+  - Example (BAD): Line numbers mixed with code, broken syntax highlighting markers
+  - Impact: Teaches incorrect code structure
+
+**Key Question**: "Can the model learn proper formatting from this structure?"
+
+---
+
+## 2. Effectiveness (readability)
+**Impact**: Noise prevents models from learning meaningful semantic patterns.
+
+**Check for**:
+- **Error_Garbled_Characters**: Encoding issues or anti-crawler artifacts
+  - Example (BAD): "’" (broken UTF-8), "□□□" (placeholder chars), "" (BOM)
+  - Threshold: >1% of characters are garbled
+  - Impact: Corrupts token distributions
+
+- **Error_Words_Stuck**: Missing spaces break tokenization
+  - Example (BAD): "Thequickbrownfoxjumpsoverthelazydog"
+  - Threshold: >1% of text has word boundaries missing
+  - Impact: Wrong subword tokenization patterns
+
+- **Error_Lack_Punctuation**: Sentence boundaries unclear
+  - Example (BAD): "I like apples they are red also I like oranges"
+  - Impact: Models cannot learn sentence segmentation
+
+**Key Question**: "Would a human find this readable and coherent?"
+
+---
+
+## 3. Similarity (repetitiveness)
+**Impact**: Repetitive content reduces training efficiency and causes memorization.
+
+**Check for**:
+- **Error_Duplicate**: Excessive repetition that dominates the text
+  - Example (BAD): "I like blue. I like blue. I like blue. I like blue..." (>30% duplicate)
+  - Threshold: Same sentence/phrase repeats >5 times OR duplicate ratio >30%
+  - Impact: Over-represents certain patterns
+
+**Key Question**: "Does this text provide diverse training signal?"
+
+---
+
+## 4. Security (safety)
+**Impact**: Harmful content should not be learned by models.
+
+**Check for**:
+- **Error_Politics**: Content promoting extremism, terrorism, ethnic hatred
+- **Error_Prohibition**: Violence, pornography, gambling, drugs
+
+**Key Question**: "Is this content safe for model training?"
+
+---
+
+# Evaluation Principles
+
+1. **Focus on Training Impact**: Only flag issues that significantly harm LLM learning
+2. **Severity Matters**: Minor typos are OK; systemic corruption is not
+3. **Context Awareness**: Academic formulas are expected in papers; garbled text never is
+4. **Threshold-Based**: Use quantitative checks (>1%, >30%, >5 times) when possible
+
+---
+
+# Workflow
+
+1. **Quick Scan**: Does the text look generally readable and well-formed?
+2. **Identify Category**: If problematic, which dimension is most severely affected?
+3. **Verify Impact**: Would this issue meaningfully harm model training?
+4. **Assign Label**:
+   - Score: 1 (suitable for training) or 0 (unsuitable)
+   - Type: 'Good' OR one of ['Completeness', 'Effectiveness', 'Similarity', 'Security']
+   - Name: Specific error type (see above)
+   - Reason: Brief explanation (1-2 sentences)
+
+---
+
+# Output Format
+Return JSON only: {"score": 0/1, "type": "", "name": "", "reason": ""}
+
+# Examples
+
+**Example 1 (Good - Simple)**:
+Input: "The Pythagorean theorem states that $a^2 + b^2 = c^2$ for right triangles."
+Output: {"score": 1, "type": "Good", "name": "None", "reason": "Clear, well-formatted text with proper LaTeX"}
+
+**Example 1.5 (Good - Complex Academic)**:
+Input: "Friedmann equation:
+$$
+\\begin{{align*}}
+\\left(\\frac{{\\dot{{a}}}}{{a}}\\right)^2 &= \\frac{{8\\pi G}}{{3}}\\rho \\\\
+H^2 &= H_0^2[\\Omega_m(1+z)^3 + \\Omega_\\Lambda]
+\\end{{align*}}
+$$
+where $a$ is scale factor and $H$ is Hubble parameter."
+Output: {{"score": 1, "type": "Good", "name": "None", "reason": "Well-formed multi-line equations with proper alignment"}}
+
+**Example 1.6 (Good - Mixed HTML/LaTeX)**:
+Input: "The eigenstate $\\psi_n$ where n is quantum number and energy E<sup>2</sup> = m<sup>2</sup>c<sup>4</sup>"
+Output: {{"score": 1, "type": "Good", "name": "None", "reason": "Normal mix of LaTeX and HTML tags from web content"}}
+
+**Example 2 (Bad - Completeness)**:
+Input: "The formula $x^2 + y^2 is broken here $$a = b$$$"
+Output: {"score": 0, "type": "Completeness", "name": "Error_Formula", "reason": "Unmatched delimiters: first $ never closes, extra $ at end"}
+
+**Example 3 (Bad - Effectiveness)**:
+Input: "Theappleisredandtasty�withsomegarbledtext□□"
+Output: {"score": 0, "type": "Effectiveness", "name": "Error_Garbled_Characters", "reason": "Contains encoding corruption (�, □) and missing spaces (>1% of text)"}
+
+**Example 4 (Bad - Similarity)**:
+Input: "Blue is nice. Blue is nice. Blue is nice. Blue is nice. Blue is nice. Blue is nice."
+Output: {"score": 0, "type": "Similarity", "name": "Error_Duplicate", "reason": "Same sentence repeats 6 times, indicating low content diversity"}
+
+---
+
+# Input content to evaluate:
+
+"""
+    # process_response method is now inherited from BaseTextQuality
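The V5 prompt pins severity to quantitative thresholds (duplicate ratio >30%, the same sentence repeated >5 times, >1% garbled characters) rather than leaving it to the judge model's intuition. As a rough illustration of the Similarity thresholds only, a hypothetical offline pre-filter could look like the sketch below; the function is made up for this description and is not part of the patch:

```python
import re
from collections import Counter


def looks_duplicated(text: str) -> bool:
    """Rough offline mirror of the prompt's Error_Duplicate thresholds."""
    sentences = [s.strip() for s in re.split(r"[.!?。！？]+\s*", text) if s.strip()]
    if not sentences:
        return False
    counts = Counter(sentences)
    max_repeats = max(counts.values())                   # same sentence repeated >5 times
    duplicate_ratio = 1 - len(counts) / len(sentences)   # share of non-unique sentences
    return max_repeats > 5 or duplicate_ratio > 0.30


print(looks_duplicated("Blue is nice. " * 6))   # True, matching Example 4 above
print(looks_duplicated("The Pythagorean theorem holds for right triangles."))  # False
```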
diff --git a/examples/dataset/s3.py b/examples/dataset/s3.py
index 2652442f..5aca028c 100644
--- a/examples/dataset/s3.py
+++ b/examples/dataset/s3.py
@@ -11,10 +11,15 @@
 S3_ENDPOINT_URL = os.getenv("S3_ENDPOINT_URL", "https://s3.amazonaws.com")
 S3_BUCKET = os.getenv("S3_BUCKET", "your_bucket_name")  # qa-huawei
 
-# LLM configuration
-OPENAI_MODEL = 'deepseek-chat'
-OPENAI_URL = 'https://api.deepseek.com/v1'
-OPENAI_KEY = os.getenv("OPENAI_KEY")
+OPENAI_MODEL = os.getenv("OPENAI_MODEL", "deepseek-chat")
+OPENAI_URL = os.getenv("OPENAI_BASE_URL", "https://api.deepseek.com/v1")
+OPENAI_KEY = os.getenv("OPENAI_API_KEY", "")
+
+llm_config = {
+    "model": OPENAI_MODEL,
+    "key": OPENAI_KEY,
+    "api_url": OPENAI_URL,
+}
 
 input_data = {
     # Data file path
@@ -37,30 +42,22 @@
     # Executor configuration
     "executor": {
+        "max_workers": 10,
+        "batch_size": 10,
         "result_save": {
+            "good": True,
             "bad": True,
-            "good": True
+            "all_labels": True
         }
     },
     "evaluator": [
         {
             "fields": {"content": "content"},
             "evals": [
-                {"name": "RuleColonEnd"}
+                {"name": "LLMTextQualityV4", "config": llm_config}
             ]
         }
     ]
-
-    # # Evaluator configuration
-    # "evaluator": {
-    #     "llm_config": {
-    #         "LLMTextQualityPromptBase": {
-    #             "model": OPENAI_MODEL,
-    #             "key": OPENAI_KEY,
-    #             "api_url": OPENAI_URL,
-    #         }
-    #     }
-    # }
 }
 
 # Create an InputArgs instance
diff --git a/examples/llm_and_rule/llm_local.py b/examples/llm_and_rule/llm_local.py
index d80adaf5..76adefe0 100644
--- a/examples/llm_and_rule/llm_local.py
+++ b/examples/llm_and_rule/llm_local.py
@@ -1,14 +1,29 @@
+import os
+from pathlib import Path
+
 from dingo.config import InputArgs
 from dingo.exec import Executor
 
+OPENAI_MODEL = os.getenv("OPENAI_MODEL", "deepseek-chat")
+OPENAI_URL = os.getenv("OPENAI_BASE_URL", "https://api.deepseek.com/v1")
+OPENAI_KEY = os.getenv("OPENAI_API_KEY", "")
+
+llm_config = {
+    "model": OPENAI_MODEL,
+    "key": OPENAI_KEY,
+    "api_url": OPENAI_URL,
+}
+
 if __name__ == '__main__':
     input_data = {
-        "input_path": "../../test/data/test_local_jsonl.jsonl",
+        "input_path": str(Path("test/data/test_local_jsonl.jsonl")),
        "dataset": {
            "source": "local",
            "format": "jsonl",
        },
        "executor": {
+            "max_workers": 10,
+            "batch_size": 10,
            "result_save": {
                "bad": True,
                "good": True
@@ -18,7 +33,7 @@
            {
                "fields": {"content": "content"},
                "evals": [
-                    {"name": "LLMTextRepeat", "config": {"key": "", "api_url": ""}}
+                    {"name": "LLMTextQualityV5", "config": llm_config}
                ]
            }
        ]
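Both examples now read their endpoint settings from environment variables and pass them through a shared llm_config dict, so switching evaluators or endpoints is a configuration-only change. A minimal sketch of the resulting evaluator entry (placeholder values; only the "model"/"key"/"api_url" keys and the registered evaluator names used above are assumed):

```python
import os

llm_config = {
    "model": os.getenv("OPENAI_MODEL", "deepseek-chat"),
    "key": os.getenv("OPENAI_API_KEY", ""),  # supplied via the environment, never hard-coded
    "api_url": os.getenv("OPENAI_BASE_URL", "https://api.deepseek.com/v1"),
}

evaluator = [
    {
        "fields": {"content": "content"},
        "evals": [
            # "name" must match a string registered with @Model.llm_register,
            # so LLMTextQualityV4 and LLMTextQualityV5 are drop-in replacements here.
            {"name": "LLMTextQualityV5", "config": llm_config},
        ],
    }
]
```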
"api_url": ""}} + {"name": "LLMTextQualityV5", "config": llm_config} ] } ] diff --git a/test/scripts/model/llm/test_text_quality_v5.py b/test/scripts/model/llm/test_text_quality_v5.py new file mode 100644 index 00000000..ae36c338 --- /dev/null +++ b/test/scripts/model/llm/test_text_quality_v5.py @@ -0,0 +1,101 @@ +""" +测试 LLMTextQualityV5 优化后的 prompt 效果 +""" +import json + +import pytest + +from dingo.model.llm.text_quality.llm_text_quality_v5 import LLMTextQualityV5 + + +class TestLLMTextQualityV5: + """测试 V5 版本的文本质量评估""" + + def test_good_quality_text_response(self): + """测试解析 Good 质量文本的响应""" + response = json.dumps({ + "score": 1, + "type": "Good", + "name": "None", + "reason": "Clear, well-formatted text with proper LaTeX" + }) + + result = LLMTextQualityV5.process_response(response) + + assert result.status is False + assert result.label == ["QUALITY_GOOD"] + assert result.reason == ["Clear, well-formatted text with proper LaTeX"] + assert result.metric == "LLMTextQualityV5" + + def test_completeness_error_response(self): + """测试解析 Completeness 错误的响应""" + response = json.dumps({ + "score": 0, + "type": "Completeness", + "name": "Error_Formula", + "reason": "Inconsistent delimiters: mixed $$ and $ without proper closure" + }) + + result = LLMTextQualityV5.process_response(response) + + assert result.status is True + assert result.label == ["Completeness.Error_Formula"] + assert "Inconsistent delimiters" in result.reason[0] + assert result.metric == "LLMTextQualityV5" + + def test_effectiveness_error_response(self): + """测试解析 Effectiveness 错误的响应""" + response = json.dumps({ + "score": 0, + "type": "Effectiveness", + "name": "Error_Garbled_Characters", + "reason": "Contains encoding corruption (�, □) and missing spaces (>1% of text)" + }) + + result = LLMTextQualityV5.process_response(response) + + assert result.status is True + assert result.label == ["Effectiveness.Error_Garbled_Characters"] + assert "encoding corruption" in result.reason[0] + + def test_similarity_error_response(self): + """测试解析 Similarity 错误的响应""" + response = json.dumps({ + "score": 0, + "type": "Similarity", + "name": "Error_Duplicate", + "reason": "Same sentence repeats 6 times, indicating low content diversity" + }) + + result = LLMTextQualityV5.process_response(response) + + assert result.status is True + assert result.label == ["Similarity.Error_Duplicate"] + assert "repeats 6 times" in result.reason[0] + + def test_security_error_response(self): + """测试解析 Security 错误的响应""" + response = json.dumps({ + "score": 0, + "type": "Security", + "name": "Error_Prohibition", + "reason": "Contains prohibited content" + }) + + result = LLMTextQualityV5.process_response(response) + + assert result.status is True + assert result.label == ["Security.Error_Prohibition"] + + def test_markdown_code_block_cleanup(self): + """测试 markdown 代码块清理""" + response_with_markdown = '```json\n{"score": 1, "type": "Good", "name": "None", "reason": "Test"}\n```' + + result = LLMTextQualityV5.process_response(response_with_markdown) + + assert result.status is False + assert result.label == ["QUALITY_GOOD"] + + +if __name__ == "__main__": + pytest.main([__file__, "-v", "--tb=short"])