diff --git a/dingo/model/llm/text_quality/base_text_quality.py b/dingo/model/llm/text_quality/base_text_quality.py
new file mode 100644
index 00000000..4785ab14
--- /dev/null
+++ b/dingo/model/llm/text_quality/base_text_quality.py
@@ -0,0 +1,60 @@
+"""
+Base class for text quality evaluators with shared response processing logic.
+"""
+
+import json
+
+from dingo.io.output.eval_detail import EvalDetail
+from dingo.model.llm.base_openai import BaseOpenAI
+from dingo.model.response.response_class import ResponseScoreTypeNameReason
+
+
+class BaseTextQuality(BaseOpenAI):
+ """
+ Base class for text quality evaluators.
+ Provides shared response processing logic for LLMTextQualityV4 and V5.
+ """
+
+ @classmethod
+ def process_response(cls, response: str) -> EvalDetail:
+ """
+ Process LLM response and convert to EvalDetail.
+
+ Handles:
+ - Cleanup of markdown code blocks (```json and ```)
+ - JSON parsing
+ - Creation of EvalDetail with proper status, score, label, and reason
+
+ Args:
+ response: Raw response string from LLM
+
+ Returns:
+ EvalDetail object with evaluation results
+ """
+ # Cleanup markdown code blocks
+ if response.startswith("```json"):
+ response = response[7:]
+ elif response.startswith("```"):  # bare code fence without a language tag
+ response = response[3:]
+ if response.endswith("```"):
+ response = response[:-3]
+ response = response.strip()
+
+ # Parse JSON response
+ response_json = json.loads(response)
+ response_model = ResponseScoreTypeNameReason(**response_json)
+
+ # Create EvalDetail with all required fields
+ # status = False for Good quality (no issues found)
+ # status = True for Bad quality (issues found)
+ is_good = response_model.type == "Good"
+
+ result = EvalDetail(
+ metric=cls.__name__,
+ status=not is_good, # True if Bad (issues found), False if Good
+ score=response_model.score,
+ label=["QUALITY_GOOD"] if is_good else [f"{response_model.type}.{response_model.name}"],
+ reason=[response_model.reason]
+ )
+
+ return result
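For reference, a minimal sketch of how the shared `process_response` behaves, assuming the package is importable as laid out in this PR. The fenced response string is a hypothetical model output, not captured from a real run; the printed values follow from the mapping above (`status=False` means Good quality, `status=True` means issues found).

```python
import json

from dingo.model.llm.text_quality.llm_text_quality_v4 import LLMTextQualityV4

# Hypothetical raw LLM output wrapped in a markdown code fence.
raw = "```json\n" + json.dumps({
    "score": 0,
    "type": "Effectiveness",
    "name": "Error_Words_Stuck",
    "reason": "Missing spaces break tokenization",
}) + "\n```"

detail = LLMTextQualityV4.process_response(raw)
print(detail.status)  # True  -> issues found ("Bad" quality)
print(detail.label)   # ["Effectiveness.Error_Words_Stuck"]
print(detail.reason)  # ["Missing spaces break tokenization"]
```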
diff --git a/dingo/model/llm/text_quality/llm_text_quality_v4.py b/dingo/model/llm/text_quality/llm_text_quality_v4.py
index cd593243..69357800 100644
--- a/dingo/model/llm/text_quality/llm_text_quality_v4.py
+++ b/dingo/model/llm/text_quality/llm_text_quality_v4.py
@@ -1,13 +1,13 @@
from dingo.model import Model
-from dingo.model.llm.base_openai import BaseOpenAI
+from dingo.model.llm.text_quality.base_text_quality import BaseTextQuality
@Model.llm_register("LLMTextQualityV4")
-class LLMTextQualityV4(BaseOpenAI):
+class LLMTextQualityV4(BaseTextQuality):
# Metadata for documentation generation
_metric_info = {
"category": "Pretrain Text Quality Assessment Metrics",
- "metric_name": "PromptTextQualityV4",
+ "metric_name": "LLMTextQualityV4",
"description": "Enhanced text quality evaluation covering completeness (formulas, tables, code), effectiveness (garbled text, spacing), similarity (duplicates), and security (politics, prohibited content)",
"paper_title": "WanJuanSiLu: A High-Quality Open-Source Webtext Dataset for Low-Resource Languages",
"paper_url": "https://arxiv.org/abs/2501.14506",
@@ -67,3 +67,4 @@ class LLMTextQualityV4(BaseOpenAI):
# Input content
"""
+ # process_response method is now inherited from BaseTextQuality
diff --git a/dingo/model/llm/text_quality/llm_text_quality_v5.py b/dingo/model/llm/text_quality/llm_text_quality_v5.py
new file mode 100644
index 00000000..cab5fa0a
--- /dev/null
+++ b/dingo/model/llm/text_quality/llm_text_quality_v5.py
@@ -0,0 +1,177 @@
+from dingo.model import Model
+from dingo.model.llm.text_quality.base_text_quality import BaseTextQuality
+
+
+@Model.llm_register("LLMTextQualityV5")
+class LLMTextQualityV5(BaseTextQuality):
+ # Metadata for documentation generation
+ _metric_info = {
+ "category": "Pretrain Text Quality Assessment Metrics",
+ "metric_name": "LLMTextQualityV5",
+ "description": "Impact-driven text quality evaluation for LLM pretraining, focusing on structural completeness, readability, diversity, and safety with quantitative thresholds",
+ "paper_title": "WanJuanSiLu: A High-Quality Open-Source Webtext Dataset for Low-Resource Languages",
+ "paper_url": "https://arxiv.org/abs/2501.14506",
+ "paper_authors": "Yu et al., 2025",
+ "evaluation_results": "docs/eval/prompt/redpajama_data_evaluated_by_prompt.md"
+ }
+ prompt = """
+# Role
+You are an expert in assessing pretraining data quality for large language models.
+
+# Goal
+Evaluate whether this text is suitable for LLM pretraining. Focus on issues that would negatively impact model learning, not minor imperfections.
+
+# Quality Dimensions
+
+## 1. Completeness (Structural Completeness)
+**Impact**: Broken structures prevent models from learning correct formatting patterns.
+
+**Check for**:
+- **Error_Formula**: Mathematical expressions with **unmatched delimiters** or **unclosed environments**
+
+ ⚠️ **Normal patterns (DO NOT flag)**:
+ - Mixing inline ($...$) and display ($$...$$) formulas
+ - Using \\begin{{align}}...\\end{{align}} within $$...$$
+ - Line breaks with \\\\ in alignment environments
+ - HTML tags: <sub>x</sub>, <sup>2</sup> for subscripts/superscripts
+ - Mixing LaTeX and HTML in web-extracted content
+
+ ✅ **Only flag when**:
+ - Delimiters unmatched: $ without closing $ (LaTeX context, not dollar signs)
+ - Environments unclosed: \\begin{{align}} without \\end{{align}}
+ - Syntax broken: \\frac{{a}}{{b missing closing }}
+ - HTML tags unclosed: <sub>text without a matching closing tag
+
+ ⚠️ **Important**: Distinguish LaTeX $ from dollar signs ($100)
+ - Dollar sign: "$100", "$5.99" (followed by numbers) → NOT LaTeX
+ - LaTeX delimiter: "$x$", "$\\alpha$" (contains math symbols) → IS LaTeX
+ - Example: "The price is $100 and equation $x=y$ costs $50" has 4 dollar symbols but only 2 are LaTeX delimiters (and they match)
+
+ - Example (BAD): "$x^2 + y^2 is broken here $$a = b$$$"
+ (First LaTeX $ never closes, extra $ at end)
+ - Example (GOOD): "The item costs $100 and satisfies $x^2 + y^2 = z^2$ where price is $50"
+ (Dollar signs for money + proper LaTeX pair)
+ - Impact: Only flag errors that prevent >50% of mainstream parsers (pdflatex, MathJax, KaTeX, Pandoc, Jupyter) from rendering
+
+- **Error_Table**: Table structures that are malformed or unreadable
+ - Example (BAD): Misaligned columns, missing headers, or garbled HTML tags
+ - Impact: Models cannot learn proper table representation
+
+- **Error_Code**: Code blocks with formatting corruption
+ - Example (BAD): Line numbers mixed with code, broken syntax highlighting markers
+ - Impact: Teaches incorrect code structure
+
+**Key Question**: "Can the model learn proper formatting from this structure?"
+
+---
+
+## 2. Effectiveness (Readability)
+**Impact**: Noise prevents models from learning meaningful semantic patterns.
+
+**Check for**:
+- **Error_Garbled_Characters**: Encoding issues or anti-crawler artifacts
+ - Example (BAD): "â€™" (mojibake from broken UTF-8), "□□□" (placeholder chars), "\\ufeff" (BOM)
+ - Threshold: >1% of characters are garbled
+ - Impact: Corrupts token distributions
+
+- **Error_Words_Stuck**: Missing spaces break tokenization
+ - Example (BAD): "Thequickbrownfoxjumpsoverthelazydog"
+ - Threshold: >1% of text has word boundaries missing
+ - Impact: Wrong subword tokenization patterns
+
+- **Error_Lack_Punctuation**: Sentence boundaries unclear
+ - Example (BAD): "I like apples they are red also I like oranges"
+ - Impact: Models cannot learn sentence segmentation
+
+**Key Question**: "Would a human find this readable and coherent?"
+
+---
+
+## 3. Similarity (Repetitiveness)
+**Impact**: Repetitive content reduces training efficiency and causes memorization.
+
+**Check for**:
+- **Error_Duplicate**: Excessive repetition that dominates the text
+ - Example (BAD): "I like blue. I like blue. I like blue. I like blue..." (>30% duplicate)
+ - Threshold: Same sentence/phrase repeats >5 times OR duplicate ratio >30%
+ - Impact: Over-represents certain patterns
+
+**Key Question**: "Does this text provide diverse training signal?"
+
+---
+
+## 4. Security (Safety)
+**Impact**: Harmful content should not be learned by models.
+
+**Check for**:
+- **Error_Politics**: Content promoting extremism, terrorism, ethnic hatred
+- **Error_Prohibition**: Violence, pornography, gambling, drugs
+
+**Key Question**: "Is this content safe for model training?"
+
+---
+
+# Evaluation Principles
+
+1. **Focus on Training Impact**: Only flag issues that significantly harm LLM learning
+2. **Severity Matters**: Minor typos are OK; systemic corruption is not
+3. **Context Awareness**: Academic formulas are expected in papers; garbled text never is
+4. **Threshold-Based**: Use quantitative checks (>1%, >30%, >5 times) when possible
+
+---
+
+# Workflow
+
+1. **Quick Scan**: Does the text look generally readable and well-formed?
+2. **Identify Category**: If problematic, which dimension is most severely affected?
+3. **Verify Impact**: Would this issue meaningfully harm model training?
+4. **Assign Label**:
+ - Score: 1 (suitable for training) or 0 (unsuitable)
+ - Type: 'Good' OR one of ['Completeness', 'Effectiveness', 'Similarity', 'Security']
+ - Name: Specific error type (see above)
+ - Reason: Brief explanation (1-2 sentences)
+
+---
+
+# Output Format
+Return JSON only: {{"score": 0/1, "type": "", "name": "", "reason": ""}}
+
+# Examples
+
+**Example 1 (Good - Simple)**:
+Input: "The Pythagorean theorem states that $a^2 + b^2 = c^2$ for right triangles."
+Output: {"score": 1, "type": "Good", "name": "None", "reason": "Clear, well-formatted text with proper LaTeX"}
+
+**Example 1.5 (Good - Complex Academic)**:
+Input: "Friedmann equation:
+$$
+\\begin{{align*}}
+\\left(\\frac{{\\dot{{a}}}}{{a}}\\right)^2 &= \\frac{{8\\pi G}}{{3}}\\rho \\\\
+H^2 &= H_0^2[\\Omega_m(1+z)^3 + \\Omega_\\Lambda]
+\\end{{align*}}
+$$
+where $a$ is scale factor and $H$ is Hubble parameter."
+Output: {{"score": 1, "type": "Good", "name": "None", "reason": "Well-formed multi-line equations with proper alignment"}}
+
+**Example 1.6 (Good - Mixed HTML/LaTeX)**:
+Input: "The eigenstate $\\psi_n$ where n is quantum number and energy E2 = m2c4"
+Output: {{"score": 1, "type": "Good", "name": "None", "reason": "Normal mix of LaTeX and HTML tags from web content"}}
+
+**Example 2 (Bad - Completeness)**:
+Input: "The formula $x^2 + y^2 is broken here $$a = b$$$"
+Output: {"score": 0, "type": "Completeness", "name": "Error_Formula", "reason": "Unmatched delimiters: first $ never closes, extra $ at end"}
+
+**Example 3 (Bad - Effectiveness)**:
+Input: "Theappleisredandtasty�withsomegarbledtext□□"
+Output: {"score": 0, "type": "Effectiveness", "name": "Error_Garbled_Characters", "reason": "Contains encoding corruption (�, □) and missing spaces (>1% of text)"}
+
+**Example 4 (Bad - Similarity)**:
+Input: "Blue is nice. Blue is nice. Blue is nice. Blue is nice. Blue is nice. Blue is nice."
+Output: {"score": 0, "type": "Similarity", "name": "Error_Duplicate", "reason": "Same sentence repeats 6 times, indicating low content diversity"}
+
+---
+
+# Input content to evaluate:
+
+"""
+ # process_response method is now inherited from BaseTextQuality
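The Similarity dimension above relies on quantitative thresholds (same sentence repeated >5 times, or duplicate ratio >30%). Those checks are made by the LLM itself; the sketch below is only one possible offline reading of "duplicate ratio", useful for sanity-checking model judgments. The helper name and the sentence-splitting heuristic are illustrative, not part of dingo.

```python
import re


def duplicate_ratio(text: str) -> tuple[float, int]:
    """Return (share of sentences that are repeats, max repeat count).

    Illustrative heuristic only: sentences are split on ., !, ? and compared
    after lowercasing and whitespace stripping.
    """
    sentences = [s.strip().lower() for s in re.split(r"[.!?]+", text) if s.strip()]
    if not sentences:
        return 0.0, 0
    counts: dict[str, int] = {}
    for s in sentences:
        counts[s] = counts.get(s, 0) + 1
    duplicates = sum(c - 1 for c in counts.values())
    return duplicates / len(sentences), max(counts.values())


ratio, max_repeats = duplicate_ratio("Blue is nice. " * 6)
# ratio ≈ 0.83 and max_repeats == 6, exceeding both the >30% and >5 thresholds.
```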
diff --git a/examples/dataset/s3.py b/examples/dataset/s3.py
index 2652442f..5aca028c 100644
--- a/examples/dataset/s3.py
+++ b/examples/dataset/s3.py
@@ -11,10 +11,15 @@
S3_ENDPOINT_URL = os.getenv("S3_ENDPOINT_URL", "https://s3.amazonaws.com")
S3_BUCKET = os.getenv("S3_BUCKET", "your_bucket_name") # qa-huawei
- # LLM configuration
- OPENAI_MODEL = 'deepseek-chat'
- OPENAI_URL = 'https://api.deepseek.com/v1'
- OPENAI_KEY = os.getenv("OPENAI_KEY")
+ OPENAI_MODEL = os.getenv("OPENAI_MODEL", "deepseek-chat")
+ OPENAI_URL = os.getenv("OPENAI_BASE_URL", "https://api.deepseek.com/v1")
+ OPENAI_KEY = os.getenv("OPENAI_API_KEY", "")
+
+ llm_config = {
+ "model": OPENAI_MODEL,
+ "key": OPENAI_KEY,
+ "api_url": OPENAI_URL,
+ }
input_data = {
# Data file path
@@ -37,30 +42,22 @@
# Executor configuration
"executor": {
+ "max_workers": 10,
+ "batch_size": 10,
"result_save": {
+ "good": True,
"bad": True,
- "good": True
+ "all_labels": True
}
},
"evaluator": [
{
"fields": {"content": "content"},
"evals": [
- {"name": "RuleColonEnd"}
+ {"name": "LLMTextQualityV4", "config": llm_config}
]
}
]
-
- # # Evaluator configuration
- # "evaluator": {
- # "llm_config": {
- # "LLMTextQualityPromptBase": {
- # "model": OPENAI_MODEL,
- # "key": OPENAI_KEY,
- # "api_url": OPENAI_URL,
- # }
- # }
- # }
}
# Create the InputArgs instance
diff --git a/examples/llm_and_rule/llm_local.py b/examples/llm_and_rule/llm_local.py
index d80adaf5..76adefe0 100644
--- a/examples/llm_and_rule/llm_local.py
+++ b/examples/llm_and_rule/llm_local.py
@@ -1,14 +1,29 @@
+import os
+from pathlib import Path
+
from dingo.config import InputArgs
from dingo.exec import Executor
+OPENAI_MODEL = os.getenv("OPENAI_MODEL", "deepseek-chat")
+OPENAI_URL = os.getenv("OPENAI_BASE_URL", "https://api.deepseek.com/v1")
+OPENAI_KEY = os.getenv("OPENAI_API_KEY", "")
+
+llm_config = {
+ "model": OPENAI_MODEL,
+ "key": OPENAI_KEY,
+ "api_url": OPENAI_URL,
+}
+
if __name__ == '__main__':
input_data = {
- "input_path": "../../test/data/test_local_jsonl.jsonl",
+ "input_path": str(Path("test/data/test_local_jsonl.jsonl")),
"dataset": {
"source": "local",
"format": "jsonl",
},
"executor": {
+ "max_workers": 10,
+ "batch_size": 10,
"result_save": {
"bad": True,
"good": True
@@ -18,7 +33,7 @@
{
"fields": {"content": "content"},
"evals": [
- {"name": "LLMTextRepeat", "config": {"key": "", "api_url": ""}}
+ {"name": "LLMTextQualityV5", "config": llm_config}
]
}
]
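Both updated example scripts read the model name, endpoint, and key from environment variables. One way to supply them when trying the example locally is sketched below; the values are placeholders, not real credentials or required settings.

```python
import os

# Placeholder values -- replace with your own provider settings before running.
os.environ.setdefault("OPENAI_MODEL", "deepseek-chat")
os.environ.setdefault("OPENAI_BASE_URL", "https://api.deepseek.com/v1")
os.environ.setdefault("OPENAI_API_KEY", "sk-your-key-here")
```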
diff --git a/test/scripts/model/llm/test_text_quality_v5.py b/test/scripts/model/llm/test_text_quality_v5.py
new file mode 100644
index 00000000..ae36c338
--- /dev/null
+++ b/test/scripts/model/llm/test_text_quality_v5.py
@@ -0,0 +1,101 @@
+"""
+Tests for LLMTextQualityV5's optimized prompt and response processing.
+"""
+import json
+
+import pytest
+
+from dingo.model.llm.text_quality.llm_text_quality_v5 import LLMTextQualityV5
+
+
+class TestLLMTextQualityV5:
+ """测试 V5 版本的文本质量评估"""
+
+ def test_good_quality_text_response(self):
+ """测试解析 Good 质量文本的响应"""
+ response = json.dumps({
+ "score": 1,
+ "type": "Good",
+ "name": "None",
+ "reason": "Clear, well-formatted text with proper LaTeX"
+ })
+
+ result = LLMTextQualityV5.process_response(response)
+
+ assert result.status is False
+ assert result.label == ["QUALITY_GOOD"]
+ assert result.reason == ["Clear, well-formatted text with proper LaTeX"]
+ assert result.metric == "LLMTextQualityV5"
+
+ def test_completeness_error_response(self):
+ """测试解析 Completeness 错误的响应"""
+ response = json.dumps({
+ "score": 0,
+ "type": "Completeness",
+ "name": "Error_Formula",
+ "reason": "Inconsistent delimiters: mixed $$ and $ without proper closure"
+ })
+
+ result = LLMTextQualityV5.process_response(response)
+
+ assert result.status is True
+ assert result.label == ["Completeness.Error_Formula"]
+ assert "Inconsistent delimiters" in result.reason[0]
+ assert result.metric == "LLMTextQualityV5"
+
+ def test_effectiveness_error_response(self):
+ """测试解析 Effectiveness 错误的响应"""
+ response = json.dumps({
+ "score": 0,
+ "type": "Effectiveness",
+ "name": "Error_Garbled_Characters",
+ "reason": "Contains encoding corruption (�, □) and missing spaces (>1% of text)"
+ })
+
+ result = LLMTextQualityV5.process_response(response)
+
+ assert result.status is True
+ assert result.label == ["Effectiveness.Error_Garbled_Characters"]
+ assert "encoding corruption" in result.reason[0]
+
+ def test_similarity_error_response(self):
+ """测试解析 Similarity 错误的响应"""
+ response = json.dumps({
+ "score": 0,
+ "type": "Similarity",
+ "name": "Error_Duplicate",
+ "reason": "Same sentence repeats 6 times, indicating low content diversity"
+ })
+
+ result = LLMTextQualityV5.process_response(response)
+
+ assert result.status is True
+ assert result.label == ["Similarity.Error_Duplicate"]
+ assert "repeats 6 times" in result.reason[0]
+
+ def test_security_error_response(self):
+ """测试解析 Security 错误的响应"""
+ response = json.dumps({
+ "score": 0,
+ "type": "Security",
+ "name": "Error_Prohibition",
+ "reason": "Contains prohibited content"
+ })
+
+ result = LLMTextQualityV5.process_response(response)
+
+ assert result.status is True
+ assert result.label == ["Security.Error_Prohibition"]
+
+ def test_markdown_code_block_cleanup(self):
+ """测试 markdown 代码块清理"""
+ response_with_markdown = '```json\n{"score": 1, "type": "Good", "name": "None", "reason": "Test"}\n```'
+
+ result = LLMTextQualityV5.process_response(response_with_markdown)
+
+ assert result.status is False
+ assert result.label == ["QUALITY_GOOD"]
+
+
+if __name__ == "__main__":
+ pytest.main([__file__, "-v", "--tb=short"])