
Commit 1375e14

feat: add LLMTextQualityV5 (#294)
* feat: add LLMTextQualityV5
* x
* x
1 parent 99ab8ab commit 1375e14

6 files changed: +373 -22 lines changed

dingo/model/llm/text_quality/base_text_quality.py

Lines changed: 60 additions & 0 deletions
@@ -0,0 +1,60 @@
"""
Base class for text quality evaluators with shared response processing logic.
"""

import json

from dingo.io.output.eval_detail import EvalDetail
from dingo.model.llm.base_openai import BaseOpenAI
from dingo.model.response.response_class import ResponseScoreTypeNameReason


class BaseTextQuality(BaseOpenAI):
    """
    Base class for text quality evaluators.
    Provides shared response processing logic for LLMTextQualityV4 and V5.
    """

    @classmethod
    def process_response(cls, response: str) -> EvalDetail:
        """
        Process LLM response and convert to EvalDetail.

        Handles:
        - Cleanup of markdown code blocks (```json and ```)
        - JSON parsing
        - Creation of EvalDetail with proper status, score, label, and reason

        Args:
            response: Raw response string from LLM

        Returns:
            EvalDetail object with evaluation results
        """
        # Cleanup markdown code blocks
        if response.startswith("```json"):
            response = response[7:]
        elif response.startswith("```"):  # Changed to elif for safety
            response = response[3:]
        if response.endswith("```"):
            response = response[:-3]
        response = response.strip()

        # Parse JSON response
        response_json = json.loads(response)
        response_model = ResponseScoreTypeNameReason(**response_json)

        # Create EvalDetail with all required fields
        # status = False for Good quality (no issues found)
        # status = True for Bad quality (issues found)
        is_good = response_model.type == "Good"

        result = EvalDetail(
            metric=cls.__name__,
            status=not is_good,  # True if Bad (issues found), False if Good
            score=response_model.score,
            label=["QUALITY_GOOD"] if is_good else [f"{response_model.type}.{response_model.name}"],
            reason=[response_model.reason]
        )

        return result
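
Not part of the commit: a minimal usage sketch of the shared process_response above, assuming a reply wrapped in a ```json fence as described in the docstring (the reply values are made up):

    # Illustrative only: exercises the fence-stripping and JSON-parsing path above.
    raw = '```json\n{"score": 0, "type": "Effectiveness", "name": "Error_Garbled_Characters", "reason": "Encoding corruption"}\n```'
    detail = BaseTextQuality.process_response(raw)
    # Per the logic above: detail.status is True (issues found), detail.score is 0,
    # detail.label is ["Effectiveness.Error_Garbled_Characters"], and detail.reason
    # is ["Encoding corruption"].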

dingo/model/llm/text_quality/llm_text_quality_v4.py

Lines changed: 4 additions & 3 deletions
@@ -1,13 +1,13 @@
 from dingo.model import Model
-from dingo.model.llm.base_openai import BaseOpenAI
+from dingo.model.llm.text_quality.base_text_quality import BaseTextQuality
 
 
 @Model.llm_register("LLMTextQualityV4")
-class LLMTextQualityV4(BaseOpenAI):
+class LLMTextQualityV4(BaseTextQuality):
     # Metadata for documentation generation
     _metric_info = {
         "category": "Pretrain Text Quality Assessment Metrics",
-        "metric_name": "PromptTextQualityV4",
+        "metric_name": "LLMTextQualityV4",
         "description": "Enhanced text quality evaluation covering completeness (formulas, tables, code), effectiveness (garbled text, spacing), similarity (duplicates), and security (politics, prohibited content)",
         "paper_title": "WanJuanSiLu: A High-Quality Open-Source Webtext Dataset for Low-Resource Languages",
         "paper_url": "https://arxiv.org/abs/2501.14506",
@@ -67,3 +67,4 @@ class LLMTextQualityV4(BaseOpenAI):
 # Input content
 
 """
+    # process_response method is now inherited from BaseTextQuality
dingo/model/llm/text_quality/llm_text_quality_v5.py

Lines changed: 177 additions & 0 deletions
@@ -0,0 +1,177 @@
from dingo.model import Model
from dingo.model.llm.text_quality.base_text_quality import BaseTextQuality


@Model.llm_register("LLMTextQualityV5")
class LLMTextQualityV5(BaseTextQuality):
    # Metadata for documentation generation
    _metric_info = {
        "category": "Pretrain Text Quality Assessment Metrics",
        "metric_name": "LLMTextQualityV5",
        "description": "Impact-driven text quality evaluation for LLM pretraining, focusing on structural completeness, readability, diversity, and safety with quantitative thresholds",
        "paper_title": "WanJuanSiLu: A High-Quality Open-Source Webtext Dataset for Low-Resource Languages",
        "paper_url": "https://arxiv.org/abs/2501.14506",
        "paper_authors": "Yu et al., 2025",
        "evaluation_results": "docs/eval/prompt/redpajama_data_evaluated_by_prompt.md"
    }
    prompt = """
# Role
You are an expert in assessing pretraining data quality for large language models.

# Goal
Evaluate whether this text is suitable for LLM pretraining. Focus on issues that would negatively impact model learning, not minor imperfections.

# Quality Dimensions

## 1. Completeness (结构完整性)
**Impact**: Broken structures prevent models from learning correct formatting patterns.

**Check for**:
- **Error_Formula**: Mathematical expressions with **unmatched delimiters** or **unclosed environments**

  ⚠️ **Normal patterns (DO NOT flag)**:
  - Mixing inline ($...$) and display ($$...$$) formulas
  - Using \\begin{{align}}...\\end{{align}} within $$...$$
  - Line breaks with \\\\ in alignment environments
  - HTML tags: <sub>x</sub>, <sup>2</sup> for subscripts/superscripts
  - Mixing LaTeX and HTML in web-extracted content

  ✅ **Only flag when**:
  - Delimiters unmatched: $ without closing $ (LaTeX context, not dollar signs)
  - Environments unclosed: \\begin{{align}} without \\end{{align}}
  - Syntax broken: \\frac{{a}}{{b missing closing }}
  - HTML tags unclosed: <sub>text without </sub>

  ⚠️ **Important**: Distinguish LaTeX $ from dollar signs ($100)
  - Dollar sign: "$100", "$5.99" (followed by numbers) → NOT LaTeX
  - LaTeX delimiter: "$x$", "$\\alpha$" (contains math symbols) → IS LaTeX
  - Example: "The price is $100 and equation $x=y$ costs $50" has 4 dollar symbols but only 2 are LaTeX delimiters (and they match)

  - Example (BAD): "$x^2 + y^2 is broken here $$a = b$$$"
    (First LaTeX $ never closes, extra $ at end)
  - Example (GOOD): "The item costs $100 and satisfies $x^2 + y^2 = z^2$ where price is $50"
    (Dollar signs for money + proper LaTeX pair)
  - Impact: Only flag errors that prevent >50% of mainstream parsers (pdflatex, MathJax, KaTeX, Pandoc, Jupyter) from rendering

- **Error_Table**: Table structures that are malformed or unreadable
  - Example (BAD): Misaligned columns, missing headers, or garbled HTML tags
  - Impact: Models cannot learn proper table representation

- **Error_Code**: Code blocks with formatting corruption
  - Example (BAD): Line numbers mixed with code, broken syntax highlighting markers
  - Impact: Teaches incorrect code structure

**Key Question**: "Can the model learn proper formatting from this structure?"

---

## 2. Effectiveness (可读性)
**Impact**: Noise prevents models from learning meaningful semantic patterns.

**Check for**:
- **Error_Garbled_Characters**: Encoding issues or anti-crawler artifacts
  - Example (BAD): "’" (broken UTF-8), "□□□" (placeholder chars), "" (BOM)
  - Threshold: >1% of characters are garbled
  - Impact: Corrupts token distributions

- **Error_Words_Stuck**: Missing spaces break tokenization
  - Example (BAD): "Thequickbrownfoxjumpsoverthelazydog"
  - Threshold: >1% of text has word boundaries missing
  - Impact: Wrong subword tokenization patterns

- **Error_Lack_Punctuation**: Sentence boundaries unclear
  - Example (BAD): "I like apples they are red also I like oranges"
  - Impact: Models cannot learn sentence segmentation

**Key Question**: "Would a human find this readable and coherent?"

---

## 3. Similarity (重复性)
**Impact**: Repetitive content reduces training efficiency and causes memorization.

**Check for**:
- **Error_Duplicate**: Excessive repetition that dominates the text
  - Example (BAD): "I like blue. I like blue. I like blue. I like blue..." (>30% duplicate)
  - Threshold: Same sentence/phrase repeats >5 times OR duplicate ratio >30%
  - Impact: Over-represents certain patterns

**Key Question**: "Does this text provide diverse training signal?"

---

## 4. Security (安全性)
**Impact**: Harmful content should not be learned by models.

**Check for**:
- **Error_Politics**: Content promoting extremism, terrorism, ethnic hatred
- **Error_Prohibition**: Violence, pornography, gambling, drugs

**Key Question**: "Is this content safe for model training?"

---

# Evaluation Principles

1. **Focus on Training Impact**: Only flag issues that significantly harm LLM learning
2. **Severity Matters**: Minor typos are OK; systemic corruption is not
3. **Context Awareness**: Academic formulas are expected in papers; garbled text never is
4. **Threshold-Based**: Use quantitative checks (>1%, >30%, >5 times) when possible

---

# Workflow

1. **Quick Scan**: Does the text look generally readable and well-formed?
2. **Identify Category**: If problematic, which dimension is most severely affected?
3. **Verify Impact**: Would this issue meaningfully harm model training?
4. **Assign Label**:
   - Score: 1 (suitable for training) or 0 (unsuitable)
   - Type: 'Good' OR one of ['Completeness', 'Effectiveness', 'Similarity', 'Security']
   - Name: Specific error type (see above)
   - Reason: Brief explanation (1-2 sentences)

---

# Output Format
Return JSON only: {"score": 0/1, "type": "", "name": "", "reason": ""}

# Examples

**Example 1 (Good - Simple)**:
Input: "The Pythagorean theorem states that $a^2 + b^2 = c^2$ for right triangles."
Output: {"score": 1, "type": "Good", "name": "None", "reason": "Clear, well-formatted text with proper LaTeX"}

**Example 1.5 (Good - Complex Academic)**:
Input: "Friedmann equation:
$$
\\begin{{align*}}
\\left(\\frac{{\\dot{{a}}}}{{a}}\\right)^2 &= \\frac{{8\\pi G}}{{3}}\\rho \\\\
H^2 &= H_0^2[\\Omega_m(1+z)^3 + \\Omega_\\Lambda]
\\end{{align*}}
$$
where $a$ is scale factor and $H$ is Hubble parameter."
Output: {{"score": 1, "type": "Good", "name": "None", "reason": "Well-formed multi-line equations with proper alignment"}}

**Example 1.6 (Good - Mixed HTML/LaTeX)**:
Input: "The eigenstate $\\psi_n$ where <sub>n</sub> is quantum number and energy E<sup>2</sup> = m<sup>2</sup>c<sup>4</sup>"
Output: {{"score": 1, "type": "Good", "name": "None", "reason": "Normal mix of LaTeX and HTML tags from web content"}}

**Example 2 (Bad - Completeness)**:
Input: "The formula $x^2 + y^2 is broken here $$a = b$$$"
Output: {"score": 0, "type": "Completeness", "name": "Error_Formula", "reason": "Unmatched delimiters: first $ never closes, extra $ at end"}

**Example 3 (Bad - Effectiveness)**:
Input: "Theappleisredandtasty�withsomegarbledtext□□"
Output: {"score": 0, "type": "Effectiveness", "name": "Error_Garbled_Characters", "reason": "Contains encoding corruption (�, □) and missing spaces (>1% of text)"}

**Example 4 (Bad - Similarity)**:
Input: "Blue is nice. Blue is nice. Blue is nice. Blue is nice. Blue is nice. Blue is nice."
Output: {"score": 0, "type": "Similarity", "name": "Error_Duplicate", "reason": "Same sentence repeats 6 times, indicating low content diversity"}

---

# Input content to evaluate:

"""
    # process_response method is now inherited from BaseTextQuality
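
Not part of the commit: a rough sketch of the arithmetic behind the prompt's duplicate thresholds (same sentence repeats >5 times OR duplicate ratio >30%); the helper name and the naive sentence splitting are hypothetical, for illustration only.

    from collections import Counter

    # Hypothetical helper illustrating the >5 repeats / >30% duplicate-ratio checks.
    def rough_duplicate_check(text: str) -> bool:
        sentences = [s.strip() for s in text.split(".") if s.strip()]
        if not sentences:
            return False
        counts = Counter(sentences)
        most_repeats = counts.most_common(1)[0][1]
        duplicate_ratio = 1 - len(counts) / len(sentences)
        return most_repeats > 5 or duplicate_ratio > 0.30

    # Example 4 from the prompt: "Blue is nice." repeated six times trips both thresholds.
    print(rough_duplicate_check("Blue is nice. " * 6))  # True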

examples/dataset/s3.py

Lines changed: 14 additions & 17 deletions
@@ -11,10 +11,15 @@
 S3_ENDPOINT_URL = os.getenv("S3_ENDPOINT_URL", "https://s3.amazonaws.com")
 S3_BUCKET = os.getenv("S3_BUCKET", "your_bucket_name")  # qa-huawei
 
-# LLM 配置信息
-OPENAI_MODEL = 'deepseek-chat'
-OPENAI_URL = 'https://api.deepseek.com/v1'
-OPENAI_KEY = os.getenv("OPENAI_KEY")
+OPENAI_MODEL = os.getenv("OPENAI_MODEL", "deepseek-chat")
+OPENAI_URL = os.getenv("OPENAI_BASE_URL", "https://api.deepseek.com/v1")
+OPENAI_KEY = os.getenv("OPENAI_API_KEY", "")
+
+llm_config = {
+    "model": OPENAI_MODEL,
+    "key": OPENAI_KEY,
+    "api_url": OPENAI_URL,
+}
 
 input_data = {
     # 数据文件路径
@@ -37,30 +42,22 @@
 
     # 执行器配置
     "executor": {
+        "max_workers": 10,
+        "batch_size": 10,
         "result_save": {
+            "good": True,
             "bad": True,
-            "good": True
+            "all_labels": True
         }
     },
     "evaluator": [
         {
             "fields": {"content": "content"},
             "evals": [
-                {"name": "RuleColonEnd"}
+                {"name": "LLMTextQualityV4", "config": llm_config}
             ]
         }
     ]
-
-    # # 评估器配置
-    # "evaluator": {
-    #     "llm_config": {
-    #         "LLMTextQualityPromptBase": {
-    #             "model": OPENAI_MODEL,
-    #             "key": OPENAI_KEY,
-    #             "api_url": OPENAI_URL,
-    #         }
-    #     }
-    # }
 }
 
 # 创建 InputArgs 实例
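
The example now reads its LLM settings entirely from the environment; a hedged sketch of pointing it at a different OpenAI-compatible endpoint (the values below are placeholders, not part of the commit):

    import os

    # Placeholders only; in practice set these in your shell or CI secrets.
    os.environ["OPENAI_MODEL"] = "gpt-4o-mini"
    os.environ["OPENAI_BASE_URL"] = "https://api.openai.com/v1"
    os.environ["OPENAI_API_KEY"] = "sk-placeholder"  # never commit real keys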

examples/llm_and_rule/llm_local.py

Lines changed: 17 additions & 2 deletions
@@ -1,14 +1,29 @@
+import os
+from pathlib import Path
+
 from dingo.config import InputArgs
 from dingo.exec import Executor
 
+OPENAI_MODEL = os.getenv("OPENAI_MODEL", "deepseek-chat")
+OPENAI_URL = os.getenv("OPENAI_BASE_URL", "https://api.deepseek.com/v1")
+OPENAI_KEY = os.getenv("OPENAI_API_KEY", "")
+
+llm_config = {
+    "model": OPENAI_MODEL,
+    "key": OPENAI_KEY,
+    "api_url": OPENAI_URL,
+}
+
 if __name__ == '__main__':
     input_data = {
-        "input_path": "../../test/data/test_local_jsonl.jsonl",
+        "input_path": str(Path("test/data/test_local_jsonl.jsonl")),
         "dataset": {
             "source": "local",
             "format": "jsonl",
         },
         "executor": {
+            "max_workers": 10,
+            "batch_size": 10,
             "result_save": {
                 "bad": True,
                 "good": True
@@ -18,7 +33,7 @@
         {
             "fields": {"content": "content"},
             "evals": [
-                {"name": "LLMTextRepeat", "config": {"key": "", "api_url": ""}}
+                {"name": "LLMTextQualityV5", "config": llm_config}
             ]
         }
     ]
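
Not shown in this diff: both example scripts go on to wrap input_data in InputArgs and run an executor. A sketch assuming dingo's usual example pattern; the exec_map lookup and execute() call are assumptions, not confirmed by this commit:

    input_args = InputArgs(**input_data)
    executor = Executor.exec_map["local"](input_args)  # assumed executor lookup
    result = executor.execute()  # assumed entry point; inspect result for EvalDetail output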
