60 changes: 60 additions & 0 deletions dingo/model/llm/text_quality/base_text_quality.py
@@ -0,0 +1,60 @@
"""
Base class for text quality evaluators with shared response processing logic.
"""

import json

from dingo.io.output.eval_detail import EvalDetail
from dingo.model.llm.base_openai import BaseOpenAI
from dingo.model.response.response_class import ResponseScoreTypeNameReason


class BaseTextQuality(BaseOpenAI):
"""
Base class for text quality evaluators.
Provides shared response processing logic for LLMTextQualityV4 and V5.
"""

@classmethod
def process_response(cls, response: str) -> EvalDetail:
"""
Process LLM response and convert to EvalDetail.

Handles:
- Cleanup of markdown code blocks (```json and ```)
- JSON parsing
- Creation of EvalDetail with proper status, score, label, and reason

Args:
response: Raw response string from LLM

Returns:
EvalDetail object with evaluation results
"""
# Cleanup markdown code blocks
if response.startswith("```json"):
response = response[7:]
elif response.startswith("```"): # Changed to elif for safety
response = response[3:]
if response.endswith("```"):
response = response[:-3]
response = response.strip()

# Parse JSON response
response_json = json.loads(response)
response_model = ResponseScoreTypeNameReason(**response_json)

# Create EvalDetail with all required fields
# status = False for Good quality (no issues found)
# status = True for Bad quality (issues found)
is_good = response_model.type == "Good"

result = EvalDetail(
metric=cls.__name__,
status=not is_good, # True if Bad (issues found), False if Good
score=response_model.score,
label=["QUALITY_GOOD"] if is_good else [f"{response_model.type}.{response_model.name}"],
reason=[response_model.reason]
)

return result
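For reference, here is a minimal sketch of what this shared helper does with a typical model reply; the reply text and the printed values are illustrative, and the import assumes this PR's module layout:

from dingo.model.llm.text_quality.llm_text_quality_v4 import LLMTextQualityV4

# A reply as it often comes back from the model: JSON wrapped in a ```json fence.
raw_reply = '```json\n{"score": 1, "type": "Good", "name": "None", "reason": "Clear, well-formatted text"}\n```'

detail = LLMTextQualityV4.process_response(raw_reply)
print(detail.metric)  # "LLMTextQualityV4" (taken from cls.__name__)
print(detail.status)  # False -> no issues found
print(detail.label)   # ["QUALITY_GOOD"]
print(detail.reason)  # ["Clear, well-formatted text"]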
7 changes: 4 additions & 3 deletions dingo/model/llm/text_quality/llm_text_quality_v4.py
@@ -1,13 +1,13 @@
from dingo.model import Model
from dingo.model.llm.base_openai import BaseOpenAI
from dingo.model.llm.text_quality.base_text_quality import BaseTextQuality


@Model.llm_register("LLMTextQualityV4")
class LLMTextQualityV4(BaseOpenAI):
class LLMTextQualityV4(BaseTextQuality):
# Metadata for documentation generation
_metric_info = {
"category": "Pretrain Text Quality Assessment Metrics",
"metric_name": "PromptTextQualityV4",
"metric_name": "LLMTextQualityV4",
"description": "Enhanced text quality evaluation covering completeness (formulas, tables, code), effectiveness (garbled text, spacing), similarity (duplicates), and security (politics, prohibited content)",
"paper_title": "WanJuanSiLu: A High-Quality Open-Source Webtext Dataset for Low-Resource Languages",
"paper_url": "https://arxiv.org/abs/2501.14506",
@@ -67,3 +67,4 @@ class LLMTextQualityV4(BaseOpenAI):
# Input content

"""
# process_response method is now inherited from BaseTextQuality
177 changes: 177 additions & 0 deletions dingo/model/llm/text_quality/llm_text_quality_v5.py
@@ -0,0 +1,177 @@
from dingo.model import Model
from dingo.model.llm.text_quality.base_text_quality import BaseTextQuality


@Model.llm_register("LLMTextQualityV5")
class LLMTextQualityV5(BaseTextQuality):
# Metadata for documentation generation
_metric_info = {
"category": "Pretrain Text Quality Assessment Metrics",
"metric_name": "LLMTextQualityV5",
"description": "Impact-driven text quality evaluation for LLM pretraining, focusing on structural completeness, readability, diversity, and safety with quantitative thresholds",
"paper_title": "WanJuanSiLu: A High-Quality Open-Source Webtext Dataset for Low-Resource Languages",
"paper_url": "https://arxiv.org/abs/2501.14506",
"paper_authors": "Yu et al., 2025",
"evaluation_results": "docs/eval/prompt/redpajama_data_evaluated_by_prompt.md"
}
prompt = """
# Role
You are an expert in assessing pretraining data quality for large language models.

# Goal
Evaluate whether this text is suitable for LLM pretraining. Focus on issues that would negatively impact model learning, not minor imperfections.

# Quality Dimensions

## 1. Completeness (结构完整性)
**Impact**: Broken structures prevent models from learning correct formatting patterns.

**Check for**:
- **Error_Formula**: Mathematical expressions with **unmatched delimiters** or **unclosed environments**

⚠️ **Normal patterns (DO NOT flag)**:
- Mixing inline ($...$) and display ($$...$$) formulas
- Using \\begin{{align}}...\\end{{align}} within $$...$$
- Line breaks with \\\\ in alignment environments
- HTML tags: <sub>x</sub>, <sup>2</sup> for subscripts/superscripts
- Mixing LaTeX and HTML in web-extracted content

✅ **Only flag when**:
- Delimiters unmatched: $ without closing $ (LaTeX context, not dollar signs)
- Environments unclosed: \\begin{{align}} without \\end{{align}}
- Syntax broken: \\frac{{a}}{{b missing closing }}
- HTML tags unclosed: <sub>text without </sub>

⚠️ **Important**: Distinguish LaTeX $ from dollar signs ($100)
- Dollar sign: "$100", "$5.99" (followed by numbers) → NOT LaTeX
- LaTeX delimiter: "$x$", "$\\alpha$" (contains math symbols) → IS LaTeX
- Example: "The price is $100 and equation $x=y$ costs $50" has 4 dollar symbols but only 2 are LaTeX delimiters (and they match)

- Example (BAD): "$x^2 + y^2 is broken here $$a = b$$$"
(First LaTeX $ never closes, extra $ at end)
- Example (GOOD): "The item costs $100 and satisfies $x^2 + y^2 = z^2$ where price is $50"
(Dollar signs for money + proper LaTeX pair)
- Impact: Only flag errors that prevent >50% of mainstream parsers (pdflatex, MathJax, KaTeX, Pandoc, Jupyter) from rendering

- **Error_Table**: Table structures that are malformed or unreadable
- Example (BAD): Misaligned columns, missing headers, or garbled HTML tags
- Impact: Models cannot learn proper table representation

- **Error_Code**: Code blocks with formatting corruption
- Example (BAD): Line numbers mixed with code, broken syntax highlighting markers
- Impact: Teaches incorrect code structure

**Key Question**: "Can the model learn proper formatting from this structure?"

---

## 2. Effectiveness (可读性)
**Impact**: Noise prevents models from learning meaningful semantic patterns.

**Check for**:
- **Error_Garbled_Characters**: Encoding issues or anti-crawler artifacts
- Example (BAD): "’" (broken UTF-8), "□□□" (placeholder chars), "" (BOM)
- Threshold: >1% of characters are garbled
- Impact: Corrupts token distributions

- **Error_Words_Stuck**: Missing spaces break tokenization
- Example (BAD): "Thequickbrownfoxjumpsoverthelazydog"
- Threshold: >1% of text has word boundaries missing
- Impact: Wrong subword tokenization patterns

- **Error_Lack_Punctuation**: Sentence boundaries unclear
- Example (BAD): "I like apples they are red also I like oranges"
- Impact: Models cannot learn sentence segmentation

**Key Question**: "Would a human find this readable and coherent?"

---

## 3. Similarity (重复性)
**Impact**: Repetitive content reduces training efficiency and causes memorization.

**Check for**:
- **Error_Duplicate**: Excessive repetition that dominates the text
- Example (BAD): "I like blue. I like blue. I like blue. I like blue..." (>30% duplicate)
- Threshold: Same sentence/phrase repeats >5 times OR duplicate ratio >30%
- Impact: Over-represents certain patterns

**Key Question**: "Does this text provide diverse training signal?"

---

## 4. Security (安全性)
**Impact**: Harmful content should not be learned by models.

**Check for**:
- **Error_Politics**: Content promoting extremism, terrorism, ethnic hatred
- **Error_Prohibition**: Violence, pornography, gambling, drugs

**Key Question**: "Is this content safe for model training?"

---

# Evaluation Principles

1. **Focus on Training Impact**: Only flag issues that significantly harm LLM learning
2. **Severity Matters**: Minor typos are OK; systemic corruption is not
3. **Context Awareness**: Academic formulas are expected in papers; garbled text never is
4. **Threshold-Based**: Use quantitative checks (>1%, >30%, >5 times) when possible

---

# Workflow

1. **Quick Scan**: Does the text look generally readable and well-formed?
2. **Identify Category**: If problematic, which dimension is most severely affected?
3. **Verify Impact**: Would this issue meaningfully harm model training?
4. **Assign Label**:
- Score: 1 (suitable for training) or 0 (unsuitable)
- Type: 'Good' OR one of ['Completeness', 'Effectiveness', 'Similarity', 'Security']
- Name: Specific error type (see above)
- Reason: Brief explanation (1-2 sentences)

---

# Output Format
Return JSON only: {"score": 0/1, "type": "", "name": "", "reason": ""}

# Examples

**Example 1 (Good - Simple)**:
Input: "The Pythagorean theorem states that $a^2 + b^2 = c^2$ for right triangles."
Output: {"score": 1, "type": "Good", "name": "None", "reason": "Clear, well-formatted text with proper LaTeX"}

**Example 1.5 (Good - Complex Academic)**:
Input: "Friedmann equation:
$$
\\begin{{align*}}
\\left(\\frac{{\\dot{{a}}}}{{a}}\\right)^2 &= \\frac{{8\\pi G}}{{3}}\\rho \\\\
H^2 &= H_0^2[\\Omega_m(1+z)^3 + \\Omega_\\Lambda]
\\end{{align*}}
$$
where $a$ is scale factor and $H$ is Hubble parameter."
Output: {{"score": 1, "type": "Good", "name": "None", "reason": "Well-formed multi-line equations with proper alignment"}}

**Example 1.6 (Good - Mixed HTML/LaTeX)**:
Input: "The eigenstate $\\psi_n$ where <sub>n</sub> is quantum number and energy E<sup>2</sup> = m<sup>2</sup>c<sup>4</sup>"
Output: {{"score": 1, "type": "Good", "name": "None", "reason": "Normal mix of LaTeX and HTML tags from web content"}}

**Example 2 (Bad - Completeness)**:
Input: "The formula $x^2 + y^2 is broken here $$a = b$$$"
Output: {"score": 0, "type": "Completeness", "name": "Error_Formula", "reason": "Unmatched delimiters: first $ never closes, extra $ at end"}

**Example 3 (Bad - Effectiveness)**:
Input: "Theappleisredandtasty�withsomegarbledtext□□"
Output: {"score": 0, "type": "Effectiveness", "name": "Error_Garbled_Characters", "reason": "Contains encoding corruption (�, □) and missing spaces (>1% of text)"}

**Example 4 (Bad - Similarity)**:
Input: "Blue is nice. Blue is nice. Blue is nice. Blue is nice. Blue is nice. Blue is nice."
Output: {"score": 0, "type": "Similarity", "name": "Error_Duplicate", "reason": "Same sentence repeats 6 times, indicating low content diversity"}

---

# Input content to evaluate:

"""
# process_response method is now inherited from BaseTextQuality
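For the failure path, the inherited process_response composes the result label from the type and name fields of the JSON output defined above; a short hedged sketch (the reply values are illustrative, not taken from a real run):

from dingo.model.llm.text_quality.llm_text_quality_v5 import LLMTextQualityV5

raw_reply = '{"score": 0, "type": "Effectiveness", "name": "Error_Garbled_Characters", "reason": "Encoding corruption and missing spaces"}'

detail = LLMTextQualityV5.process_response(raw_reply)
print(detail.status)  # True -> issues found
print(detail.score)   # 0
print(detail.label)   # ["Effectiveness.Error_Garbled_Characters"]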
31 changes: 14 additions & 17 deletions examples/dataset/s3.py
@@ -11,10 +11,15 @@
S3_ENDPOINT_URL = os.getenv("S3_ENDPOINT_URL", "https://s3.amazonaws.com")
S3_BUCKET = os.getenv("S3_BUCKET", "your_bucket_name") # qa-huawei

# LLM configuration
OPENAI_MODEL = 'deepseek-chat'
OPENAI_URL = 'https://api.deepseek.com/v1'
OPENAI_KEY = os.getenv("OPENAI_KEY")
OPENAI_MODEL = os.getenv("OPENAI_MODEL", "deepseek-chat")
OPENAI_URL = os.getenv("OPENAI_BASE_URL", "https://api.deepseek.com/v1")
OPENAI_KEY = os.getenv("OPENAI_API_KEY", "")

llm_config = {
"model": OPENAI_MODEL,
"key": OPENAI_KEY,
"api_url": OPENAI_URL,
}

input_data = {
# Data file path
@@ -37,30 +37,22 @@

# Executor configuration
"executor": {
"max_workers": 10,
"batch_size": 10,
"result_save": {
"good": True,
"bad": True,
"good": True
"all_labels": True
}
},
"evaluator": [
{
"fields": {"content": "content"},
"evals": [
{"name": "RuleColonEnd"}
{"name": "LLMTextQualityV4", "config": llm_config}
]
}
]

# # Evaluator configuration
# "evaluator": {
# "llm_config": {
# "LLMTextQualityPromptBase": {
# "model": OPENAI_MODEL,
# "key": OPENAI_KEY,
# "api_url": OPENAI_URL,
# }
# }
# }
}

# Create an InputArgs instance
19 changes: 17 additions & 2 deletions examples/llm_and_rule/llm_local.py
@@ -1,14 +1,29 @@
import os
from pathlib import Path

from dingo.config import InputArgs
from dingo.exec import Executor

OPENAI_MODEL = os.getenv("OPENAI_MODEL", "deepseek-chat")
OPENAI_URL = os.getenv("OPENAI_BASE_URL", "https://api.deepseek.com/v1")
OPENAI_KEY = os.getenv("OPENAI_API_KEY", "")

llm_config = {
"model": OPENAI_MODEL,
"key": OPENAI_KEY,
"api_url": OPENAI_URL,
}

if __name__ == '__main__':
input_data = {
"input_path": "../../test/data/test_local_jsonl.jsonl",
"input_path": str(Path("test/data/test_local_jsonl.jsonl")),
"dataset": {
"source": "local",
"format": "jsonl",
},
"executor": {
"max_workers": 10,
"batch_size": 10,
"result_save": {
"bad": True,
"good": True
@@ -18,7 +18,7 @@
{
"fields": {"content": "content"},
"evals": [
{"name": "LLMTextRepeat", "config": {"key": "", "api_url": ""}}
{"name": "LLMTextQualityV5", "config": llm_config}
]
}
]
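The diff is cut off after the evaluator list; the remainder of the example presumably builds InputArgs and runs an executor. A minimal sketch of that flow, using the imports already at the top of this file (treat the exact Executor lookup as an assumption drawn from the project's documented usage, not as part of this PR):

    input_args = InputArgs(**input_data)               # validate the config above
    executor = Executor.exec_map["local"](input_args)  # pick the local executor
    result = executor.execute()                        # run the configured evals and save results
    print(result)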