Skip to content

Commit d4696e1

Browse files
authored
Merge pull request #219 from e06084/dev
feat: update metric doc
2 parents: 8dded23 + d818ca3 — commit d4696e1

14 files changed

+30
-28
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -197,7 +197,7 @@ Dingo provides comprehensive data quality assessment through both rule-based and
197197
📊 **[View Complete Metrics Documentation →](docs/metrics.md)**
198198

199199
Our evaluation system includes:
200-
- **Text Quality Assessment Metrics**: Pre-training data quality evaluation using DataMan methodology and enhanced multi-dimensional assessment
200+
- **Pretrain Text Quality Assessment Metrics**: Pre-training data quality evaluation using DataMan methodology and enhanced multi-dimensional assessment
201201
- **SFT Data Assessment Metrics**: Honest, Helpful, Harmless evaluation for supervised fine-tuning data
202202
- **Classification Metrics**: Topic categorization and content classification
203203
- **Multimodality Assessment Metrics**: Image classification and relevance evaluation

dingo/model/llm/llm_factcheck_public.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ class LLMFactCheckPublic(BaseOpenAI):
3232
"""公开事实性评估器 - 基于 GPT-5 System Card 的两阶段评估"""
3333

3434
_metric_info = {
35-
"category": "Factuality Assessment",
35+
"category": "SFT Data Assessment Metrics",
3636
"quality_dimension": "FACTUAL_CORRECTNESS",
3737
"metric_name": "LLMFactCheckPublic",
3838
"description": "Two-stage factuality evaluation pipeline from GPT-5",

dingo/model/prompt/prompt_code_compare.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
@Model.prompt_register('CodeCompare', [], ['LLMCodeCompare'])
66
class PromptCodeCompare(BasePrompt):
77
_metric_info = {
8-
'category': 'SFT Data Assessment Metrics',
8+
'category': 'Pretrain Text Quality Assessment Metrics',
99
'metric_name': 'PromptCodeCompare',
1010
'description': 'Compares the effectiveness of two tools in extracting code blocks from HTML to Markdown format by evaluating recognition rate and accuracy to determine which tool performs better',
1111
'paper_title': '',

dingo/model/prompt/prompt_dataman_assessment.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,7 +87,7 @@ class PromptDataManAssessment(BasePrompt):
8787

8888
# Metadata for documentation generation
8989
_metric_info = {
90-
"category": "Text Quality Assessment Metrics",
90+
"category": "Pretrain Text Quality Assessment Metrics",
9191
"metric_name": "PromptDataManAssessment",
9292
"description": "Evaluates pre-training data quality using the DataMan methodology (14 standards, 15 domains). Assigns a score (0/1), domain type, quality status, and reason.",
9393
"paper_title": "DataMan: Data Manager for Pre-training Large Language Models",

dingo/model/prompt/prompt_document_parsing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
class PromptDocumentParsingQuality(BasePrompt):
77
# Metadata for documentation generation
88
_metric_info = {
9-
"category": "Document Parsing",
9+
"category": "OCR Eval Metric",
1010
"metric_name": "PromptDocumentParsingQuality",
1111
"description": "Evaluate the quality of general document parsing",
1212
"evaluation_results": "",

dingo/model/prompt/prompt_html_extract_compare.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
@Model.prompt_register("Html_Extract_Compare", [], ['LLMHtmlExtractCompare'])
66
class PromptHtmlExtractCompare(BasePrompt):
77
_metric_info = {
8-
'category': 'SFT Data Assessment Metrics',
8+
'category': 'Pretrain Text Quality Assessment Metrics',
99
'metric_name': 'PromptHtmlExtractCompare',
1010
'description': 'Compares the effectiveness of two HTML extraction tools by evaluating element recognition rate and accuracy across different content types',
1111
'paper_title': '',

dingo/model/prompt/prompt_html_extract_compare_v2.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
@Model.prompt_register("Html_Extract_Compare_V2", ['html_extract_compare'], ['LLMHtmlExtractCompareV2'])
66
class PromptHtmlExtractCompareV2(BasePrompt):
77
_metric_info = {
8-
'category': 'SFT Data Assessment Metrics',
8+
'category': 'Pretrain Text Quality Assessment Metrics',
99
'metric_name': 'PromptHtmlExtractCompareV2',
1010
'description': 'Compares HTML extraction results using diff-match-patch algorithm to identify unique and common content, then evaluates core informational content differences',
1111
'paper_title': '',

dingo/model/prompt/prompt_math_compare.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
@Model.prompt_register('MathCompare', [], ['LLMMathCompare'])
66
class PromptMathCompare(BasePrompt):
77
_metric_info = {
8-
'category': 'SFT Data Assessment Metrics',
8+
'category': 'Pretrain Text Quality Assessment Metrics',
99
'metric_name': 'PromptMathCompare',
1010
'description': 'Compares the effectiveness of two tools in extracting mathematical formulas from HTML to Markdown format by evaluating recognition rate and accuracy to determine which tool performs better',
1111
'paper_title': '',

dingo/model/prompt/prompt_politics.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ class PromptPolitics(BasePrompt):
77

88
# Metadata for documentation generation
99
_metric_info = {
10-
"category": "Text Quality Assessment Metrics",
10+
"category": "Pretrain Text Quality Assessment Metrics",
1111
"metric_name": "PromptPolitics",
1212
"description": "Evaluates whether the text contains politics-related content",
1313
"paper_title": "",

dingo/model/prompt/prompt_table_compare.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
@Model.prompt_register('TableCompare', [], ['LLMTableCompare'])
66
class PromptTableCompare(BasePrompt):
77
_metric_info = {
8-
'category': 'SFT Data Assessment Metrics',
8+
'category': 'Pretrain Text Quality Assessment Metrics',
99
'metric_name': 'PromptTableCompare',
1010
'description': 'Compares the effectiveness of two tools in extracting tables from HTML to Markdown format by evaluating recognition rate and accuracy to determine which tool performs better',
1111
'paper_title': '',

0 commit comments

Comments (0)