
Commit 9e91aaa

Merge pull request #83 from e06084/dev
docs: update readme
2 parents e4c8edb + 35f2bba commit 9e91aaa

File tree (2 files changed: +36 −185 lines)

- README.md
- README_zh-CN.md

README.md

Lines changed: 18 additions & 93 deletions
````diff
@@ -55,64 +55,36 @@ pip install dingo-python
 
 ## Example Use Cases
 
-### 1. Using Evaluate Core
+### 1. Evaluate LLM chat data
 
 ```python
 from dingo.config.config import DynamicLLMConfig
 from dingo.io.input.MetaData import MetaData
 from dingo.model.llm.llm_text_quality_model_base import LLMTextQualityModelBase
 from dingo.model.rule.rule_common import RuleEnterAndSpace
 
+data = MetaData(
+    data_id='123',
+    prompt="hello, introduce the world",
+    content="Hello! The world is a vast and diverse place, full of wonders, cultures, and incredible natural beauty."
+)
 
 def llm():
-    data = MetaData(
-        data_id='123',
-        prompt="hello, introduce the world",
-        content="Hello! The world is a vast and diverse place, full of wonders, cultures, and incredible natural beauty."
-    )
-
     LLMTextQualityModelBase.dynamic_config = DynamicLLMConfig(
-        key='',
-        api_url='',
-        # model='',
+        key='YOUR_API_KEY',
+        api_url='https://api.openai.com/v1/chat/completions',
+        model='gpt-4o',
     )
     res = LLMTextQualityModelBase.eval(data)
     print(res)
 
 
 def rule():
-    data = MetaData(
-        data_id='123',
-        prompt="hello, introduce the world",
-        content="Hello! The world is a vast and diverse place, full of wonders, cultures, and incredible natural beauty."
-    )
-
     res = RuleEnterAndSpace().eval(data)
     print(res)
 ```
 
-### 2. Evaluate Local Text File (Plaintext)
-
-```python
-from dingo.io import InputArgs
-from dingo.exec import Executor
-
-# Evaluate a plaintext file
-input_data = {
-    "eval_group": "sft",         # Rule set for SFT data
-    "input_path": "data.txt",    # Path to local text file
-    "dataset": "local",
-    "data_format": "plaintext",  # Format: plaintext
-    "save_data": True            # Save evaluation results
-}
-
-input_args = InputArgs(**input_data)
-executor = Executor.exec_map["local"](input_args)
-result = executor.execute()
-print(result)
-```
-
-### 3. Evaluate Hugging Face Dataset
+### 2. Evaluate Dataset
 
 ```python
 from dingo.io import InputArgs
````
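The refactor in this hunk hoists the shared `MetaData` record to module scope so that `llm()` and `rule()` evaluate the same data instead of each constructing its own copy. A minimal stand-in sketch of that pattern, using a plain dataclass rather than the real dingo API (the `llm_check`/`rule_check` bodies are illustrative placeholders, not dingo's actual evaluators):

```python
from dataclasses import dataclass

# Stand-in for dingo's MetaData; field names mirror the diff above.
@dataclass
class MetaData:
    data_id: str
    prompt: str
    content: str

# One shared record at module scope, as in the updated README example.
data = MetaData(
    data_id='123',
    prompt="hello, introduce the world",
    content="Hello! The world is a vast and diverse place.",
)

def llm_check(record: MetaData) -> dict:
    # Placeholder for LLMTextQualityModelBase.eval(data); a real run
    # would call the configured LLM endpoint.
    return {"data_id": record.data_id, "checker": "llm", "ok": bool(record.content)}

def rule_check(record: MetaData) -> dict:
    # Placeholder in the spirit of RuleEnterAndSpace().eval(data):
    # flag runs of consecutive newlines or spaces in the content.
    noisy = "\n\n" in record.content or "  " in record.content
    return {"data_id": record.data_id, "checker": "rule", "ok": not noisy}

print(llm_check(data))
print(rule_check(data))
```

Sharing one record keeps the two checkers comparable: both report against the same `data_id`, so their results can be joined downstream.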
````diff
@@ -132,58 +104,6 @@ result = executor.execute()
 print(result)
 ```
 
-### 4. Evaluate JSON/JSONL Format
-
-```python
-from dingo.io import InputArgs
-from dingo.exec import Executor
-
-# Evaluate a JSON file
-input_data = {
-    "eval_group": "default",   # Default rule set
-    "input_path": "data.json", # Path to local JSON file
-    "dataset": "local",
-    "data_format": "json",     # Format: json
-    "column_content": "text",  # Column containing the text to evaluate
-    "save_data": True          # Save evaluation results
-}
-
-input_args = InputArgs(**input_data)
-executor = Executor.exec_map["local"](input_args)
-result = executor.execute()
-print(result)
-```
-
-### 5. Using LLM for Evaluation
-
-```python
-from dingo.io import InputArgs
-from dingo.exec import Executor
-
-# Evaluate using GPT model
-input_data = {
-    "input_path": "data.jsonl",  # Path to local JSONL file
-    "dataset": "local",
-    "data_format": "jsonl",
-    "column_content": "content",
-    "custom_config": {
-        "prompt_list": ["PromptRepeat"],  # Prompt to use
-        "llm_config": {
-            "detect_text_quality": {
-                "model": "gpt-4o",
-                "key": "YOUR_API_KEY",
-                "api_url": "https://api.openai.com/v1/chat/completions"
-            }
-        }
-    }
-}
-
-input_args = InputArgs(**input_data)
-executor = Executor.exec_map["local"](input_args)
-result = executor.execute()
-print(result)
-```
-
 ## Command Line Interface
 
 ### Evaluate with Rule Sets
````
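The `Executor.exec_map["local"](input_args)` call that recurs in these examples is a registry lookup: executor backends are stored in a dict keyed by backend name, and the caller picks one at runtime. A hedged stdlib-only sketch of that dispatch pattern (class names and the `register` decorator are illustrative, not dingo's actual implementation):

```python
from typing import Callable, Dict

class Executor:
    # Registry mapping a backend name ("local", "spark", ...) to a factory.
    exec_map: Dict[str, Callable[..., "Executor"]] = {}

    @classmethod
    def register(cls, name: str):
        # Decorator that records a subclass under the given backend name.
        def wrap(subclass):
            cls.exec_map[name] = subclass
            return subclass
        return wrap

    def __init__(self, input_args: dict):
        self.input_args = input_args

    def execute(self):
        raise NotImplementedError

@Executor.register("local")
class LocalExecutor(Executor):
    def execute(self):
        # A real executor would read input_path and run the eval_group rules;
        # here we just echo the configuration to show the dispatch worked.
        return {"backend": "local", "input_path": self.input_args.get("input_path")}

input_args = {"input_path": "data.jsonl", "dataset": "local"}
executor = Executor.exec_map["local"](input_args)
print(executor.execute())
```

The design keeps call sites backend-agnostic: adding a new executor means registering one more class, with no changes to the code that looks up `exec_map`.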
````diff
@@ -471,10 +391,15 @@ Dingo includes an experimental Model Context Protocol (MCP) server. For details
 
 # Research & Publications
 
-- **"Comprehensive Data Quality Assessment for Multilingual WebData"**: [WanJuanSiLu: A High-Quality Open-Source Webtext Dataset for Low-Resource Languages](https://arxiv.org/pdf/2501.14506)
-- **"Pre-training data quality using the DataMan methodology"**: [DataMan: Data Manager for Pre-training Large Language Models](https://openreview.net/pdf?id=eNbA8Fqir4)
+## Research Powered by Dingo
+- **WanJuanSiLu**: [A High-Quality Open-Source Webtext Dataset for Low-Resource Languages](https://arxiv.org/pdf/2501.14506)
+  *Uses Dingo for comprehensive data quality assessment of multilingual web data*
+
+## Methodologies Implemented in Dingo
+- **DataMan Methodology**: [DataMan: Data Manager for Pre-training Large Language Models](https://openreview.net/pdf?id=eNbA8Fqir4)
+  *Dingo implements the DataMan methodology for pre-training data quality assessment*
+- **RedPajama-Data-v2**: [RedPajama-Data](https://github.com/togethercomputer/RedPajama-Data)
+  *Dingo implements parts of the RedPajama-Data-v2 methodology for web text quality assessment and filtering*
 
 # Future Plans
````
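The RedPajama-Data-v2 methodology referenced above annotates web text with simple rule-based quality signals and filters on them. A hedged illustration of that idea (the two signals echo RedPajama's word-count and symbol-to-word-ratio annotations, but the symbol set and thresholds here are made up for the sketch):

```python
def quality_signals(text: str) -> dict:
    # Two illustrative signals in the spirit of RedPajama-Data-v2's
    # rule-based annotations (symbol set and values are assumptions).
    words = text.split()
    n_words = len(words)
    symbol_ratio = sum(1 for ch in text if ch in "#…|") / max(n_words, 1)
    return {
        "num_words": n_words,
        "symbol_to_word_ratio": symbol_ratio,
    }

def passes_filter(text: str, min_words: int = 5, max_symbol_ratio: float = 0.1) -> bool:
    # Keep a document only if every signal clears its (illustrative) threshold.
    s = quality_signals(text)
    return s["num_words"] >= min_words and s["symbol_to_word_ratio"] <= max_symbol_ratio

print(passes_filter("Hello! The world is a vast and diverse place."))
```

Computing signals separately from the filtering decision, as sketched here, is what lets thresholds be tuned after annotation rather than during the crawl.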

README_zh-CN.md

Lines changed: 18 additions & 92 deletions
````diff
@@ -53,64 +53,36 @@ pip install dingo-python
 
 ## 2. Usage Examples
 
-### 2.1 Using the Core Evaluation Methods
+### 2.1 Evaluate LLM Chat Data
 
 ```python
 from dingo.config.config import DynamicLLMConfig
 from dingo.io.input.MetaData import MetaData
 from dingo.model.llm.llm_text_quality_model_base import LLMTextQualityModelBase
 from dingo.model.rule.rule_common import RuleEnterAndSpace
 
+data = MetaData(
+    data_id='123',
+    prompt="hello, introduce the world",
+    content="Hello! The world is a vast and diverse place, full of wonders, cultures, and incredible natural beauty."
+)
 
 def llm():
-    data = MetaData(
-        data_id='123',
-        prompt="hello, introduce the world",
-        content="Hello! The world is a vast and diverse place, full of wonders, cultures, and incredible natural beauty."
-    )
-
     LLMTextQualityModelBase.dynamic_config = DynamicLLMConfig(
-        key='',
-        api_url='',
-        # model='',
+        key='YOUR_API_KEY',
+        api_url='https://api.openai.com/v1/chat/completions',
+        model='gpt-4o',
     )
     res = LLMTextQualityModelBase.eval(data)
     print(res)
 
 
 def rule():
-    data = MetaData(
-        data_id='123',
-        prompt="hello, introduce the world",
-        content="Hello! The world is a vast and diverse place, full of wonders, cultures, and incredible natural beauty."
-    )
-
     res = RuleEnterAndSpace().eval(data)
     print(res)
 ```
 
-### 2.2 Evaluate a Local Text File (Plaintext)
-
-```python
-from dingo.io import InputArgs
-from dingo.exec import Executor
-
-# Evaluate a plaintext file
-input_data = {
-    "eval_group": "sft",         # Rule set for SFT data
-    "input_path": "data.txt",    # Path to local text file
-    "dataset": "local",
-    "data_format": "plaintext",  # Format: plaintext
-    "save_data": True            # Save evaluation results
-}
-
-input_args = InputArgs(**input_data)
-executor = Executor.exec_map["local"](input_args)
-result = executor.execute()
-print(result)
-```
-
-### 2.3 Evaluate a Hugging Face Dataset
+### 2.2 Evaluate a Dataset
 
 ```python
 from dingo.io import InputArgs
````
````diff
@@ -130,58 +102,6 @@ result = executor.execute()
 print(result)
 ```
 
-### 2.4 Evaluate JSON/JSONL Format
-
-```python
-from dingo.io import InputArgs
-from dingo.exec import Executor
-
-# Evaluate a JSON file
-input_data = {
-    "eval_group": "default",   # Default rule set
-    "input_path": "data.json", # Path to local JSON file
-    "dataset": "local",
-    "data_format": "json",     # Format: json
-    "column_content": "text",  # Column containing the text to evaluate
-    "save_data": True          # Save evaluation results
-}
-
-input_args = InputArgs(**input_data)
-executor = Executor.exec_map["local"](input_args)
-result = executor.execute()
-print(result)
-```
-
-### 2.5 Evaluate with an LLM
-
-```python
-from dingo.io import InputArgs
-from dingo.exec import Executor
-
-# Evaluate using a GPT model
-input_data = {
-    "input_path": "data.jsonl",  # Path to local JSONL file
-    "dataset": "local",
-    "data_format": "jsonl",
-    "column_content": "content",
-    "custom_config": {
-        "prompt_list": ["PromptRepeat"],  # Prompt to use
-        "llm_config": {
-            "detect_text_quality": {
-                "model": "gpt-4o",
-                "key": "YOUR_API_KEY",
-                "api_url": "https://api.openai.com/v1/chat/completions"
-            }
-        }
-    }
-}
-
-input_args = InputArgs(**input_data)
-executor = Executor.exec_map["local"](input_args)
-result = executor.execute()
-print(result)
-```
-
 ## 3. Command Line Interface
 
 ### 3.1 Evaluate with Rule Sets
````
````diff
@@ -470,9 +390,15 @@ Dingo includes an experimental Model Context Protocol (MCP) server. For details
 
 # Research & Publications
 
-- **"Data quality assessment for multilingual web data"**: [WanJuanSiLu: A High-Quality Open-Source Webtext Dataset for Low-Resource Languages](https://arxiv.org/pdf/2501.14506)
-- **"Pre-training data quality assessment using the DataMan methodology"**: [DataMan: Data Manager for Pre-training Large Language Models](https://openreview.net/pdf?id=eNbA8Fqir4)
+## Research Powered by Dingo
+- **WanJuanSiLu**: [A High-Quality Open-Source Webtext Dataset for Low-Resource Languages](https://arxiv.org/pdf/2501.14506)
+  *Uses Dingo for comprehensive data quality assessment of multilingual web data*
+
+## Methodologies Implemented in Dingo
+- **DataMan Methodology**: [DataMan: Data Manager for Pre-training Large Language Models](https://openreview.net/pdf?id=eNbA8Fqir4)
+  *Dingo implements the DataMan methodology for pre-training data quality assessment*
+- **RedPajama-Data-v2**: [RedPajama-Data](https://github.com/togethercomputer/RedPajama-Data)
+  *Dingo implements parts of the RedPajama-Data-v2 methodology for web text quality assessment and filtering*
 
 # Future Plans
````
