docs: streamline README examples and reorganize research references

e06084 · e06084 · commit 42b24e8c351b · 2025-05-21T14:45:14.000+08:00
diff --git a/README.md b/README.md
@@ -55,22 +55,21 @@ pip install dingo-python
 
 ## Example Use Cases
 
-### 1. Using Evaluate Core
+### 1. Evaluate Streaming Data
 
 ```python
 from dingo.config.config import DynamicLLMConfig
 from dingo.io.input.MetaData import MetaData
 from dingo.model.llm.llm_text_quality_model_base import LLMTextQualityModelBase
 from dingo.model.rule.rule_common import RuleEnterAndSpace
 
+data = MetaData(
+    data_id='123',
+    prompt="hello, introduce the world",
+    content="Hello! The world is a vast and diverse place, full of wonders, cultures, and incredible natural beauty."
+)
 
 def llm():
-    data = MetaData(
-        data_id='123',
-        prompt="hello, introduce the world",
-        content="Hello! The world is a vast and diverse place, full of wonders, cultures, and incredible natural beauty."
-    )
-
     LLMTextQualityModelBase.dynamic_config = DynamicLLMConfig(
         key='',
         api_url='',
@@ -81,38 +80,11 @@ def llm():
 
 
 def rule():
-    data = MetaData(
-        data_id='123',
-        prompt="hello, introduce the world",
-        content="Hello! The world is a vast and diverse place, full of wonders, cultures, and incredible natural beauty."
-    )
-
     res = RuleEnterAndSpace().eval(data)
     print(res)
 ```
 
-### 2. Evaluate Local Text File (Plaintext)
-
-```python
-from dingo.io import InputArgs
-from dingo.exec import Executor
-
-# Evaluate a plaintext file
-input_data = {
-    "eval_group": "sft",          # Rule set for SFT data
-    "input_path": "data.txt",      # Path to local text file
-    "dataset": "local",
-    "data_format": "plaintext",    # Format: plaintext
-    "save_data": True              # Save evaluation results
-}
-
-input_args = InputArgs(**input_data)
-executor = Executor.exec_map["local"](input_args)
-result = executor.execute()
-print(result)
-```
-
-### 3. Evaluate Hugging Face Dataset
+### 2. Evaluate Hugging Face Dataset
 
 ```python
 from dingo.io import InputArgs
@@ -132,29 +104,7 @@ result = executor.execute()
 print(result)
 ```
 
-### 4. Evaluate JSON/JSONL Format
-
-```python
-from dingo.io import InputArgs
-from dingo.exec import Executor
-
-# Evaluate a JSON file
-input_data = {
-    "eval_group": "default",       # Default rule set
-    "input_path": "data.json",     # Path to local JSON file
-    "dataset": "local",
-    "data_format": "json",         # Format: json
-    "column_content": "text",      # Column containing the text to evaluate
-    "save_data": True              # Save evaluation results
-}
-
-input_args = InputArgs(**input_data)
-executor = Executor.exec_map["local"](input_args)
-result = executor.execute()
-print(result)
-```
-
-### 5. Using LLM for Evaluation
+### 3. Using LLM for Evaluation
 
 ```python
 from dingo.io import InputArgs
@@ -471,10 +421,15 @@ Dingo includes an experimental Model Context Protocol (MCP) server. For details
 
 # Research & Publications
 
-- **"Comprehensive Data Quality Assessment for Multilingual WebData"** : [WanJuanSiLu: A High-Quality Open-Source Webtext
-Dataset for Low-Resource Languages](https://arxiv.org/pdf/2501.14506)
-- **"Pre-training data quality using the DataMan methodology"** : [DataMan: Data Manager for Pre-training Large Language Models](https://openreview.net/pdf?id=eNbA8Fqir4)
+## Research Powered by Dingo
+- **WanJuanSiLu**: [A High-Quality Open-Source Webtext Dataset for Low-Resource Languages](https://arxiv.org/pdf/2501.14506)
+  *Uses Dingo for comprehensive data quality assessment of multilingual web data*
 
+## Methodologies Implemented in Dingo
+- **DataMan Methodology**: [DataMan: Data Manager for Pre-training Large Language Models](https://openreview.net/pdf?id=eNbA8Fqir4)
+  *Dingo implements the DataMan methodology for pre-training data quality assessment*
+- **RedPajama-Data-v2**: [RedPajama-Data](https://github.com/togethercomputer/RedPajama-Data)
+  *Dingo implements parts of the RedPajama-Data-v2 methodology for web text quality assessment and filtering*
 
 # Future Plans
 
diff --git a/README_zh-CN.md b/README_zh-CN.md
@@ -53,22 +53,21 @@ pip install dingo-python
 
 ## 2. 使用示例
 
-### 2.1 使用评估核心方法
+### 2.1 评估流式数据
 
 ```python
 from dingo.config.config import DynamicLLMConfig
 from dingo.io.input.MetaData import MetaData
 from dingo.model.llm.llm_text_quality_model_base import LLMTextQualityModelBase
 from dingo.model.rule.rule_common import RuleEnterAndSpace
 
+data = MetaData(
+    data_id='123',
+    prompt="hello, introduce the world",
+    content="Hello! The world is a vast and diverse place, full of wonders, cultures, and incredible natural beauty."
+)
 
 def llm():
-    data = MetaData(
-        data_id='123',
-        prompt="hello, introduce the world",
-        content="Hello! The world is a vast and diverse place, full of wonders, cultures, and incredible natural beauty."
-    )
-
     LLMTextQualityModelBase.dynamic_config = DynamicLLMConfig(
         key='',
         api_url='',
@@ -79,38 +78,11 @@ def llm():
 
 
 def rule():
-    data = MetaData(
-        data_id='123',
-        prompt="hello, introduce the world",
-        content="Hello! The world is a vast and diverse place, full of wonders, cultures, and incredible natural beauty."
-    )
-
     res = RuleEnterAndSpace().eval(data)
     print(res)
 ```
 
-### 2.2 评估本地文本文件（纯文本）
-
-```python
-from dingo.io import InputArgs
-from dingo.exec import Executor
-
-# 评估纯文本文件
-input_data = {
-    "eval_group": "sft",          # SFT数据的规则集
-    "input_path": "data.txt",      # 本地文本文件路径
-    "dataset": "local",
-    "data_format": "plaintext",    # 格式: plaintext
-    "save_data": True              # 保存评估结果
-}
-
-input_args = InputArgs(**input_data)
-executor = Executor.exec_map["local"](input_args)
-result = executor.execute()
-print(result)
-```
-
-### 2.3 评估Hugging Face数据集
+### 2.2 评估Hugging Face数据集
 
 ```python
 from dingo.io import InputArgs
@@ -130,29 +102,7 @@ result = executor.execute()
 print(result)
 ```
 
-### 2.4 评估JSON/JSONL格式
-
-```python
-from dingo.io import InputArgs
-from dingo.exec import Executor
-
-# 评估JSON文件
-input_data = {
-    "eval_group": "default",       # 默认规则集
-    "input_path": "data.json",     # 本地JSON文件路径
-    "dataset": "local",
-    "data_format": "json",         # 格式: json
-    "column_content": "text",      # 包含要评估文本的列
-    "save_data": True              # 保存评估结果
-}
-
-input_args = InputArgs(**input_data)
-executor = Executor.exec_map["local"](input_args)
-result = executor.execute()
-print(result)
-```
-
-### 2.5 使用LLM进行评估
+### 2.3 使用LLM进行评估
 
 ```python
 from dingo.io import InputArgs
@@ -470,9 +420,15 @@ Dingo 包含一个实验性的模型上下文协议 (MCP) 服务端。有关运
 
 # 研究与学术成果
 
+## Dingo驱动的研究
+- **WanJuanSiLu**: [A High-Quality Open-Source Webtext Dataset for Low-Resource Languages](https://arxiv.org/pdf/2501.14506)
+  *使用Dingo对多语言网页数据进行全面的数据质量评估*
 
-- **"多语言网页数据的数据质量评估"** : [WanJuanSiLu: A High-Quality Open-Source Webtext Dataset for Low-Resource Languages](https://arxiv.org/pdf/2501.14506)
-- **"使用DataMan方法论评估预训练数据质量"** : [DataMan: Data Manager for Pre-training Large Language Models](https://openreview.net/pdf?id=eNbA8Fqir4)
+## Dingo实现的方法论
+- **DataMan方法论**: [DataMan: Data Manager for Pre-training Large Language Models](https://openreview.net/pdf?id=eNbA8Fqir4)
+  *Dingo实现了DataMan方法论，用于预训练数据质量评估*
+- **RedPajama-Data-v2**: [RedPajama-Data](https://github.com/togethercomputer/RedPajama-Data)
+  *Dingo实现了RedPajama-Data-v2的部分方法论，用于网页文本质量评估和过滤*
 
 # 未来计划