MigoXLab · shijinpjlab · Feb 12, 2025 · Feb 12, 2025 · Feb 12, 2025 · Feb 13, 2025
diff --git a/README.md b/README.md
@@ -99,7 +99,10 @@ If the user wants to manually start a frontend page, you need to enter the follo
 python -m dingo.run.vsl --input xxx
 ```
 
-The input followed is the directory of the quality inspection results. Users need to ensure that there is a summary.json file when the directory is opened.
+The argument that follows `--input` is the directory containing the quality inspection results. Make sure that this directory contains a summary.json file. The frontend page looks like this: ![GUI output](docs/assets/dingo_gui.png)
+
+## Online Demo
+Try Dingo in our online demo: [(Hugging Face)🤗](https://huggingface.co/spaces/DataEval/dingo)
 
 # Feature List
 
@@ -275,6 +278,7 @@ If you find this project useful, please consider citing our tool:
 ```
 @misc{dingo,
   title={Dingo: A Comprehensive Data Quality Evaluation Tool for Large Models},
+  author={Dingo Contributors},
   howpublished={\url{https://github.com/DataEval/dingo}},
   year={2024}
 }

diff --git a/README_zh-CN.md b/README_zh-CN.md
@@ -98,7 +98,12 @@ $ cat test/data/config_gpt.json
 python -m dingo.run.vsl --input xxx
 ```
 
-input之后跟随的是质检结果的目录，用户需要确保目录打开后其中有summary.json文件
+input之后跟随的是质检结果的目录，用户需要确保目录打开后其中有summary.json文件。
+前端页面输出效果如下：![GUI output](docs/assets/dingo_gui.png)
+
+## 5.在线demo
+
+尝试使用我们的在线demo: [(Hugging Face)🤗](https://huggingface.co/spaces/DataEval/dingo)
 
 # 三、功能列表
 
@@ -274,6 +279,7 @@ If you find this project useful, please consider citing our tool:
 ```
 @misc{dingo,
   title={Dingo: A Comprehensive Data Quality Evaluation Tool for Large Models},
+  author={Dingo Contributors},
   howpublished={\url{https://github.com/DataEval/dingo}},
   year={2024}
 }

diff --git a/dingo/data/converter/base.py b/dingo/data/converter/base.py
@@ -45,6 +45,21 @@ def find_levels_image(cls, data: json, levels: str) -> List:
         res = reduce(lambda x, y: x[y], levels.split('.'), data)
         return res if isinstance(res, List) else [res]
 
+    @staticmethod
+    def split_text(text, chunk_size=4000):
+        """Split ``text`` into consecutive chunks of at most ``chunk_size`` characters.
+
+        A chunk ends right after the last blank-line separator (``'\n\n'``)
+        found inside the current window when there is one; otherwise the text
+        is cut hard at ``chunk_size`` characters. Concatenating the returned
+        chunks always reproduces ``text``.
+
+        Args:
+            text: The string to split.
+            chunk_size: Maximum number of characters per chunk; must be positive.
+
+        Returns:
+            A list of chunks. Empty input yields ``['']`` so that the caller
+            still emits one record for it instead of dropping it.
+
+        Raises:
+            ValueError: If ``chunk_size`` is not positive (the loop below
+                could never advance).
+        """
+        if chunk_size <= 0:
+            raise ValueError("chunk_size must be positive.")
+        if not text:
+            # Keep empty records visible to downstream checks.
+            return ['']
+
+        chunks = []
+        start = 0
+        # Offsets used by slicing and rfind are character indices, so the
+        # bound must be the character length, not the UTF-8 byte length.
+        total = len(text)
+        while start < total:
+            end = start + chunk_size
+            if end >= total:
+                # The remainder fits into one chunk: do not split it further.
+                chunks.append(text[start:])
+                break
+            newline_pos = text.rfind('\n\n', start, end)
+            if newline_pos == -1 or newline_pos == start:
+                # No usable paragraph break in the window: cut at the limit.
+                cut = end
+            else:
+                # Keep the separator at the end of the current chunk.
+                cut = newline_pos + 2
+            chunks.append(text[start:cut])
+            start = cut
+        return chunks
+
+
 @BaseConverter.register("chatml-jsonl")
 class ChatMLConvertor(BaseConverter):
     """
@@ -99,12 +114,21 @@ def _convert(raw: Union[str, Dict]):
             if isinstance(raw, str):
                 j = json.loads(raw)
             for k, v in j.items():
-                yield MetaData(**{
+                data = MetaData(**{
                     'data_id': cls.find_levels_data(v, input_args.column_id) if input_args.column_id != '' else str(k),
                     'prompt': cls.find_levels_data(v, input_args.column_prompt) if input_args.column_prompt != '' else '',
                     'content': cls.find_levels_data(v, input_args.column_content) if input_args.column_content != '' else '',
                     'raw_data': v
                 })
+                # yield data
+                data_chunks = cls.split_text(data.content)
+                for chunk_id in range(len(data_chunks)):
+                    yield MetaData(**{
+                        'data_id': data.data_id + '_' + str(chunk_id),
+                        'prompt': data.prompt,
+                        'content': data_chunks[chunk_id],
+                        'raw_data': data.raw_data
+                    })
 
         return _convert
 
@@ -131,7 +155,15 @@ def _convert(raw: Union[str, Dict]):
                 'raw_data': {'content': raw}
             })
             cls.data_id += 1
-            return data
+            # return data
+            content_chunks = cls.split_text(data.content, input_args.chunk_size)
+            for chunk_id in range(len(content_chunks)):
+                yield MetaData(**{
+                    'data_id': data.data_id + '_' + str(chunk_id),
+                    'prompt': data.prompt,
+                    'content': content_chunks[chunk_id],
+                    'raw_data': data.raw_data
+                })
 
         return _convert
 
@@ -153,12 +185,21 @@ def _convert(raw: Union[str, Dict]):
             if isinstance(raw, str):
                 j = json.loads(raw)
             cls.data_id += 1
-            return MetaData(**{
+            data = MetaData(**{
                 'data_id': cls.find_levels_data(j, input_args.column_id) if input_args.column_id != '' else str(cls.data_id),
                 'prompt': cls.find_levels_data(j, input_args.column_prompt) if input_args.column_prompt != '' else '',
                 'content': cls.find_levels_data(j, input_args.column_content) if input_args.column_content != '' else '',
                 'raw_data': j
             })
+            # return data
+            content_chunks = cls.split_text(data.content, input_args.chunk_size)
+            for chunk_id in range(len(content_chunks)):
+                yield MetaData(**{
+                    'data_id': data.data_id + '_' + str(chunk_id),
+                    'prompt': data.prompt,
+                    'content': content_chunks[chunk_id],
+                    'raw_data': data.raw_data
+                })
 
         return _convert
 
@@ -181,13 +222,22 @@ def _convert(raw: Union[str, Dict]):
             if isinstance(raw, str):
                 l_j = json.loads(raw)
             for j in l_j:
-                yield MetaData(**{
+                data = MetaData(**{
                     'data_id': cls.find_levels_data(j, input_args.column_id) if input_args.column_id != '' else str(cls.data_id),
                     'prompt': cls.find_levels_data(j, input_args.column_prompt) if input_args.column_prompt != '' else '',
                     'content': cls.find_levels_data(j, input_args.column_content) if input_args.column_content != '' else '',
                     'raw_data': j
                 })
                 cls.data_id += 1
+                # yield data
+                content_chunks = cls.split_text(data.content, input_args.chunk_size)
+                for chunk_id in range(len(content_chunks)):
+                    yield MetaData(**{
+                        'data_id': data.data_id + '_' + str(chunk_id),
+                        'prompt': data.prompt,
+                        'content': content_chunks[chunk_id],
+                        'raw_data': data.raw_data
+                    })
 
         return _convert
 

diff --git a/dingo/io/input/InputArgs.py b/dingo/io/input/InputArgs.py
@@ -25,6 +25,7 @@ class InputArgs(BaseModel):
     start_index: int = 0
     end_index: int = -1
     interval_size: int = 1000
+    chunk_size: int = 4000
 
     # Concurrent settings
     max_workers: int = 1
@@ -86,13 +87,18 @@ def check_args(self):
         if self.start_index < 0:
             raise ValueError("start_index must be non negative.")
 
+        # check end index
         if self.end_index >= 0 and self.end_index < self.start_index:
             raise ValueError("if end_index is non negative, end_index must be greater than start_index")
 
         # check interval size
         if self.interval_size <= 0:
             raise ValueError("interval_size must be positive.")
 
+        # check chunk size
+        if self.chunk_size <= 0:
+            raise ValueError("chunk_size must be positive.")
+
         # check max workers
         if self.max_workers <= 0:
             raise ValueError("max_workers must be a positive integer.")

diff --git a/dingo/run/cli.py b/dingo/run/cli.py
@@ -32,6 +32,8 @@ def parse_args():
                         default=None, help="The number of data end to check.")
     parser.add_argument("--interval_size", type=int,
                         default=None, help="The number of size to save while checking.")
+    parser.add_argument("--chunk_size", type=int,
+                        default=None, help="The size of chunk to split the dataset.")
     parser.add_argument("--max_workers", type=int,
                         default=None, help="The number of max workers to concurrent check. ")
     parser.add_argument("--batch_size", type=int,
@@ -114,6 +116,8 @@ def parse_args():
             input_data['end_index'] = args.end_index
         if args.interval_size:
             input_data['interval_size'] = args.interval_size
+        if args.chunk_size:
+            input_data['chunk_size'] = args.chunk_size
         if args.max_workers:
             input_data['max_workers'] = args.max_workers
         if args.batch_size:

diff --git a/docs/assets/dingo_gui.png b/docs/assets/dingo_gui.png
diff --git a/docs/config.md b/docs/config.md
@@ -18,6 +18,7 @@
 | --start_index             | int  |                0                 |    No    | the number of data start to check.                                                           |
 | --end_index               | int  |                -1                |    No    | the number of data end to check. if it's negative, include the data from start_index to end. |
 | --interval_size           | int  |               1000               |    No    | the number of size to save while checking.                                                   |
+| --chunk_size              | int  |               4000               |    No    | the maximum number of characters per chunk when content is split.                            |
 | --max_workers             | int  |                1                 |    No    | the number of max workers to concurrent check.                                               |
 | --batch_size              | int  |                1                 |    No    | the number of max data for concurrent check.                                                 |
 | --dataset                 | str  |          "hugging_face"          |   Yes    | dataset type, in ['hugging_face', 'local']                                                   |
@@ -47,6 +48,7 @@
 | start_index             | int                   |                0                 |    No    | the number of data start to check.                                                           |
 | end_index               | int                   |                -1                |    No    | the number of data end to check. if it's negative, include the data from start_index to end. |
 | interval_size           | int                   |               1000               |    No    | the number of size to save while checking.                                                   |
+| chunk_size              | int                   |               4000               |    No    | the maximum number of characters per chunk when content is split.                            |
 | max_workers             | int                   |                1                 |    No    | the number of max workers to concurrent check.                                               |
 | batch_size              | int                   |                1                 |    No    | the number of max data for concurrent check.                                                 |
 | dataset                 | str                   |          "hugging_face"          |   Yes    | dataset type, in ['hugging_face', 'local']                                                   |

diff --git a/examples/app_huggingface/app.py b/examples/app_huggingface/app.py
@@ -5,24 +5,24 @@
 from dingo.io import InputArgs
 
 
-def dingo_demo(input_path, data_format, column_content, input_rules, input_prompts, key, api_url):
+def dingo_demo(input_path, data_format, column_content, rule_list, prompt_list, key, api_url):
     if not input_path:
         return 'ValueError: input_path can not be empty, please input.'
     if not data_format:
         return 'ValueError: data_format can not be empty, please input.'
     if not column_content:
         return 'ValueError: column_content can not be empty, please input.'
-    if not input_rules and not input_prompts:
-        return 'ValueError: input_rules and input_prompts can not be empty at the same time.'
+    if not rule_list and not prompt_list:
+        return 'ValueError: rule_list and prompt_list can not be empty at the same time.'
 
     input_data = {
         "input_path": input_path,
         "data_format": data_format,
         "column_content": column_content,
         "custom_config":
             {
-                "rule_list": input_rules,
-                "prompt_list": input_prompts,
+                "rule_list": rule_list,
+                "prompt_list": prompt_list,
                 "llm_config":
                     {
                         "detect_text_quality_detail":
@@ -44,19 +44,30 @@ def dingo_demo(input_path, data_format, column_content, input_rules, input_promp
     rule_options = ['RuleAbnormalChar', 'RuleAbnormalHtml', 'RuleContentNull', 'RuleContentShort', 'RuleEnterAndSpace', 'RuleOnlyUrl']
     prompt_options = ['PromptRepeat', 'PromptContentChaos']
 
-    #接口创建函数
-    #fn设置处理函数，inputs设置输入接口组件，outputs设置输出接口组件
-    #fn,inputs,outputs都是必填函数
-    demo = gr.Interface(
-        fn=dingo_demo,
-        inputs=[
-            gr.Textbox(value='chupei/format-jsonl', placeholder="please input huggingface dataset path"),
-            gr.Dropdown(["jsonl", "json", "plaintext", "listjson"], label="data_format"),
-            gr.Textbox(value="content", placeholder="please input column name of content in dataset"),
-            gr.CheckboxGroup(choices=rule_options, label="rule_list"),
-            gr.CheckboxGroup(choices=prompt_options, label="prompt_list"),
-            'text',
-            'text',
-        ],
-        outputs="text")
+    with open("header.html", "r") as file:
+        header = file.read()
+    with gr.Blocks() as demo:
+        gr.HTML(header)
+        with gr.Row():
+            with gr.Column():
+                input_path = gr.Textbox(value='chupei/format-jsonl', placeholder="please input huggingface dataset path", label="input_path")
+                data_format = gr.Dropdown(["jsonl", "json", "plaintext", "listjson"], label="data_format")
+                column_content = gr.Textbox(value="content", placeholder="please input column name of content in dataset", label="column_content")
+                rule_list = gr.CheckboxGroup(choices=rule_options, label="rule_list")
+                prompt_list = gr.CheckboxGroup(choices=prompt_options, label="prompt_list")
+                key = gr.Textbox(placeholder="If want to use llm, please input the key of it.", label="key")
+                api_url = gr.Textbox(placeholder="If want to use llm, please input the api_url of it.", label="api_url")
+                with gr.Row():
+                    submit_single = gr.Button(value="Submit", interactive=True, variant="primary")
+            with gr.Column():
+                # Output component
+                output = gr.Textbox(label="output")
+
+        submit_single.click(
+            fn=dingo_demo,
+            inputs=[input_path, data_format, column_content, rule_list, prompt_list, key, api_url],
+            outputs=output
+        )
+
+    # Launch the interface
     demo.launch()
diff --git a/examples/app_huggingface/header.html b/examples/app_huggingface/header.html
@@ -0,0 +1,109 @@
+<html><head>
+    <!-- <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/css/bulma.min.css"> -->
+    <link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.15.4/css/all.css">
+  <style>
+    .link-block {
+      border: 1px solid transparent;
+      border-radius: 24px;
+      background-color: rgba(54, 54, 54, 1);
+      cursor: pointer !important;
+    }
+    .link-block:hover {
+      background-color: rgba(54, 54, 54, 0.75) !important;
+      cursor: pointer !important;
+    }
+    .external-link {
+      display: inline-flex;
+      align-items: center;
+      height: 36px;
+      line-height: 36px;
+      padding: 0 16px;
+      cursor: pointer !important;
+    }
+    .external-link,
+    .external-link:hover {
+      cursor: pointer !important;
+    }
+    a {
+      text-decoration: none;
+    }
+  </style></head>
+
+  <body>
+    <div style="
+        display: flex;
+        flex-direction: column;
+        justify-content: center;
+        align-items: center;
+        text-align: center;
+        background: linear-gradient(45deg, #007bff 0%, #0056b3 100%);
+        padding: 24px;
+        gap: 24px;
+        border-radius: 8px;
+      ">
+      <div style="
+          display: flex;
+          flex-direction: column;
+          align-items: center;
+          gap: 16px;
+        ">
+        <div style="display: flex; flex-direction: column; gap: 8px">
+          <h1 style="
+              font-size: 48px;
+              color: #fafafa;
+              margin: 0;
+              font-family: 'Trebuchet MS', 'Lucida Sans Unicode',
+                'Lucida Grande', 'Lucida Sans', Arial, sans-serif;
+            ">
+            Dingo
+          </h1>
+        </div>
+      </div>
+
+      <p style="
+          margin: 0;
+          line-height: 1.6rem;
+          font-size: 16px;
+          color: #fafafa;
+          opacity: 0.8;
+        ">
+        Dingo: A Comprehensive Data Quality Evaluation Tool.<br>
+      </p>
+      <style>
+        .link-block {
+          display: inline-block;
+        }
+        .link-block + .link-block {
+          margin-left: 20px;
+        }
+      </style>
+
+      <div class="column has-text-centered">
+        <div class="publication-links">
+          <!-- Code Link. -->
+          <span class="link-block">
+            <a href="https://github.com/DataEval/dingo" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
+              <span class="icon" style="margin-right: 4px">
+                <i class="fab fa-github" style="color: white; margin-right: 4px"></i>
+              </span>
+              <span style="color: white">Code</span>
+            </a>
+          </span>
+
+          <!-- Package Link (PyPI). -->
+          <span class="link-block">
+            <a href="https://pypi.org/project/dingo-python/" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
+              <span class="icon" style="margin-right: 8px">
+                <i class="fas fa-globe" style="color: white"></i>
+              </span>
+              <span style="color: white">Package</span>
+            </a>
+          </span>
+        </div>
+      </div>
+
+      <!-- New Demo Links -->
+    </div>
+
+
+  </body></html>