Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,10 @@ If the user wants to manually start a frontend page, you need to enter the follo
python -m dingo.run.vsl --input xxx
```

The input followed is the directory of the quality inspection results. Users need to ensure that there is a summary.json file when the directory is opened.
The `--input` argument is followed by the directory of the quality inspection results. Users need to ensure that this directory contains a summary.json file. The frontend page looks like this:

![GUI output](docs/assets/dingo_gui.png)

## Online Demo
Try dingo on our online demo: [(Hugging Face)🤗](https://huggingface.co/spaces/DataEval/dingo)

# Feature List

Expand Down Expand Up @@ -275,6 +278,7 @@ If you find this project useful, please consider citing our tool:
```
@misc{dingo,
title={Dingo: A Comprehensive Data Quality Evaluation Tool for Large Models},
author={Dingo Contributors},
howpublished={\url{https://github.com/DataEval/dingo}},
year={2024}
}
Expand Down
8 changes: 7 additions & 1 deletion README_zh-CN.md
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,12 @@ $ cat test/data/config_gpt.json
python -m dingo.run.vsl --input xxx
```

input之后跟随的是质检结果的目录,用户需要确保目录打开后其中有summary.json文件
input之后跟随的是质检结果的目录,用户需要确保目录打开后其中有summary.json文件。
前端页面输出效果如下:![GUI output](docs/assets/dingo_gui.png)

## 5.在线demo

尝试使用我们的在线demo: [(Hugging Face)🤗](https://huggingface.co/spaces/DataEval/dingo)

# 三、功能列表

Expand Down Expand Up @@ -274,6 +279,7 @@ If you find this project useful, please consider citing our tool:
```
@misc{dingo,
title={Dingo: A Comprehensive Data Quality Evaluation Tool for Large Models},
author={Dingo Contributors},
howpublished={\url{https://github.com/DataEval/dingo}},
year={2024}
}
Expand Down
58 changes: 54 additions & 4 deletions dingo/data/converter/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,21 @@ def find_levels_image(cls, data: json, levels: str) -> List:
res = reduce(lambda x, y: x[y], levels.split('.'), data)
return res if isinstance(res, List) else [res]

def split_text(text, chunk_size=4000):
    """Split ``text`` into consecutive chunks of at most ``chunk_size`` characters.

    For each window of ``chunk_size`` characters, the chunk ends right after
    the last paragraph break (two consecutive newlines) found inside that
    window. If the window holds no paragraph break, or the only one sits at
    the very start of the window, the chunk is cut at exactly ``chunk_size``
    characters. This rule is applied to the final window as well, so a
    remainder shorter than ``chunk_size`` is still cut at its last paragraph
    break. Concatenating the returned chunks reproduces ``text``.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk. Must be positive.

    Returns:
        The list of non-empty chunks in their original order. An empty
        ``text`` yields an empty list.

    Raises:
        ValueError: If ``chunk_size`` is not positive (the loop below could
            never advance and would not terminate).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive.")

    chunks = []
    start = 0
    # Slicing and ``rfind`` work on character indices, so the loop bound must
    # be the character length. Using the UTF-8 byte length here made the loop
    # run past the end of non-ASCII text and append empty chunks.
    text_len = len(text)
    while start < text_len:
        end = start + chunk_size
        newline_pos = text.rfind('\n\n', start, end)
        if newline_pos == -1 or newline_pos == start:
            # No usable paragraph break in this window: hard cut.
            chunk = text[start:end]
            start = end
        else:
            # Keep the paragraph break attached to the chunk it terminates.
            chunk = text[start:newline_pos + 2]
            start = newline_pos + 2
        chunks.append(chunk)
    return chunks

@BaseConverter.register("chatml-jsonl")
class ChatMLConvertor(BaseConverter):
"""
Expand Down Expand Up @@ -99,12 +114,21 @@ def _convert(raw: Union[str, Dict]):
if isinstance(raw, str):
j = json.loads(raw)
for k, v in j.items():
yield MetaData(**{
data = MetaData(**{
'data_id': cls.find_levels_data(v, input_args.column_id) if input_args.column_id != '' else str(k),
'prompt': cls.find_levels_data(v, input_args.column_prompt) if input_args.column_prompt != '' else '',
'content': cls.find_levels_data(v, input_args.column_content) if input_args.column_content != '' else '',
'raw_data': v
})
# yield data
data_chunks = cls.split_text(data.content)
for chunk_id in range(len(data_chunks)):
yield MetaData(**{
'data_id': data.data_id + '_' + str(chunk_id),
'prompt': data.prompt,
'content': data_chunks[chunk_id],
'raw_data': data.raw_data
})

return _convert

Expand All @@ -131,7 +155,15 @@ def _convert(raw: Union[str, Dict]):
'raw_data': {'content': raw}
})
cls.data_id += 1
return data
# return data
content_chunks = cls.split_text(data.content, input_args.chunk_size)
for chunk_id in range(len(content_chunks)):
yield MetaData(**{
'data_id': data.data_id + '_' + str(chunk_id),
'prompt': data.prompt,
'content': content_chunks[chunk_id],
'raw_data': data.raw_data
})

return _convert

Expand All @@ -153,12 +185,21 @@ def _convert(raw: Union[str, Dict]):
if isinstance(raw, str):
j = json.loads(raw)
cls.data_id += 1
return MetaData(**{
data = MetaData(**{
'data_id': cls.find_levels_data(j, input_args.column_id) if input_args.column_id != '' else str(cls.data_id),
'prompt': cls.find_levels_data(j, input_args.column_prompt) if input_args.column_prompt != '' else '',
'content': cls.find_levels_data(j, input_args.column_content) if input_args.column_content != '' else '',
'raw_data': j
})
# return data
content_chunks = cls.split_text(data.content, input_args.chunk_size)
for chunk_id in range(len(content_chunks)):
yield MetaData(**{
'data_id': data.data_id + '_' + str(chunk_id),
'prompt': data.prompt,
'content': content_chunks[chunk_id],
'raw_data': data.raw_data
})

return _convert

Expand All @@ -181,13 +222,22 @@ def _convert(raw: Union[str, Dict]):
if isinstance(raw, str):
l_j = json.loads(raw)
for j in l_j:
yield MetaData(**{
data = MetaData(**{
'data_id': cls.find_levels_data(j, input_args.column_id) if input_args.column_id != '' else str(cls.data_id),
'prompt': cls.find_levels_data(j, input_args.column_prompt) if input_args.column_prompt != '' else '',
'content': cls.find_levels_data(j, input_args.column_content) if input_args.column_content != '' else '',
'raw_data': j
})
cls.data_id += 1
# yield data
content_chunks = cls.split_text(data.content, input_args.chunk_size)
for chunk_id in range(len(content_chunks)):
yield MetaData(**{
'data_id': data.data_id + '_' + str(chunk_id),
'prompt': data.prompt,
'content': content_chunks[chunk_id],
'raw_data': data.raw_data
})

return _convert

Expand Down
6 changes: 6 additions & 0 deletions dingo/io/input/InputArgs.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ class InputArgs(BaseModel):
start_index: int = 0
end_index: int = -1
interval_size: int = 1000
chunk_size: int = 4000

# Concurrent settings
max_workers: int = 1
Expand Down Expand Up @@ -86,13 +87,18 @@ def check_args(self):
if self.start_index < 0:
raise ValueError("start_index must be non negative.")

# check end index
if self.end_index >= 0 and self.end_index < self.start_index:
raise ValueError("if end_index is non negative, end_index must be greater than start_index")

# check interval size
if self.interval_size <= 0:
raise ValueError("interval_size must be positive.")

# check chunk size
if self.chunk_size <= 0:
raise ValueError("chunk_size must be positive.")

# check max workers
if self.max_workers <= 0:
raise ValueError("max_workers must be a positive integer.")
Expand Down
4 changes: 4 additions & 0 deletions dingo/run/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ def parse_args():
default=None, help="The number of data end to check.")
parser.add_argument("--interval_size", type=int,
default=None, help="The number of size to save while checking.")
parser.add_argument("--chunk_size", type=int,
default=None, help="The size of chunk to split the dataset.")
parser.add_argument("--max_workers", type=int,
default=None, help="The number of max workers to concurrent check. ")
parser.add_argument("--batch_size", type=int,
Expand Down Expand Up @@ -114,6 +116,8 @@ def parse_args():
input_data['end_index'] = args.end_index
if args.interval_size:
input_data['interval_size'] = args.interval_size
if args.chunk_size:
input_data['chunk_size'] = args.chunk_size
if args.max_workers:
input_data['max_workers'] = args.max_workers
if args.batch_size:
Expand Down
Binary file added docs/assets/dingo_gui.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 2 additions & 0 deletions docs/config.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
| --start_index | int | 0 | No | the number of data start to check. |
| --end_index | int | -1 | No | the number of data end to check. if it's negative, include the data from start_index to end. |
| --interval_size | int | 1000 | No | the number of size to save while checking. |
| --chunk_size | int | 4000 | No | the maximum number of characters per chunk when splitting the content of each record. |
| --max_workers | int | 1 | No | the number of max workers to concurrent check. |
| --batch_size | int | 1 | No | the number of max data for concurrent check. |
| --dataset | str | "hugging_face" | Yes | dataset type, in ['hugging_face', 'local'] |
Expand Down Expand Up @@ -47,6 +48,7 @@
| start_index | int | 0 | No | the number of data start to check. |
| end_index | int | -1 | No | the number of data end to check. if it's negative, include the data from start_index to end. |
| interval_size | int | 1000 | No | the number of size to save while checking. |
| chunk_size | int | 4000 | No | the maximum number of characters per chunk when splitting the content of each record. |
| max_workers | int | 1 | No | the number of max workers to concurrent check. |
| batch_size | int | 1 | No | the number of max data for concurrent check. |
| dataset | str | "hugging_face" | Yes | dataset type, in ['hugging_face', 'local'] |
Expand Down
51 changes: 31 additions & 20 deletions examples/app_huggingface/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,24 +5,24 @@
from dingo.io import InputArgs


def dingo_demo(input_path, data_format, column_content, input_rules, input_prompts, key, api_url):
def dingo_demo(input_path, data_format, column_content, rule_list, prompt_list, key, api_url):
if not input_path:
return 'ValueError: input_path can not be empty, please input.'
if not data_format:
return 'ValueError: data_format can not be empty, please input.'
if not column_content:
return 'ValueError: column_content can not be empty, please input.'
if not input_rules and not input_prompts:
return 'ValueError: input_rules and input_prompts can not be empty at the same time.'
if not rule_list and not prompt_list:
return 'ValueError: rule_list and prompt_list can not be empty at the same time.'

input_data = {
"input_path": input_path,
"data_format": data_format,
"column_content": column_content,
"custom_config":
{
"rule_list": input_rules,
"prompt_list": input_prompts,
"rule_list": rule_list,
"prompt_list": prompt_list,
"llm_config":
{
"detect_text_quality_detail":
Expand All @@ -44,19 +44,30 @@ def dingo_demo(input_path, data_format, column_content, input_rules, input_promp
rule_options = ['RuleAbnormalChar', 'RuleAbnormalHtml', 'RuleContentNull', 'RuleContentShort', 'RuleEnterAndSpace', 'RuleOnlyUrl']
prompt_options = ['PromptRepeat', 'PromptContentChaos']

#接口创建函数
#fn设置处理函数,inputs设置输入接口组件,outputs设置输出接口组件
#fn,inputs,outputs都是必填函数
demo = gr.Interface(
fn=dingo_demo,
inputs=[
gr.Textbox(value='chupei/format-jsonl', placeholder="please input huggingface dataset path"),
gr.Dropdown(["jsonl", "json", "plaintext", "listjson"], label="data_format"),
gr.Textbox(value="content", placeholder="please input column name of content in dataset"),
gr.CheckboxGroup(choices=rule_options, label="rule_list"),
gr.CheckboxGroup(choices=prompt_options, label="prompt_list"),
'text',
'text',
],
outputs="text")
# Load the static HTML banner rendered at the top of the page. The encoding is
# given explicitly so the read does not depend on the platform default.
with open("header.html", "r", encoding="utf-8") as file:
    header = file.read()

# Build the UI: the input form sits in the left column and the result box in
# the right column.
with gr.Blocks() as demo:
    gr.HTML(header)
    with gr.Row():
        with gr.Column():
            input_path = gr.Textbox(value='chupei/format-jsonl', placeholder="please input huggingface dataset path", label="input_path")
            data_format = gr.Dropdown(["jsonl", "json", "plaintext", "listjson"], label="data_format")
            column_content = gr.Textbox(value="content", placeholder="please input column name of content in dataset", label="column_content")
            rule_list = gr.CheckboxGroup(choices=rule_options, label="rule_list")
            prompt_list = gr.CheckboxGroup(choices=prompt_options, label="prompt_list")
            key = gr.Textbox(placeholder="If want to use llm, please input the key of it.", label="key")
            api_url = gr.Textbox(placeholder="If want to use llm, please input the api_url of it.", label="api_url")
            with gr.Row():
                submit_single = gr.Button(value="Submit", interactive=True, variant="primary")
        with gr.Column():
            # Output component
            output = gr.Textbox(label="output")

    # The component order in ``inputs`` must match the positional parameters
    # of ``dingo_demo``.
    submit_single.click(
        fn=dingo_demo,
        inputs=[input_path, data_format, column_content, rule_list, prompt_list, key, api_url],
        outputs=output
    )

# Launch the interface
demo.launch()
109 changes: 109 additions & 0 deletions examples/app_huggingface/header.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
<html><head>
<!-- <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/css/bulma.min.css"> -->
<link rel="stylesheet" href="https://use.fontawesome.com/releases/v5.15.4/css/all.css">
<style>
.link-block {
border: 1px solid transparent;
border-radius: 24px;
background-color: rgba(54, 54, 54, 1);
cursor: pointer !important;
}
.link-block:hover {
background-color: rgba(54, 54, 54, 0.75) !important;
cursor: pointer !important;
}
.external-link {
display: inline-flex;
align-items: center;
height: 36px;
line-height: 36px;
padding: 0 16px;
cursor: pointer !important;
}
.external-link,
.external-link:hover {
cursor: pointer !important;
}
a {
text-decoration: none;
}
</style></head>

<body>
<div style="
display: flex;
flex-direction: column;
justify-content: center;
align-items: center;
text-align: center;
background: linear-gradient(45deg, #007bff 0%, #0056b3 100%);
padding: 24px;
gap: 24px;
border-radius: 8px;
">
<div style="
display: flex;
flex-direction: column;
align-items: center;
gap: 16px;
">
<div style="display: flex; flex-direction: column; gap: 8px">
<h1 style="
font-size: 48px;
color: #fafafa;
margin: 0;
font-family: 'Trebuchet MS', 'Lucida Sans Unicode',
'Lucida Grande', 'Lucida Sans', Arial, sans-serif;
">
Dingo
</h1>
</div>
</div>

<p style="
margin: 0;
line-height: 1.6rem;
font-size: 16px;
color: #fafafa;
opacity: 0.8;
">
Dingo: A Comprehensive Data Quality Evaluation Tool.<br>
</p>
<style>
.link-block {
display: inline-block;
}
.link-block + .link-block {
margin-left: 20px;
}
</style>

<div class="column has-text-centered">
<div class="publication-links">
<!-- Code Link. -->
<span class="link-block">
<a href="https://github.com/DataEval/dingo" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
<span class="icon" style="margin-right: 4px">
<i class="fab fa-github" style="color: white; margin-right: 4px"></i>
</span>
<span style="color: white">Code</span>
</a>
</span>

<!-- Package Link (PyPI). -->
<span class="link-block">
<a href="https://pypi.org/project/dingo-python/" class="external-link button is-normal is-rounded is-dark" style="text-decoration: none; cursor: pointer">
<span class="icon" style="margin-right: 8px">
<i class="fas fa-globe" style="color: white"></i>
</span>
<span style="color: white">Package</span>
</a>
</span>
</div>
</div>

<!-- New Demo Links -->
</div>


</body></html>