diff --git a/README.md b/README.md
index 894092eb..53d063c0 100644
--- a/README.md
+++ b/README.md
@@ -99,7 +99,10 @@ If the user wants to manually start a frontend page, you need to enter the follo
 python -m dingo.run.vsl --input xxx
 ```
 
-The input followed is the directory of the quality inspection results. Users need to ensure that there is a summary.json file when the directory is opened.
+The input followed is the directory of the quality inspection results. Users need to ensure that there is a summary.json file when the directory is opened. Frontend page of output looks like:
+
+## Online Demo
+Try dingo on our online demo: [(Hugging Face)🤗](https://huggingface.co/spaces/DataEval/dingo)
 
 # Feature List
 
@@ -275,6 +278,7 @@ If you find this project useful, please consider citing our tool:
 ```
 @misc{dingo,
   title={Dingo: A Comprehensive Data Quality Evaluation Tool for Large Models},
+  author={Dingo Contributors},
   howpublished={\url{https://github.com/DataEval/dingo}},
   year={2024}
 }
diff --git a/README_zh-CN.md b/README_zh-CN.md
index 2236aa3a..33671c53 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -98,7 +98,12 @@ $ cat test/data/config_gpt.json
 python -m dingo.run.vsl --input xxx
 ```
 
-input之后跟随的是质检结果的目录,用户需要确保目录打开后其中有summary.json文件
+input之后跟随的是质检结果的目录,用户需要确保目录打开后其中有summary.json文件。
+前端页面输出效果如下:
+
+## 5.在线demo
+
+尝试使用我们的在线demo: [(Hugging Face)🤗](https://huggingface.co/spaces/DataEval/dingo)
 
 # 三、功能列表
 
@@ -274,6 +279,7 @@ If you find this project useful, please consider citing our tool:
 ```
 @misc{dingo,
   title={Dingo: A Comprehensive Data Quality Evaluation Tool for Large Models},
+  author={Dingo Contributors},
   howpublished={\url{https://github.com/DataEval/dingo}},
   year={2024}
 }
diff --git a/dingo/data/converter/base.py b/dingo/data/converter/base.py
index 227b21c7..003d29c5 100644
--- a/dingo/data/converter/base.py
+++ b/dingo/data/converter/base.py
@@ -45,6 +45,21 @@ def find_levels_image(cls, data: json, levels: str) -> List:
         res = reduce(lambda x, y: x[y], levels.split('.'), data)
         return res if isinstance(res, List) else [res]
 
+    def split_text(text, chunk_size=4000):
+        chunks = []
+        start = 0
+        while start < len(text.encode("utf-8")):
+            end = start + chunk_size
+            newline_pos = text.rfind('\n\n', start, end)
+            if newline_pos == -1 or newline_pos == start:
+                chunk = text[start:end]
+                start = end
+            else:
+                chunk = text[start:newline_pos + 2]
+                start = newline_pos + 2
+            chunks.append(chunk)
+        return chunks
+
 
 @BaseConverter.register("chatml-jsonl")
 class ChatMLConvertor(BaseConverter):
     """
@@ -99,12 +114,21 @@ def _convert(raw: Union[str, Dict]):
             if isinstance(raw, str):
                 j = json.loads(raw)
                 for k, v in j.items():
-                    yield MetaData(**{
+                    data = MetaData(**{
                         'data_id': cls.find_levels_data(v, input_args.column_id) if input_args.column_id != '' else str(k),
                         'prompt': cls.find_levels_data(v, input_args.column_prompt) if input_args.column_prompt != '' else '',
                         'content': cls.find_levels_data(v, input_args.column_content) if input_args.column_content != '' else '',
                         'raw_data': v
                     })
+                    # yield data
+                    data_chunks = cls.split_text(data.content)
+                    for chunk_id in range(len(data_chunks)):
+                        yield MetaData(**{
+                            'data_id': data.data_id + '_' + str(chunk_id),
+                            'prompt': data.prompt,
+                            'content': data_chunks[chunk_id],
+                            'raw_data': data.raw_data
+                        })
 
         return _convert
 
@@ -131,7 +155,15 @@ def _convert(raw: Union[str, Dict]):
                 'raw_data': {'content': raw}
             })
             cls.data_id += 1
-            return data
+            # return data
+            content_chunks = cls.split_text(data.content, input_args.chunk_size)
+            for chunk_id in range(len(content_chunks)):
+                yield MetaData(**{
+                    'data_id': data.data_id + '_' + str(chunk_id),
+                    'prompt': data.prompt,
+                    'content': content_chunks[chunk_id],
+                    'raw_data': data.raw_data
+                })
 
         return _convert
 
@@ -153,12 +185,21 @@ def _convert(raw: Union[str, Dict]):
             if isinstance(raw, str):
                 j = json.loads(raw)
                 cls.data_id += 1
-                return MetaData(**{
+                data = MetaData(**{
                     'data_id': cls.find_levels_data(j, input_args.column_id) if input_args.column_id != '' else str(cls.data_id),
                     'prompt': cls.find_levels_data(j, input_args.column_prompt) if input_args.column_prompt != '' else '',
                    'content': cls.find_levels_data(j, input_args.column_content) if input_args.column_content != '' else '',
                     'raw_data': j
                 })
+                # return data
+                content_chunks = cls.split_text(data.content, input_args.chunk_size)
+                for chunk_id in range(len(content_chunks)):
+                    yield MetaData(**{
+                        'data_id': data.data_id + '_' + str(chunk_id),
+                        'prompt': data.prompt,
+                        'content': content_chunks[chunk_id],
+                        'raw_data': data.raw_data
+                    })
 
         return _convert
 
@@ -181,13 +222,22 @@ def _convert(raw: Union[str, Dict]):
             if isinstance(raw, str):
                 l_j = json.loads(raw)
                 for j in l_j:
-                    yield MetaData(**{
+                    data = MetaData(**{
                         'data_id': cls.find_levels_data(j, input_args.column_id) if input_args.column_id != '' else str(cls.data_id),
                         'prompt': cls.find_levels_data(j, input_args.column_prompt) if input_args.column_prompt != '' else '',
                         'content': cls.find_levels_data(j, input_args.column_content) if input_args.column_content != '' else '',
                         'raw_data': j
                     })
                     cls.data_id += 1
+                    # yield data
+                    content_chunks = cls.split_text(data.content, input_args.chunk_size)
+                    for chunk_id in range(len(content_chunks)):
+                        yield MetaData(**{
+                            'data_id': data.data_id + '_' + str(chunk_id),
+                            'prompt': data.prompt,
+                            'content': content_chunks[chunk_id],
+                            'raw_data': data.raw_data
+                        })
 
         return _convert
 
diff --git a/dingo/io/input/InputArgs.py b/dingo/io/input/InputArgs.py
index 2d5d2c06..6b3427cb 100644
--- a/dingo/io/input/InputArgs.py
+++ b/dingo/io/input/InputArgs.py
@@ -25,6 +25,7 @@ class InputArgs(BaseModel):
     start_index: int = 0
     end_index: int = -1
     interval_size: int = 1000
+    chunk_size: int = 4000
 
     # Concurrent settings
     max_workers: int = 1
@@ -86,6 +87,7 @@ def check_args(self):
         if self.start_index < 0:
             raise ValueError("start_index must be non negative.")
 
+        # check end index
         if self.end_index >= 0 and self.end_index < self.start_index:
             raise ValueError("if end_index is non negative, end_index must be greater than start_index")
 
@@ -93,6 +95,10 @@ def check_args(self):
         if self.interval_size <= 0:
             raise ValueError("interval_size must be positive.")
 
+        # check chunk size
+        if self.chunk_size <= 0:
+            raise ValueError("chunk_size must be positive.")
+
         # check max workers
         if self.max_workers <= 0:
             raise ValueError("max_workers must be a positive integer.")
diff --git a/dingo/run/cli.py b/dingo/run/cli.py
index cad2059c..738e9b64 100644
--- a/dingo/run/cli.py
+++ b/dingo/run/cli.py
@@ -32,6 +32,8 @@ def parse_args():
                         default=None, help="The number of data end to check.")
     parser.add_argument("--interval_size", type=int,
                         default=None, help="The number of size to save while checking.")
+    parser.add_argument("--chunk_size", type=int,
+                        default=None, help="The size of chunk to split the dataset.")
     parser.add_argument("--max_workers", type=int,
                         default=None, help="The number of max workers to concurrent check. ")
     parser.add_argument("--batch_size", type=int,
@@ -114,6 +116,8 @@ def parse_args():
         input_data['end_index'] = args.end_index
     if args.interval_size:
         input_data['interval_size'] = args.interval_size
+    if args.chunk_size:
+        input_data['chunk_size'] = args.chunk_size
     if args.max_workers:
         input_data['max_workers'] = args.max_workers
     if args.batch_size:
diff --git a/docs/assets/dingo_gui.png b/docs/assets/dingo_gui.png
new file mode 100644
index 00000000..72c463e9
Binary files /dev/null and b/docs/assets/dingo_gui.png differ
diff --git a/docs/config.md b/docs/config.md
index 90bb287e..e742f496 100644
--- a/docs/config.md
+++ b/docs/config.md
@@ -18,6 +18,7 @@
 | --start_index | int | 0 | No | the number of data start to check. |
 | --end_index | int | -1 | No | the number of data end to check. if it's negative, include the data from start_index to end. |
 | --interval_size | int | 1000 | No | the number of size to save while checking. |
+| --chunk_size | int | 4000 | No | the size of chunk to split the dataset. |
 | --max_workers | int | 1 | No | the number of max workers to concurrent check. |
 | --batch_size | int | 1 | No | the number of max data for concurrent check. |
 | --dataset | str | "hugging_face" | Yes | dataset type, in ['hugging_face', 'local'] |
@@ -47,6 +48,7 @@
 | start_index | int | 0 | No | the number of data start to check. |
 | end_index | int | -1 | No | the number of data end to check. if it's negative, include the data from start_index to end. |
 | interval_size | int | 1000 | No | the number of size to save while checking. |
+| chunk_size | int | 4000 | No | the size of chunk to split the dataset. |
 | max_workers | int | 1 | No | the number of max workers to concurrent check. |
 | batch_size | int | 1 | No | the number of max data for concurrent check. |
 | dataset | str | "hugging_face" | Yes | dataset type, in ['hugging_face', 'local'] |
diff --git a/examples/app_huggingface/app.py b/examples/app_huggingface/app.py
index 8aed83a4..a59995bb 100644
--- a/examples/app_huggingface/app.py
+++ b/examples/app_huggingface/app.py
@@ -5,15 +5,15 @@
 from dingo.io import InputArgs
 
 
-def dingo_demo(input_path, data_format, column_content, input_rules, input_prompts, key, api_url):
+def dingo_demo(input_path, data_format, column_content, rule_list, prompt_list, key, api_url):
     if not input_path:
         return 'ValueError: input_path can not be empty, please input.'
     if not data_format:
         return 'ValueError: data_format can not be empty, please input.'
     if not column_content:
         return 'ValueError: column_content can not be empty, please input.'
-    if not input_rules and not input_prompts:
-        return 'ValueError: input_rules and input_prompts can not be empty at the same time.'
+    if not rule_list and not prompt_list:
+        return 'ValueError: rule_list and prompt_list can not be empty at the same time.'
 
     input_data = {
         "input_path": input_path,
@@ -21,8 +21,8 @@ def dingo_demo(input_path, data_format, column_content, input_rules, input_promp
         "column_content": column_content,
         "custom_config":
             {
-                "rule_list": input_rules,
-                "prompt_list": input_prompts,
+                "rule_list": rule_list,
+                "prompt_list": prompt_list,
                 "llm_config":
                     {
                         "detect_text_quality_detail":
@@ -44,19 +44,30 @@ def dingo_demo(input_path, data_format, column_content, input_rules, input_promp
     rule_options = ['RuleAbnormalChar', 'RuleAbnormalHtml', 'RuleContentNull', 'RuleContentShort', 'RuleEnterAndSpace', 'RuleOnlyUrl']
     prompt_options = ['PromptRepeat', 'PromptContentChaos']
 
-    #接口创建函数
-    #fn设置处理函数,inputs设置输入接口组件,outputs设置输出接口组件
-    #fn,inputs,outputs都是必填函数
-    demo = gr.Interface(
-        fn=dingo_demo,
-        inputs=[
-            gr.Textbox(value='chupei/format-jsonl', placeholder="please input huggingface dataset path"),
-            gr.Dropdown(["jsonl", "json", "plaintext", "listjson"], label="data_format"),
-            gr.Textbox(value="content", placeholder="please input column name of content in dataset"),
-            gr.CheckboxGroup(choices=rule_options, label="rule_list"),
-            gr.CheckboxGroup(choices=prompt_options, label="prompt_list"),
-            'text',
-            'text',
-        ],
-        outputs="text")
+    with open("header.html", "r") as file:
+        header = file.read()
+    with gr.Blocks() as demo:
+        gr.HTML(header)
+        with gr.Row():
+            with gr.Column():
+                input_path = gr.Textbox(value='chupei/format-jsonl', placeholder="please input huggingface dataset path", label="input_path")
+                data_format = gr.Dropdown(["jsonl", "json", "plaintext", "listjson"], label="data_format")
+                column_content = gr.Textbox(value="content", placeholder="please input column name of content in dataset", label="column_content")
+                rule_list = gr.CheckboxGroup(choices=rule_options, label="rule_list")
+                prompt_list = gr.CheckboxGroup(choices=prompt_options, label="prompt_list")
+                key = gr.Textbox(placeholder="If want to use llm, please input the key of it.", label="key")
+                api_url = gr.Textbox(placeholder="If want to use llm, please input the api_url of it.", label="api_url")
+                with gr.Row():
+                    submit_single = gr.Button(value="Submit", interactive=True, variant="primary")
+            with gr.Column():
+                # 输出组件
+                output = gr.Textbox(label="output")
+
+        submit_single.click(
+            fn=dingo_demo,
+            inputs=[input_path, data_format, column_content, rule_list, prompt_list, key, api_url],
+            outputs=output
+        )
+
+    # 启动界面
     demo.launch()
diff --git a/examples/app_huggingface/header.html b/examples/app_huggingface/header.html
new file mode 100644
index 00000000..78f11d45
--- /dev/null
+++ b/examples/app_huggingface/header.html
@@ -0,0 +1,109 @@
[header.html adds a 109-line HTML banner for the Gradio demo; its markup was not preserved in this excerpt. Surviving title text: "Dingo: A Comprehensive Data Quality Evaluation Tool."]
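
Below are a few illustrative sketches of the changes in this patch; none of them are part of the patch itself. First, the chunking helper added to `BaseConverter`: it walks the text in `chunk_size`-character windows and prefers to cut at the last blank line (`\n\n`) inside each window, falling back to a hard cut when no blank line is found. The sketch copies the helper and runs it on a made-up sample with a small `chunk_size` so the behaviour is visible.

```python
# Standalone copy of the split_text helper from this patch, plus a tiny demo.
# The sample text and the small chunk_size are made up for illustration only.

def split_text(text, chunk_size=4000):
    # As in the patch: the loop bound measures UTF-8 bytes, while rfind and
    # slicing work on character indices; for ASCII text the two coincide.
    chunks = []
    start = 0
    while start < len(text.encode("utf-8")):
        end = start + chunk_size
        newline_pos = text.rfind('\n\n', start, end)
        if newline_pos == -1 or newline_pos == start:
            chunk = text[start:end]              # no blank line in the window: hard cut
            start = end
        else:
            chunk = text[start:newline_pos + 2]  # cut just after the blank line
            start = newline_pos + 2
        chunks.append(chunk)
    return chunks


if __name__ == "__main__":
    sample = ("First paragraph about data quality.\n\n"
              "Second paragraph, a bit longer than the first one.\n\n"
              "Third paragraph.")
    for chunk_id, chunk in enumerate(split_text(sample, chunk_size=40)):
        # The convertors build ids the same way: '<original data_id>_<chunk_id>'
        print(f"doc1_{chunk_id}: {chunk!r}")
```

With this in place, each convertor emits one `MetaData` per chunk rather than one per record, with ids suffixed `_0`, `_1`, ... as shown in the hunks above.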
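Second, the new `chunk_size` setting. The patch exposes it as `--chunk_size` on the CLI, adds it as an `InputArgs` field (default 4000) with a positivity check in `check_args`, and documents it in docs/config.md. Below is a minimal sketch of setting it programmatically; the fields other than `chunk_size` are assumed from app.py and docs/config.md, and the file path is hypothetical.

```python
# Minimal sketch: carrying the new chunk_size through InputArgs.
# Field names other than chunk_size are assumed from app.py / docs/config.md;
# the dataset path below is hypothetical.
from dingo.io import InputArgs

input_args = InputArgs(
    input_path="test/data/sample.jsonl",  # hypothetical local file
    dataset="local",                      # ['hugging_face', 'local'] per docs/config.md
    data_format="jsonl",
    column_content="content",
    chunk_size=2000,                      # records longer than ~2000 characters get split
)

# The patch adds this guard to check_args():
input_args.check_args()                   # raises ValueError if chunk_size <= 0
```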
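Finally, the Hugging Face demo app is rebuilt from `gr.Interface` into an explicit `gr.Blocks` layout: labelled inputs in the left column, an output textbox in the right column, the new header.html rendered through `gr.HTML`, and a submit button wired to `dingo_demo` via `.click()`. A stripped-down sketch of the same wiring pattern, using a stand-in `echo` callback and only two of the inputs:

```python
# Stripped-down sketch of the gr.Blocks wiring used in the reworked app.py.
# echo() is a stand-in for dingo_demo(); gradio must be installed.
import gradio as gr


def echo(input_path, data_format):
    return f"would evaluate {input_path!r} as {data_format}"


with gr.Blocks() as demo:
    gr.HTML("<h1>Dingo demo</h1>")  # app.py loads this banner from header.html instead
    with gr.Row():
        with gr.Column():
            input_path = gr.Textbox(label="input_path")
            data_format = gr.Dropdown(["jsonl", "json", "plaintext", "listjson"], label="data_format")
            submit = gr.Button(value="Submit", variant="primary")
        with gr.Column():
            output = gr.Textbox(label="output")
    # The click handler routes the inputs to the callback and writes its return value to output.
    submit.click(fn=echo, inputs=[input_path, data_format], outputs=output)

if __name__ == "__main__":
    demo.launch()
```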