11import json
2+ import os
3+ import shutil
24
35import gradio as gr
46from dingo .exec import Executor
57from dingo .io import InputArgs
68
79
8- def dingo_demo (input_path , data_format , column_content , rule_list , prompt_list , model , key , api_url ):
9- if not input_path :
10- return 'ValueError: input_path can not be empty, please input.'
10+ def dingo_demo (dataset_source , input_path , uploaded_file , data_format , column_content , rule_list , prompt_list , model ,
11+ key , api_url ):
1112 if not data_format :
12- return 'ValueError: data_format can not be empty, please input.'
13+ return 'ValueError: data_format can not be empty, please input.' , None
1314 if not column_content :
14- return 'ValueError: column_content can not be empty, please input.'
15+ return 'ValueError: column_content can not be empty, please input.' , None
1516 if not rule_list and not prompt_list :
16- return 'ValueError: rule_list and prompt_list can not be empty at the same time.'
17+ return 'ValueError: rule_list and prompt_list can not be empty at the same time.' , None
18+
19+ # Handle input path based on dataset source
20+ if dataset_source == "hugging_face" :
21+ if not input_path :
22+ return 'ValueError: input_path can not be empty for hugging_face dataset, please input.' , None
23+ final_input_path = input_path
24+ else : # local
25+ if not uploaded_file :
26+ return 'ValueError: Please upload a file for local dataset.' , None
27+ final_input_path = uploaded_file .name
1728
1829 input_data = {
19- "input_path" : input_path ,
30+ "dataset" : dataset_source ,
31+ "input_path" : final_input_path ,
32+ "output_path" : "" if dataset_source == 'hugging_face' else os .path .dirname (final_input_path ),
33+ "save_data" : True ,
34+ "save_raw" : True ,
2035 "data_format" : data_format ,
2136 "column_content" : column_content ,
2237 "custom_config" :
@@ -36,9 +51,33 @@ def dingo_demo(input_path, data_format, column_content, rule_list, prompt_list,
3651 }
3752 input_args = InputArgs (** input_data )
3853 executor = Executor .exec_map ["local" ](input_args )
39- result = executor .execute ()
40- summary = result [0 ].to_dict ()
41- return json .dumps (summary , indent = 4 )
54+ executor .execute ()
55+ summary = executor .get_summary ().to_dict ()
56+ detail = executor .get_bad_info_list ()
57+ new_detail = []
58+ for item in detail :
59+ new_detail .append (item .to_raw_dict ())
60+ if summary ['output_path' ]:
61+ shutil .rmtree (summary ['output_path' ])
62+
63+ # 返回两个值:概要信息和详细信息
64+ return json .dumps (summary , indent = 4 ), new_detail
65+
66+
def update_input_components(dataset_source):
    """Choose which dataset input widget is visible for the selected source.

    Args:
        dataset_source: The selected source name. Only the exact value
            ``"hugging_face"`` selects the text box; every other value is
            treated as the local-upload case.

    Returns:
        A two-element list ``[gr.Textbox, gr.File]``. Each component is
        built with only its ``visible`` flag set: the text box is visible
        and the file component hidden for ``"hugging_face"``, and the
        reverse otherwise.
    """
    # Exactly one of the two inputs is shown at any time, so a single flag
    # drives both visibility values.
    from_hub = dataset_source == "hugging_face"
    return [
        gr.Textbox(visible=from_hub),
        gr.File(visible=not from_hub),
    ]
4281
4382
4483if __name__ == '__main__' :
@@ -51,24 +90,76 @@ def dingo_demo(input_path, data_format, column_content, rule_list, prompt_list,
5190 gr .HTML (header )
5291 with gr .Row ():
5392 with gr .Column ():
54- input_path = gr .Textbox (value = 'chupei/format-jsonl' , placeholder = "please input huggingface dataset path" , label = "input_path" )
55- data_format = gr .Dropdown (["jsonl" , "json" , "plaintext" , "listjson" ], label = "data_format" )
56- column_content = gr .Textbox (value = "content" , placeholder = "please input column name of content in dataset" , label = "column_content" )
57- rule_list = gr .CheckboxGroup (choices = rule_options , label = "rule_list" )
58- prompt_list = gr .CheckboxGroup (choices = prompt_options , label = "prompt_list" )
59- model = gr .Textbox (placeholder = "If want to use llm, please input model, such as: deepseek-chat" , label = "model" )
60- key = gr .Textbox (placeholder = "If want to use llm, please input key, such as: 123456789012345678901234567890xx" , label = "key" )
61- api_url = gr .Textbox (placeholder = "If want to use llm, please input api_url, such as: https://api.deepseek.com/v1" , label = "api_url" )
93+ with gr .Column ():
94+ dataset_source = gr .Dropdown (
95+ choices = ["hugging_face" , "local" ],
96+ value = "local" ,
97+ label = "dataset [source]"
98+ )
99+ input_path = gr .Textbox (
100+ value = 'chupei/format-jsonl' ,
101+ placeholder = "please input hugging_face dataset path" ,
102+ label = "input_path" ,
103+ visible = False
104+ )
105+ uploaded_file = gr .File (
106+ label = "upload file" ,
107+ visible = True
108+ )
109+
110+ data_format = gr .Dropdown (
111+ ["jsonl" , "json" , "plaintext" , "listjson" ],
112+ label = "data_format"
113+ )
114+ column_content = gr .Textbox (
115+ value = "content" ,
116+ placeholder = "please input column name of content in dataset" ,
117+ label = "column_content"
118+ )
119+
120+ rule_list = gr .CheckboxGroup (
121+ choices = rule_options ,
122+ label = "rule_list"
123+ )
124+ prompt_list = gr .CheckboxGroup (
125+ choices = prompt_options ,
126+ label = "prompt_list"
127+ )
128+ model = gr .Textbox (
129+ placeholder = "If want to use llm, please input model, such as: deepseek-chat" ,
130+ label = "model"
131+ )
132+ key = gr .Textbox (
133+ placeholder = "If want to use llm, please input key, such as: 123456789012345678901234567890xx" ,
134+ label = "API KEY"
135+ )
136+ api_url = gr .Textbox (
137+ placeholder = "If want to use llm, please input api_url, such as: https://api.deepseek.com/v1" ,
138+ label = "API URL"
139+ )
140+
62141 with gr .Row ():
63142 submit_single = gr .Button (value = "Submit" , interactive = True , variant = "primary" )
143+
64144 with gr .Column ():
65- # 输出组件
66- output = gr .Textbox (label = "output" )
145+ # 修改输出组件部分,使用Tabs
146+ with gr .Tabs ():
147+ with gr .Tab ("Result Summary" ):
148+ summary_output = gr .Textbox (label = "summary" , max_lines = 50 )
149+ with gr .Tab ("Result Detail" ):
150+ detail_output = gr .JSON (label = "detail" , max_height = 800 ) # 使用JSON组件来更好地展示结构化数据
151+
152+ dataset_source .change (
153+ fn = update_input_components ,
154+ inputs = dataset_source ,
155+ outputs = [input_path , uploaded_file ]
156+ )
67157
68158 submit_single .click (
69159 fn = dingo_demo ,
70- inputs = [input_path , data_format , column_content , rule_list , prompt_list , model , key , api_url ],
71- outputs = output
160+ inputs = [dataset_source , input_path , uploaded_file , data_format , column_content , rule_list , prompt_list ,
161+ model , key , api_url ],
162+ outputs = [summary_output , detail_output ] # 修改输出为两个组件
72163 )
73164
74165 # 启动界面
0 commit comments