Skip to content

Commit 9f846ab

Browse files
authored
feat: update huggingface demo. (#58)
1 parent da23a21 commit 9f846ab

File tree

1 file changed

+113
-22
lines changed
  • examples/app_huggingface

1 file changed

+113
-22
lines changed

examples/app_huggingface/app.py

Lines changed: 113 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,37 @@
11
import json
2+
import os
3+
import shutil
24

35
import gradio as gr
46
from dingo.exec import Executor
57
from dingo.io import InputArgs
68

79

8-
def dingo_demo(input_path, data_format, column_content, rule_list, prompt_list, model, key, api_url):
9-
if not input_path:
10-
return 'ValueError: input_path can not be empty, please input.'
10+
def dingo_demo(dataset_source, input_path, uploaded_file, data_format, column_content, rule_list, prompt_list, model,
11+
key, api_url):
1112
if not data_format:
12-
return 'ValueError: data_format can not be empty, please input.'
13+
return 'ValueError: data_format can not be empty, please input.', None
1314
if not column_content:
14-
return 'ValueError: column_content can not be empty, please input.'
15+
return 'ValueError: column_content can not be empty, please input.', None
1516
if not rule_list and not prompt_list:
16-
return 'ValueError: rule_list and prompt_list can not be empty at the same time.'
17+
return 'ValueError: rule_list and prompt_list can not be empty at the same time.', None
18+
19+
# Handle input path based on dataset source
20+
if dataset_source == "hugging_face":
21+
if not input_path:
22+
return 'ValueError: input_path can not be empty for hugging_face dataset, please input.', None
23+
final_input_path = input_path
24+
else: # local
25+
if not uploaded_file:
26+
return 'ValueError: Please upload a file for local dataset.', None
27+
final_input_path = uploaded_file.name
1728

1829
input_data = {
19-
"input_path": input_path,
30+
"dataset": dataset_source,
31+
"input_path": final_input_path,
32+
"output_path": "" if dataset_source == 'hugging_face' else os.path.dirname(final_input_path),
33+
"save_data": True,
34+
"save_raw": True,
2035
"data_format": data_format,
2136
"column_content": column_content,
2237
"custom_config":
@@ -36,9 +51,33 @@ def dingo_demo(input_path, data_format, column_content, rule_list, prompt_list,
3651
}
3752
input_args = InputArgs(**input_data)
3853
executor = Executor.exec_map["local"](input_args)
39-
result = executor.execute()
40-
summary = result[0].to_dict()
41-
return json.dumps(summary, indent=4)
54+
executor.execute()
55+
summary = executor.get_summary().to_dict()
56+
detail = executor.get_bad_info_list()
57+
new_detail = []
58+
for item in detail:
59+
new_detail.append(item.to_raw_dict())
60+
if summary['output_path']:
61+
shutil.rmtree(summary['output_path'])
62+
63+
# Return two values: the summary and the detail records
64+
return json.dumps(summary, indent=4), new_detail
65+
66+
67+
def update_input_components(dataset_source):
68+
# Return different input components depending on the dataset source
69+
if dataset_source == "hugging_face":
70+
# If the dataset source is hugging_face, return a visible textbox and a hidden file component
71+
return [
72+
gr.Textbox(visible=True),
73+
gr.File(visible=False),
74+
]
75+
else: # local
76+
# If the dataset source is local, return a hidden textbox and a visible file component
77+
return [
78+
gr.Textbox(visible=False),
79+
gr.File(visible=True),
80+
]
4281

4382

4483
if __name__ == '__main__':
@@ -51,24 +90,76 @@ def dingo_demo(input_path, data_format, column_content, rule_list, prompt_list,
5190
gr.HTML(header)
5291
with gr.Row():
5392
with gr.Column():
54-
input_path = gr.Textbox(value='chupei/format-jsonl', placeholder="please input huggingface dataset path", label="input_path")
55-
data_format = gr.Dropdown(["jsonl", "json", "plaintext", "listjson"], label="data_format")
56-
column_content = gr.Textbox(value="content", placeholder="please input column name of content in dataset", label="column_content")
57-
rule_list = gr.CheckboxGroup(choices=rule_options, label="rule_list")
58-
prompt_list = gr.CheckboxGroup(choices=prompt_options, label="prompt_list")
59-
model = gr.Textbox(placeholder="If want to use llm, please input model, such as: deepseek-chat", label="model")
60-
key = gr.Textbox(placeholder="If want to use llm, please input key, such as: 123456789012345678901234567890xx", label="key")
61-
api_url = gr.Textbox(placeholder="If want to use llm, please input api_url, such as: https://api.deepseek.com/v1", label="api_url")
93+
with gr.Column():
94+
dataset_source = gr.Dropdown(
95+
choices=["hugging_face", "local"],
96+
value="local",
97+
label="dataset [source]"
98+
)
99+
input_path = gr.Textbox(
100+
value='chupei/format-jsonl',
101+
placeholder="please input hugging_face dataset path",
102+
label="input_path",
103+
visible=False
104+
)
105+
uploaded_file = gr.File(
106+
label="upload file",
107+
visible=True
108+
)
109+
110+
data_format = gr.Dropdown(
111+
["jsonl", "json", "plaintext", "listjson"],
112+
label="data_format"
113+
)
114+
column_content = gr.Textbox(
115+
value="content",
116+
placeholder="please input column name of content in dataset",
117+
label="column_content"
118+
)
119+
120+
rule_list = gr.CheckboxGroup(
121+
choices=rule_options,
122+
label="rule_list"
123+
)
124+
prompt_list = gr.CheckboxGroup(
125+
choices=prompt_options,
126+
label="prompt_list"
127+
)
128+
model = gr.Textbox(
129+
placeholder="If want to use llm, please input model, such as: deepseek-chat",
130+
label="model"
131+
)
132+
key = gr.Textbox(
133+
placeholder="If want to use llm, please input key, such as: 123456789012345678901234567890xx",
134+
label="API KEY"
135+
)
136+
api_url = gr.Textbox(
137+
placeholder="If want to use llm, please input api_url, such as: https://api.deepseek.com/v1",
138+
label="API URL"
139+
)
140+
62141
with gr.Row():
63142
submit_single = gr.Button(value="Submit", interactive=True, variant="primary")
143+
64144
with gr.Column():
65-
# Output component
66-
output = gr.Textbox(label="output")
145+
# Revised output section: organize the outputs with Tabs
146+
with gr.Tabs():
147+
with gr.Tab("Result Summary"):
148+
summary_output = gr.Textbox(label="summary", max_lines=50)
149+
with gr.Tab("Result Detail"):
150+
detail_output = gr.JSON(label="detail", max_height=800) # Use the JSON component to better display structured data
151+
152+
dataset_source.change(
153+
fn=update_input_components,
154+
inputs=dataset_source,
155+
outputs=[input_path, uploaded_file]
156+
)
67157

68158
submit_single.click(
69159
fn=dingo_demo,
70-
inputs=[input_path, data_format, column_content, rule_list, prompt_list, model, key, api_url],
71-
outputs=output
160+
inputs=[dataset_source, input_path, uploaded_file, data_format, column_content, rule_list, prompt_list,
161+
model, key, api_url],
162+
outputs=[summary_output, detail_output] # Outputs now go to two components
72163
)
73164

74165
# Launch the interface

0 commit comments

Comments
 (0)