
Commit 5d68449

support ShareGPT dataset as data file
Signed-off-by: guangli.bao <[email protected]>
1 parent ad9513f commit 5d68449

File tree

4 files changed: +148 additions, -0 deletions

- docs/datasets.md
- src/guidellm/utils/prepare_sharegpt_data.sh
- src/guidellm/utils/requirements.txt
- src/guidellm/utils/shareGPT_data_preprocessing.py


docs/datasets.md

Lines changed: 30 additions & 0 deletions
@@ -220,3 +220,33 @@ benchmark_generative_text(data=data, ...)

- For lists of dictionaries, all items must have the same keys.
- For lists of items, all elements must be of the same type.
- A processor/tokenizer is only required if `GUIDELLM__PREFERRED_PROMPT_TOKENS_SOURCE="local"` or `GUIDELLM__PREFERRED_OUTPUT_TOKENS_SOURCE="local"` is set in the environment. In this case, the processor/tokenizer must be specified using the `--processor` argument. If not set, the processor/tokenizer will be set to the model passed in or retrieved from the server.

### ShareGPT Datasets

You can use `ShareGPT_V3_unfiltered_cleaned_split.json` as a benchmark dataset.

1. Download and prepare the ShareGPT dataset. You can specify the proportion of data to process by providing a number between 0 and 1 as an argument to the script:

```bash
cd src/guidellm/utils
pip install -r requirements.txt
bash prepare_sharegpt_data.sh 1
```

In this example, `1` means 100% of the dataset is processed; adjust the value as needed. A Conda environment is recommended for installing the required libraries.

2. Run the benchmark. Example:

```bash
guidellm benchmark \
  --target "http://localhost:8000" \
  --rate-type "throughput" \
  --data-args '{"prompt_column": "value", "split": "train"}' \
  --max-requests 10 \
  --data "/${local_path}/ShareGPT.json"
```
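
Before running the benchmark, you may want to confirm that the prepared file loads cleanly. A minimal sketch, assuming `ShareGPT.json` is in the current working directory and the `datasets` package is installed (this is the same JSON loader the preprocessing script below uses):

```python
# Quick sanity check of the prepared ShareGPT.json before benchmarking.
# Assumes the file is in the current working directory.
from datasets import load_dataset

ds = load_dataset("json", data_files="ShareGPT.json", split="train")
print(f"{len(ds)} records loaded")
print(ds[0])  # inspect the columns referenced by --data-args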
src/guidellm/utils/prepare_sharegpt_data.sh

Lines changed: 4 additions & 0 deletions

@@ -0,0 +1,4 @@

```bash
#!/bin/bash

wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 shareGPT_data_preprocessing.py --parse $1
```
src/guidellm/utils/requirements.txt

Lines changed: 4 additions & 0 deletions

@@ -0,0 +1,4 @@

```text
tqdm
pandas
openai
pyyaml
```
src/guidellm/utils/shareGPT_data_preprocessing.py

Lines changed: 110 additions & 0 deletions

@@ -0,0 +1,110 @@

```python
# SPDX-License-Identifier: Apache-2.0
# Standard
import argparse
import json
import os
import re

# Third Party
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer


def extract_and_save_with_filtering():
    """Extract human prompts and apply filtering conditions."""
    dataset = load_dataset('json', data_files='./ShareGPT.json', split='train')

    filtered_prompts = []

    for example in dataset:
        conversations = example.get('conversations', [])

        if isinstance(conversations, list):
            for turn in conversations:
                if turn.get('from') in ['human', 'user']:
                    prompt_text = turn['value'].strip()

                    # Apply the filtering conditions
                    if (len(prompt_text) >= 10 and  # at least 10 characters
                            len(prompt_text) <= 1000 and  # at most 1000 characters
                            not prompt_text.startswith(('http://', 'https://')) and  # exclude URLs
                            not re.search(r'[<>{}[\]\\]', prompt_text) and  # exclude special characters
                            not prompt_text.isdigit()):  # exclude purely numeric prompts

                        filtered_prompts.append({
                            'from': turn.get('from'),
                            'text': prompt_text,
                            'char_count': len(prompt_text),
                            'word_count': len(prompt_text.split()),
                        })

    return filtered_prompts


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process data percentage.")
    parser.add_argument(
        "--parse",
        type=float,
        default=1,
        help="The percentage of data to process (0 to 1). Default is 1 (100%%).",
    )

    args = parser.parse_args()

    with open("ShareGPT_V3_unfiltered_cleaned_split.json", "r", encoding="utf-8") as file:
        data = json.load(file)

    def estimate_num_tokens(text: str) -> int:
        if not hasattr(estimate_num_tokens, "tokenizer"):
            os.environ["TOKENIZERS_PARALLELISM"] = "false"
            estimate_num_tokens.tokenizer = AutoTokenizer.from_pretrained(
                "mistralai/Mistral-7B-Instruct-v0.2"
            )
        return len(estimate_num_tokens.tokenizer.tokenize(text))

    num_of_ids = len(data)
    print(f"Number of IDs: {num_of_ids}")
    data = data[: int(num_of_ids * args.parse)]

    count = 0

    for d in data:
        d["num_round"] = len(d["conversations"])  # human is one round, gpt is another round
        human_tokens = []
        gpt_tokens = []
        for conv in d["conversations"]:
            if conv["from"] == "human":
                human_tokens.append(estimate_num_tokens(conv["value"]))
            if conv["from"] == "gpt":
                token_number = estimate_num_tokens(conv["value"])
                conv["num_tokens"] = token_number
                gpt_tokens.append(token_number)
        if len(human_tokens) == 0:
            d["average_human_token"] = 0
            d["max_human_token"] = 0
        else:
            d["average_human_token"] = float(np.mean(human_tokens))
            d["max_human_token"] = float(np.max(human_tokens))
        if len(gpt_tokens) == 0:
            d["average_gpt_token"] = 0
            d["max_gpt_token"] = 0
        else:
            d["average_gpt_token"] = float(np.mean(gpt_tokens))
            d["max_gpt_token"] = float(np.max(gpt_tokens))

        count += 1
        print(f"Finished {count}")

    # Save the unfiltered dataset to ShareGPT.json ...
    with open("ShareGPT.json", "w", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False, indent=2)
    # ... then filter the human prompts and overwrite it with the filtered result
    filtered_result = extract_and_save_with_filtering()
    with open("ShareGPT.json", "w", encoding="utf-8") as file:
        json.dump(filtered_result, file, ensure_ascii=False, indent=2)
```
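
For reference, each record that `extract_and_save_with_filtering()` writes to `ShareGPT.json` has the shape sketched below; the field values are illustrative only, not taken from the real dataset:

```python
# Illustrative record shape produced by extract_and_save_with_filtering();
# the values are made up for demonstration.
example_record = {
    "from": "human",
    "text": "Explain the difference between a list and a tuple in Python.",
    "char_count": 60,  # len(text)
    "word_count": 11,  # len(text.split())
}
```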
