
Commit 9c5dfe7

committed
support ShareGPT dataset as data file
Signed-off-by: guangli.bao <[email protected]>
1 parent ad9513f commit 9c5dfe7

4 files changed: +161 −0 lines changed

docs/datasets.md

Lines changed: 24 additions & 0 deletions
@@ -220,3 +220,27 @@ benchmark_generative_text(data=data, ...)
- For lists of dictionaries, all items must have the same keys.
- For lists of items, all elements must be of the same type.
- A processor/tokenizer is only required if `GUIDELLM__PREFERRED_PROMPT_TOKENS_SOURCE="local"` or `GUIDELLM__PREFERRED_OUTPUT_TOKENS_SOURCE="local"` is set in the environment. In this case, the processor/tokenizer must be specified using the `--processor` argument. If not set, the processor/tokenizer will be set to the model passed in or retrieved from the server.
### ShareGPT Datasets

You can use `ShareGPT_V3_unfiltered_cleaned_split.json` as a benchmark dataset.

#### Example Commands

Download and prepare the ShareGPT dataset. You can specify the proportion of data to process by passing a number between 0 and 1 as an argument to the script.

```bash
cd src/guidellm/utils && pip install -r requirements.txt && bash prepare_sharegpt_data.sh 1
```

In this example, `1` means 100% of the dataset is processed; adjust this value as needed. A Conda environment is recommended for installing the dependencies.
```bash
guidellm benchmark \
  --target "http://localhost:8000" \
  --rate-type "throughput" \
  --data-args '{"prompt_column": "value", "split": "train"}' \
  --max-requests 10 \
  --data "/${local_path}/ShareGPT.json"
```
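
Based on the preprocessing script below, the prepared `ShareGPT.json` ends up as a flat list of filtered human prompts with `from`, `text`, `char_count`, and `word_count` fields. A minimal sketch for sanity-checking the file before benchmarking (assumes you run it from the same directory the preparation script wrote to):

```python
import json
from pathlib import Path

# Load the prepared dataset and print a quick summary before benchmarking.
with Path("ShareGPT.json").open("r", encoding="utf-8") as f:
    records = json.load(f)

print(f"{len(records)} filtered prompts")
# Each record carries "from", "text", "char_count", and "word_count" keys.
print(records[0])
```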
src/guidellm/utils/prepare_sharegpt_data.sh

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
#!/bin/bash

wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 preprocessing_sharegpt_data.py --parse "$1"
src/guidellm/utils/preprocessing_sharegpt_data.py

Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
import argparse
import json
import os
import re
from pathlib import Path
from typing import Callable, Optional

import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer

MIN_CHAR = 10
MAX_CHAR = 1000


def create_token_estimator(
    model_name: str = "mistralai/Mistral-7B-Instruct-v0.2",
) -> Callable[[str], int]:
    _tokenizer: Optional[AutoTokenizer] = None

    def initialize() -> None:
        nonlocal _tokenizer
        if _tokenizer is None:
            os.environ["TOKENIZERS_PARALLELISM"] = "false"
            try:
                _tokenizer = AutoTokenizer.from_pretrained(model_name)
            except (OSError, ImportError, ValueError) as e:
                raise RuntimeError(f"Failed to initialize tokenizer: {e}") from e

    def estimate_num_tokens(text: str) -> int:
        initialize()

        if _tokenizer is None:
            return 0

        try:
            encoding = _tokenizer(text, return_tensors=None)
            return len(encoding["input_ids"])
        except (AttributeError, TypeError, RuntimeError) as e:
            raise ValueError(f"Error processing text: {e}") from e

    return estimate_num_tokens


def extract_and_save_with_filtering(file):
    """Extract human prompts and apply filtering conditions."""
    dataset = load_dataset("json", data_files=file, split="train")
    filtered_prompts = []

    for example in dataset:
        conversations = example.get("conversations", [])
        if isinstance(conversations, list):
            for turn in conversations:
                if turn.get("from") in ["human", "user"]:
                    prompt_text = turn["value"].strip()
                    # Apply filter conditions: between 10 and 1000 characters,
                    # not a URL, no special characters, and not a pure number.
                    if (
                        MIN_CHAR <= len(prompt_text) <= MAX_CHAR
                        and not prompt_text.startswith(("http://", "https://"))
                        and not re.search(r"[<>{}[\]\\]", prompt_text)
                        and not prompt_text.isdigit()
                    ):
                        filtered_prompts.append(
                            {
                                "from": turn.get("from"),
                                "text": prompt_text,
                                "char_count": len(prompt_text),
                                "word_count": len(prompt_text.split()),
                            }
                        )

    return filtered_prompts


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process data percentage.")
    parser.add_argument(
        "--parse",
        type=float,
        default=1,
        help="The fraction of data to process (0 to 1). Default is 1 (100%%).",
    )
    args = parser.parse_args()

    sharegpt_file = "ShareGPT_V3_unfiltered_cleaned_split.json"
    with Path(sharegpt_file).open("r", encoding="utf-8") as file:
        data = json.load(file)

    estimate_tokens = create_token_estimator()
    num_of_ids = len(data)
    data = data[: int(num_of_ids * args.parse)]
    for d in data:
        d["num_round"] = len(d["conversations"])
        human_tokens = []
        gpt_tokens = []
        for conv in d["conversations"]:
            if conv["from"] == "human":
                human_tokens.append(estimate_tokens(conv["value"]))
            if conv["from"] == "gpt":
                token_number = estimate_tokens(conv["value"])
                conv["num_tokens"] = token_number
                gpt_tokens.append(token_number)
        if len(human_tokens) == 0:
            d["average_human_token"] = 0
            d["max_human_token"] = 0
        else:
            d["average_human_token"] = float(np.mean(human_tokens))
            d["max_human_token"] = float(np.max(human_tokens))
        if len(gpt_tokens) == 0:
            d["average_gpt_token"] = 0
            d["max_gpt_token"] = 0
        else:
            d["average_gpt_token"] = float(np.mean(gpt_tokens))
            d["max_gpt_token"] = float(np.max(gpt_tokens))

    # Save the token-annotated (unfiltered) conversations to ShareGPT.json.
    with Path("ShareGPT.json").open("w", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False, indent=2)
    # Extract and filter the human prompts, then overwrite ShareGPT.json.
    filtered_result = extract_and_save_with_filtering("ShareGPT.json")
    with Path("ShareGPT.json").open("w", encoding="utf-8") as file:
        json.dump(filtered_result, file, ensure_ascii=False, indent=2)
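
For reference, a minimal usage sketch of the token estimator above, assuming `preprocessing_sharegpt_data.py` is importable from the current directory and the default `mistralai/Mistral-7B-Instruct-v0.2` tokenizer can be downloaded from the Hugging Face Hub:

```python
# Usage sketch: the closure defers tokenizer loading until the first call,
# so constructing the estimator itself is cheap.
from preprocessing_sharegpt_data import create_token_estimator

estimate_tokens = create_token_estimator()
print(estimate_tokens("How do I benchmark an OpenAI-compatible endpoint?"))
```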
src/guidellm/utils/requirements.txt

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
tqdm
pandas
openai
pyyaml
