diff --git a/contrib/sharegpt_preprocess/README.md b/contrib/sharegpt_preprocess/README.md
new file mode 100644
index 00000000..476a172a
--- /dev/null
+++ b/contrib/sharegpt_preprocess/README.md
@@ -0,0 +1,53 @@
+# ShareGPT Datasets
+
+You can use ShareGPT_V3_unfiltered_cleaned_split.json as a benchmark dataset.
+
+## Prerequisites
+Before you begin, ensure you have the following installed:
+
+* Python 3.9 or higher
+* pip (Python package manager)
+
+A Conda environment is recommended for installing the dependencies.
+
+## Example Commands
+
+Download and prepare the ShareGPT dataset. You can specify the proportion of data to process by passing a number between 0 and 1 as an argument to the script.
+
+```bash
+cd contrib/sharegpt_preprocess
+pip install -r requirements.txt
+bash prepare_sharegpt_data.sh 1
+```
+
+In this example, 1 indicates processing 100% of the dataset. You can adjust this value as needed.
+
+Then benchmark against the processed dataset (replace `${local_path}` with the directory containing ShareGPT.json):
+
+```bash
+guidellm benchmark \
+  --target "http://localhost:8000" \
+  --rate-type "throughput" \
+  --data-args '{"prompt_column": "text", "split": "train"}' \
+  --max-requests 10 \
+  --data "${local_path}/ShareGPT.json"
+```
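+
+## Output Format
+
+The final ShareGPT.json is a JSON array of the filtered human prompts; each entry carries "from", "text", "char_count", and "word_count" fields. Below is a minimal sanity-check sketch, assuming the commands above have been run in the current directory:
+
+```python
+import json
+
+# Load the prompts produced by preprocessing_sharegpt_data.py.
+with open("ShareGPT.json", encoding="utf-8") as f:
+    prompts = json.load(f)
+
+# Each entry has "from", "text", "char_count", and "word_count" keys;
+# every retained prompt is 10-1000 characters long.
+print(f"{len(prompts)} prompts retained")
+if prompts:
+    avg = sum(p["char_count"] for p in prompts) / len(prompts)
+    print(f"average length: {avg:.1f} chars")
+```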
diff --git a/contrib/sharegpt_preprocess/__init__.py b/contrib/sharegpt_preprocess/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/contrib/sharegpt_preprocess/prepare_sharegpt_data.sh b/contrib/sharegpt_preprocess/prepare_sharegpt_data.sh
new file mode 100644
index 00000000..e0ff1779
--- /dev/null
+++ b/contrib/sharegpt_preprocess/prepare_sharegpt_data.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+
+wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+python3 preprocessing_sharegpt_data.py --parse "$1"
diff --git a/contrib/sharegpt_preprocess/preprocessing_sharegpt_data.py b/contrib/sharegpt_preprocess/preprocessing_sharegpt_data.py
new file mode 100644
index 00000000..102affb4
--- /dev/null
+++ b/contrib/sharegpt_preprocess/preprocessing_sharegpt_data.py
@@ -0,0 +1,124 @@
+import argparse
+import json
+import os
+import re
+from pathlib import Path
+from typing import Optional
+
+import numpy as np
+from datasets import load_dataset
+from transformers import AutoTokenizer, PreTrainedTokenizerBase
+
+MIN_CHAR = 10
+MAX_CHAR = 1000
+
+
+class TokenCounter:
+    def __init__(self, model_name: str = "mistralai/Mistral-7B-Instruct-v0.2"):
+        self.model_name = model_name
+        self._tokenizer: Optional[PreTrainedTokenizerBase] = None
+
+    def _initialize_tokenizer(self) -> None:
+        if self._tokenizer is None:
+            os.environ["TOKENIZERS_PARALLELISM"] = "false"
+            try:
+                self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
+            except (OSError, ImportError, ValueError) as e:
+                raise RuntimeError(f"Failed to initialize tokenizer: {e}") from e
+
+    def estimate_num_tokens(self, text: str) -> int:
+        self._initialize_tokenizer()
+
+        if self._tokenizer is None:
+            return 0
+
+        try:
+            encoding = self._tokenizer(text, return_tensors=None)
+            return len(encoding["input_ids"])
+        except (AttributeError, TypeError, RuntimeError) as e:
+            raise ValueError(f"Error processing text: {e}") from e
+
+
+def extract_and_save_with_filtering(file):
+    """Extract human prompts and apply the filtering conditions."""
+    dataset = load_dataset("json", data_files=file, split="train")
+    filtered_prompts = []
+
+    for example in dataset:
+        conversations = example.get("conversations", [])
+        if isinstance(conversations, list):
+            for turn in conversations:
+                if turn.get("from") in ["human", "user"]:
+                    prompt_text = turn["value"].strip()
+                    # apply filter conditions: at least 10 characters
+                    if (
+                        len(prompt_text) >= MIN_CHAR
+                        # at most 1000 characters
+                        and len(prompt_text) <= MAX_CHAR
+                        # exclude URLs
+                        and not prompt_text.startswith(("http://", "https://"))
+                        # exclude special characters
+                        and not re.search(r"[<>{}[\]\\]", prompt_text)
+                        # exclude pure numbers
+                        and not prompt_text.isdigit()
+                    ):
+                        filtered_prompts.append(
+                            {
+                                "from": turn.get("from"),
+                                "text": prompt_text,
+                                "char_count": len(prompt_text),
+                                "word_count": len(prompt_text.split()),
+                            }
+                        )
+
+    return filtered_prompts
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Preprocess a fraction of the ShareGPT dataset.")
+    parser.add_argument(
+        "--parse",
+        type=float,
+        default=1,
+        help="The fraction of data to process (0 to 1). Default is 1 (100%%).",
+    )
+    args = parser.parse_args()
+
+    sharegpt_file = "ShareGPT_V3_unfiltered_cleaned_split.json"
+    with Path(sharegpt_file).open("r", encoding="utf-8") as file:
+        data = json.load(file)
+
+    counter = TokenCounter()
+    num_of_ids = len(data)
+    data = data[: int(num_of_ids * args.parse)]
+    for d in data:
+        d["num_round"] = len(d["conversations"])
+        human_tokens = []
+        gpt_tokens = []
+        for conv in d["conversations"]:
+            if conv["from"] == "human":
+                human_tokens.append(counter.estimate_num_tokens(conv["value"]))
+            if conv["from"] == "gpt":
+                token_number = counter.estimate_num_tokens(conv["value"])
+                conv["num_tokens"] = token_number
+                gpt_tokens.append(token_number)
+        if len(human_tokens) == 0:
+            d["average_human_token"] = 0
+            d["max_human_token"] = 0
+        else:
+            d["average_human_token"] = float(np.mean(human_tokens))
+            d["max_human_token"] = float(np.max(human_tokens))
+        if len(gpt_tokens) == 0:
+            d["average_gpt_token"] = 0
+            d["max_gpt_token"] = 0
+        else:
+            d["average_gpt_token"] = float(np.mean(gpt_tokens))
+            d["max_gpt_token"] = float(np.max(gpt_tokens))
+
+    # save the unfiltered, token-annotated dataset to ShareGPT.json
+    with Path("ShareGPT.json").open("w", encoding="utf-8") as file:
+        json.dump(data, file, ensure_ascii=False, indent=2)
+    # extract and filter the human prompts, then overwrite ShareGPT.json
+    filtered_result = extract_and_save_with_filtering("ShareGPT.json")
+    with Path("ShareGPT.json").open("w", encoding="utf-8") as file:
+        json.dump(filtered_result, file, ensure_ascii=False, indent=2)
diff --git a/contrib/sharegpt_preprocess/requirements.txt b/contrib/sharegpt_preprocess/requirements.txt
new file mode 100644
index 00000000..67b677e9
--- /dev/null
+++ b/contrib/sharegpt_preprocess/requirements.txt
@@ -0,0 +1,6 @@
+tqdm==4.67.1
+pandas==2.3.1
+openai==1.99.9
+datasets==4.0.0
+transformers==4.55.4
+numpy