
Commit 9c5dfe7

committed
support ShareGPT dataset as data file
Signed-off-by: guangli.bao <[email protected]>
1 parent ad9513f commit 9c5dfe7

4 files changed: +161 −0 lines changed

docs/datasets.md

Lines changed: 24 additions & 0 deletions
@@ -220,3 +220,27 @@ benchmark_generative_text(data=data, ...)
- For lists of dictionaries, all items must have the same keys.
- For lists of items, all elements must be of the same type.
- A processor/tokenizer is only required if `GUIDELLM__PREFERRED_PROMPT_TOKENS_SOURCE="local"` or `GUIDELLM__PREFERRED_OUTPUT_TOKENS_SOURCE="local"` is set in the environment. In this case, the processor/tokenizer must be specified using the `--processor` argument. If not set, the processor/tokenizer will be set to the model passed in or retrieved from the server.
### ShareGPT Datasets

You can use `ShareGPT_V3_unfiltered_cleaned_split.json` as a benchmark dataset.

#### Example Commands

Download and prepare the ShareGPT dataset. You can specify the proportion of data to process by passing a number between 0 and 1 as an argument to the script.

```bash
cd src/guidellm/utils && pip install -r requirements.txt && bash prepare_sharegpt_data.sh 1
```

In this example, `1` means 100% of the dataset is processed; adjust this value as needed. A Conda environment is recommended for installing the dependencies.
```bash
guidellm benchmark \
  --target "http://localhost:8000" \
  --rate-type "throughput" \
  --data-args '{"prompt_column": "value", "split": "train"}' \
  --max-requests 10 \
  --data "/${local_path}/ShareGPT.json"
```
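
Based on the preprocessing script below, the prepared `ShareGPT.json` ends up as a flat list of filtered human prompts with `from`, `text`, `char_count`, and `word_count` fields. A minimal sketch for sanity-checking the file before benchmarking (assumes you run it from the same directory the preparation script wrote to):

```python
import json
from pathlib import Path

# Load the prepared dataset and print a quick summary before benchmarking.
with Path("ShareGPT.json").open("r", encoding="utf-8") as f:
    records = json.load(f)

print(f"{len(records)} filtered prompts")
# Each record carries "from", "text", "char_count", and "word_count" keys.
print(records[0])
```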
src/guidellm/utils/prepare_sharegpt_data.sh

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
#!/bin/bash

wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
python3 preprocessing_sharegpt_data.py --parse "$1"
src/guidellm/utils/preprocessing_sharegpt_data.py

Lines changed: 129 additions & 0 deletions
@@ -0,0 +1,129 @@
import argparse
import json
import os
import re
from pathlib import Path
from typing import Callable, Optional

import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer

MIN_CHAR = 10
MAX_CHAR = 1000


def create_token_estimator(
    model_name: str = "mistralai/Mistral-7B-Instruct-v0.2",
) -> Callable[[str], int]:
    _tokenizer: Optional[AutoTokenizer] = None

    def initialize() -> None:
        nonlocal _tokenizer
        if _tokenizer is None:
            os.environ["TOKENIZERS_PARALLELISM"] = "false"
            try:
                _tokenizer = AutoTokenizer.from_pretrained(model_name)
            except (OSError, ImportError, ValueError) as e:
                raise RuntimeError(f"Failed to initialize tokenizer: {e}") from e

    def estimate_num_tokens(text: str) -> int:
        initialize()

        if _tokenizer is None:
            return 0

        try:
            encoding = _tokenizer(text, return_tensors=None)
            return len(encoding["input_ids"])
        except (AttributeError, TypeError, RuntimeError) as e:
            raise ValueError(f"Error processing text: {e}") from e

    return estimate_num_tokens


def extract_and_save_with_filtering(file):
    """Extract human prompts and apply filtering conditions."""
    dataset = load_dataset("json", data_files=file, split="train")
    filtered_prompts = []

    for example in dataset:
        conversations = example.get("conversations", [])
        if isinstance(conversations, list):
            for turn in conversations:
                if turn.get("from") in ["human", "user"]:
                    prompt_text = turn["value"].strip()
                    # Apply filter conditions: between 10 and 1000 characters,
                    # not a URL, no special characters, and not a pure number.
                    if (
                        MIN_CHAR <= len(prompt_text) <= MAX_CHAR
                        and not prompt_text.startswith(("http://", "https://"))
                        and not re.search(r"[<>{}[\]\\]", prompt_text)
                        and not prompt_text.isdigit()
                    ):
                        filtered_prompts.append(
                            {
                                "from": turn.get("from"),
                                "text": prompt_text,
                                "char_count": len(prompt_text),
                                "word_count": len(prompt_text.split()),
                            }
                        )

    return filtered_prompts


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process data percentage.")
    parser.add_argument(
        "--parse",
        type=float,
        default=1,
        help="The fraction of data to process (0 to 1). Default is 1 (100%%).",
    )
    args = parser.parse_args()

    sharegpt_file = "ShareGPT_V3_unfiltered_cleaned_split.json"
    with Path(sharegpt_file).open("r", encoding="utf-8") as file:
        data = json.load(file)

    estimate_tokens = create_token_estimator()
    num_of_ids = len(data)
    data = data[: int(num_of_ids * args.parse)]
    for d in data:
        d["num_round"] = len(d["conversations"])
        human_tokens = []
        gpt_tokens = []
        for conv in d["conversations"]:
            if conv["from"] == "human":
                human_tokens.append(estimate_tokens(conv["value"]))
            if conv["from"] == "gpt":
                token_number = estimate_tokens(conv["value"])
                conv["num_tokens"] = token_number
                gpt_tokens.append(token_number)
        if len(human_tokens) == 0:
            d["average_human_token"] = 0
            d["max_human_token"] = 0
        else:
            d["average_human_token"] = float(np.mean(human_tokens))
            d["max_human_token"] = float(np.max(human_tokens))
        if len(gpt_tokens) == 0:
            d["average_gpt_token"] = 0
            d["max_gpt_token"] = 0
        else:
            d["average_gpt_token"] = float(np.mean(gpt_tokens))
            d["max_gpt_token"] = float(np.max(gpt_tokens))

    # Save the token-annotated (unfiltered) conversations to ShareGPT.json.
    with Path("ShareGPT.json").open("w", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False, indent=2)
    # Extract and filter the human prompts, then overwrite ShareGPT.json.
    filtered_result = extract_and_save_with_filtering("ShareGPT.json")
    with Path("ShareGPT.json").open("w", encoding="utf-8") as file:
        json.dump(filtered_result, file, ensure_ascii=False, indent=2)
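
For reference, a minimal usage sketch of the token estimator above, assuming `preprocessing_sharegpt_data.py` is importable from the current directory and the default `mistralai/Mistral-7B-Instruct-v0.2` tokenizer can be downloaded from the Hugging Face Hub:

```python
# Usage sketch: the closure defers tokenizer loading until the first call,
# so constructing the estimator itself is cheap.
from preprocessing_sharegpt_data import create_token_estimator

estimate_tokens = create_token_estimator()
print(estimate_tokens("How do I benchmark an OpenAI-compatible endpoint?"))
```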
src/guidellm/utils/requirements.txt

Lines changed: 4 additions & 0 deletions
@@ -0,0 +1,4 @@
tqdm
pandas
openai
pyyaml
