support ShareGPT dataset as data file

tukwila · tukwila · commit 5d6844931a8d · 2025-09-08T12:36:24.000+08:00
Signed-off-by: guangli.bao <guangli.bao@daocloud.io>
diff --git a/docs/datasets.md b/docs/datasets.md
@@ -220,3 +220,33 @@ benchmark_generative_text(data=data, ...)
 - For lists of dictionaries, all items must have the same keys.
 - For lists of items, all elements must be of the same type.
 - A processor/tokenizer is only required if `GUIDELLM__PREFERRED_PROMPT_TOKENS_SOURCE="local"` or `GUIDELLM__PREFERRED_OUTPUT_TOKENS_SOURCE="local"` is set in the environment. In this case, the processor/tokenizer must be specified using the `--processor` argument. If not set, the processor/tokenizer will be set to the model passed in or retrieved from the server.
+
+
+### ShareGPT Datasets
+
+You can use the `ShareGPT_V3_unfiltered_cleaned_split.json` file as a benchmark dataset.
+
+1. Download and prepare the ShareGPT dataset.
+    You can choose the fraction of the dataset to process by passing a number between 0 and 1 as the argument to the script.
+
+    ```bash
+    cd src/guidellm/utils
+    pip install -r requirements.txt
+    bash prepare_sharegpt_data.sh 1
+    ```
+
+    In this example, 1 indicates processing 100% of the dataset. You can adjust this value as needed.
+
+    Using a Conda environment is recommended for installing the required libraries.
+
+2. Run the benchmark
+    Example:
+
+    ```bash
+    guidellm benchmark \
+      --target "http://localhost:8000" \
+      --rate-type "throughput" \
+      --data-args '{"prompt_column": "value", "split": "train"}' \
+      --max-requests 10 \
+      --data "/${local_path}/ShareGPT.json"
+    ```
diff --git a/src/guidellm/utils/prepare_sharegpt_data.sh b/src/guidellm/utils/prepare_sharegpt_data.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+#
+# Download the raw ShareGPT dump and run the preprocessing script on it.
+#
+# Usage: bash prepare_sharegpt_data.sh [FRACTION]
+#   FRACTION  portion of the dataset to process, between 0 and 1 (default: 1)
+
+# Stop at the first failure so the preprocessing step never runs after a
+# failed download.
+set -euo pipefail
+
+wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+# Quote the argument and fall back to 1 so a missing argument does not leave
+# "--parse" without a value.
+python3 shareGPT_data_preprocessing.py --parse "${1:-1}"
diff --git a/src/guidellm/utils/requirements.txt b/src/guidellm/utils/requirements.txt
@@ -0,0 +1,4 @@
+tqdm
+pandas
+openai
+pyyaml
diff --git a/src/guidellm/utils/shareGPT_data_preprocessing.py b/src/guidellm/utils/shareGPT_data_preprocessing.py
@@ -0,0 +1,110 @@
+# SPDX-License-Identifier: Apache-2.0
+# Standard
+import argparse
+import json
+import os
+
+# Third Party
+from transformers import AutoTokenizer
+import numpy as np
+
+from datasets import load_dataset
+import re
+
+def extract_and_save_with_filtering(data_file='./ShareGPT.json'):
+    """Extract human-authored prompts from a ShareGPT-style JSON file and filter them.
+
+    Despite the name, this function does not write anything to disk; it only
+    returns the filtered prompts and leaves saving to the caller.
+
+    A prompt is kept only when, after stripping surrounding whitespace, it
+
+    - is between 10 and 1000 characters long (inclusive),
+    - does not start with ``http://`` or ``https://``,
+    - contains no angle bracket, curly brace, square bracket or backslash, and
+    - does not consist solely of digits.
+
+    Args:
+        data_file: Path of the JSON file to load. Defaults to
+            ``./ShareGPT.json``.
+
+    Returns:
+        A list of dicts with the keys ``from``, ``text``, ``char_count`` and
+        ``word_count``, one per accepted prompt, in dataset order.
+    """
+    # Compiled once up front instead of being looked up for every turn.
+    special_chars = re.compile(r'[<>{}[\]\\]')
+
+    dataset = load_dataset('json', data_files=data_file, split='train')
+
+    filtered_prompts = []
+
+    for example in dataset:
+        conversations = example.get('conversations', [])
+        if not isinstance(conversations, list):
+            continue
+
+        for turn in conversations:
+            if turn.get('from') not in ('human', 'user'):
+                continue
+
+            value = turn.get('value')
+            if not isinstance(value, str):
+                # A turn with a missing or null value cannot serve as a prompt.
+                continue
+
+            prompt_text = value.strip()
+
+            # Filtering conditions.
+            if not 10 <= len(prompt_text) <= 1000:  # length limits
+                continue
+            if prompt_text.startswith(('http://', 'https://')):  # bare URLs
+                continue
+            if special_chars.search(prompt_text):  # special characters
+                continue
+            if prompt_text.isdigit():  # digits only
+                continue
+
+            filtered_prompts.append({
+                'from': turn.get('from'),
+                'text': prompt_text,
+                'char_count': len(prompt_text),
+                'word_count': len(prompt_text.split())
+            })
+
+    return filtered_prompts
+    
+if __name__ == "__main__":
+    # Script entry point: load the raw ShareGPT dump from the current
+    # directory, annotate a leading fraction of it with token statistics,
+    # write it to ShareGPT.json, then replace that file with the filtered
+    # human prompts.
+    parser = argparse.ArgumentParser(description="Process data percentage.")
+    parser.add_argument(
+        "--parse",
+        type=float,
+        default=1,
+        # argparse %-formats help strings, so a literal percent sign has to
+        # be written as "%%"; a bare "%" makes --help raise ValueError.
+        help="The percentage of data to process (0 to 1). Default is 1 (100%%).",
+    )
+
+    args = parser.parse_args()
+    # Reject values outside [0, 1]; a negative value would otherwise silently
+    # slice entries off the end of the dataset.
+    if not 0 <= args.parse <= 1:
+        parser.error("--parse must be between 0 and 1")
+
+    with open("ShareGPT_V3_unfiltered_cleaned_split.json", "r", encoding="utf-8") as file:
+        data = json.load(file)
+
+
+    def estimate_num_tokens(text: str) -> int:
+        """Return the number of tokens the tokenizer produces for ``text``."""
+        # The tokenizer is created on the first call and cached as an
+        # attribute of this function so later calls reuse it.
+        if not hasattr(estimate_num_tokens, "tokenizer"):
+            os.environ["TOKENIZERS_PARALLELISM"] = "false"
+            estimate_num_tokens.tokenizer = AutoTokenizer.from_pretrained(
+                "mistralai/Mistral-7B-Instruct-v0.2"
+            )
+        return len(estimate_num_tokens.tokenizer.tokenize(text))
+
+
+    num_of_ids = len(data)
+    print(f"Number of IDs: {num_of_ids}")
+    # Keep only the requested leading fraction of the dataset.
+    data = data[: int(num_of_ids * args.parse)]
+
+    for count, d in enumerate(data, start=1):
+        conversations = d["conversations"]
+        d["num_round"] = len(conversations)  # human is one round, gpt is another round
+        human_tokens = []
+        gpt_tokens = []
+        for conv in conversations:
+            if conv["from"] == "human":
+                human_tokens.append(estimate_num_tokens(conv["value"]))
+            elif conv["from"] == "gpt":
+                token_number = estimate_num_tokens(conv["value"])
+                conv["num_tokens"] = token_number
+                gpt_tokens.append(token_number)
+
+        # Conversations without any turn from a given speaker get zeros.
+        d["average_human_token"] = float(np.mean(human_tokens)) if human_tokens else 0
+        d["max_human_token"] = float(np.max(human_tokens)) if human_tokens else 0
+        d["average_gpt_token"] = float(np.mean(gpt_tokens)) if gpt_tokens else 0
+        d["max_gpt_token"] = float(np.max(gpt_tokens)) if gpt_tokens else 0
+
+        print(f"Finished {count}")
+
+    # Save the unfiltered, annotated dataset to ShareGPT.json; the filtering
+    # step below loads its input from that file.
+    with open("ShareGPT.json", "w", encoding="utf-8") as file:
+        json.dump(data, file, ensure_ascii=False, indent=2)
+    # Extract and filter the human prompts, then overwrite ShareGPT.json with
+    # the filtered result.
+    filtered_result = extract_and_save_with_filtering()
+    with open("ShareGPT.json", "w", encoding="utf-8") as file:
+        json.dump(filtered_result, file, ensure_ascii=False, indent=2)
+
+

-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +tqdm
 +pandas
 +openai
 +pyyaml