Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/datasets.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ guidellm benchmark \
- `output_tokens_stdev`: Standard deviation for output tokens. If not supplied and min/max are not specified, no deviation is applied. If not supplied and min/max are specified, a uniform distribution is used.
- `output_tokens_min`: Minimum number of tokens in outputs. If unset and `output_tokens_stdev` is set, the minimum is 1.
- `output_tokens_max`: Maximum number of tokens in outputs. If unset and `output_tokens_stdev` is set, the maximum is 5 times the standard deviation.
- `prefix_tokens`: Number of tokens to share as a prefix across all prompts. This value is added on top of the prompt tokens distribution, so each request contains `prefix_tokens + prompt_tokens_sample()` prompt tokens. If unset, defaults to 0.
- `samples`: Number of samples to generate (default: 1000). More samples will increase the time taken to generate the dataset before benchmarking, but will also decrease the likelihood of caching requests.
- `source`: Source text for generation (default: `data:prideandprejudice.txt.gz`). This can be any text file, URL containing a text file, or a compressed text file. The text is used to sample from at a word and punctuation granularity and then combined into a single string of the desired lengths.

Expand Down
2 changes: 2 additions & 0 deletions src/guidellm/dataset/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from .hf_datasets import HFDatasetsCreator
from .in_memory import InMemoryDatasetCreator
from .synthetic import (
PrefixBucketConfig,
SyntheticDatasetConfig,
SyntheticDatasetCreator,
SyntheticTextItemsGenerator,
Expand All @@ -15,6 +16,7 @@
"FileDatasetCreator",
"HFDatasetsCreator",
"InMemoryDatasetCreator",
"PrefixBucketConfig",
"SyntheticDatasetConfig",
"SyntheticDatasetCreator",
"SyntheticTextItemsGenerator",
Expand Down
122 changes: 110 additions & 12 deletions src/guidellm/dataset/synthetic.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import json
import math
import random
from collections.abc import Iterable, Iterator
from itertools import cycle
from pathlib import Path
from typing import Any, Literal, Optional, Union

Expand All @@ -11,20 +13,48 @@
IterableDataset,
IterableDatasetDict,
)
from pydantic import BaseModel, Field
from pydantic import BaseModel, ConfigDict, Field, model_validator
from transformers import PreTrainedTokenizerBase # type: ignore[import]
from typing_extensions import Self

from guidellm.dataset.creator import ColumnInputTypes, DatasetCreator
from guidellm.utils import EndlessTextCreator, IntegerRangeSampler, check_load_processor

__all__ = [
"PrefixBucketConfig",
"SyntheticDatasetConfig",
"SyntheticDatasetCreator",
"SyntheticTextItemsGenerator",
]


class PrefixBucketConfig(BaseModel):
    """Configuration for one bucket of shared prompt prefixes.

    Each bucket contributes ``prefix_count`` unique prefixes of
    ``prefix_tokens`` tokens each; ``bucket_weight`` sets the bucket's
    relative share of the overall prefix distribution.
    """

    bucket_weight: int = Field(
        description="Weight of this bucket in the overall distribution.",
        gt=0,
        default=100,
    )
    prefix_count: int = Field(
        # Fixed typo in user-visible description: "prefixs" -> "prefixes".
        description="The number of unique prefixes to generate for this bucket.",
        ge=1,
        default=1,
    )
    prefix_tokens: int = Field(
        description="The number of prefix tokens per-prompt for this bucket.",
        ge=0,
        default=0,
    )


class SyntheticDatasetConfig(BaseModel):
model_config = ConfigDict(
extra="allow",
)

prefix_buckets: Optional[list[PrefixBucketConfig]] = Field(
description="Buckets for the prefix tokens distribution.",
default=None,
)
prompt_tokens: int = Field(
description="The average number of text tokens generated for prompts.",
gt=0,
Expand Down Expand Up @@ -73,6 +103,26 @@ class SyntheticDatasetConfig(BaseModel):
default="data:prideandprejudice.txt.gz",
)

@model_validator(mode="after")
def check_prefix_options(self) -> Self:
    """Map legacy top-level ``prefix_count``/``prefix_tokens`` extra fields
    onto a single-bucket ``prefix_buckets`` configuration.

    Returns:
        The validated config, with legacy options folded into prefix_buckets.

    Raises:
        ValueError: If the legacy options are combined with prefix_buckets.
    """
    prefix_count = self.__pydantic_extra__.get("prefix_count", None)  # type: ignore[attr-defined]
    # BUG FIX: this previously read "prefix_count" again, silently ignoring
    # a user-supplied top-level prefix_tokens value.
    prefix_tokens = self.__pydantic_extra__.get("prefix_tokens", None)  # type: ignore[attr-defined]
    if prefix_count is not None or prefix_tokens is not None:
        if self.prefix_buckets:
            raise ValueError(
                "prefix_buckets is mutually exclusive"
                " with prefix_count and prefix_tokens"
            )

        self.prefix_buckets = [
            PrefixBucketConfig(
                prefix_count=prefix_count or 1,
                prefix_tokens=prefix_tokens or 0,
            )
        ]

    return self

@staticmethod
def parse_str(data: Union[str, Path]) -> "SyntheticDatasetConfig":
if (
Expand Down Expand Up @@ -162,40 +212,88 @@ def __iter__(
random_seed=self.random_seed + 1, # ensure diff dist from prompts
)
# ensure diff distribution from output tokens
rand = random.Random(self.random_seed + 2) # noqa: S311
rand = random.Random(self.random_seed + 3) # noqa: S311
shared_prefix_iter = self._create_prefix_iter(rand)
unique_prefix_iter = cycle(self.processor.get_vocab().values())

for _, prompt_tokens, output_tokens in zip(
range(self.config.samples),
prompt_tokens_sampler,
output_tokens_sampler,
):
start_index = rand.randint(0, len(self.text_creator.words))
start_index = self._rand_start_index(rand)
prefix_tokens = next(shared_prefix_iter, [])
prompt_text = self.processor.decode(
prefix_tokens
+ self._create_prompt(
prompt_tokens, start_index, next(unique_prefix_iter)
),
skip_special_tokens=True,
)
yield {
"prompt": self._create_prompt(prompt_tokens, start_index),
"prompt_tokens_count": prompt_tokens,
"prompt": prompt_text,
"prompt_tokens_count": len(prefix_tokens) + prompt_tokens,
"output_tokens_count": output_tokens,
}

def _create_prompt(self, prompt_tokens: int, start_index: int) -> str:
def _rand_start_index(self, rand: random.Random) -> int:
"""Generate a random start index for text generation."""
return rand.randint(0, len(self.text_creator.words) - 1)

def _create_prefix_iter(self, rand: random.Random) -> Iterator[list[int]]:
if not self.config.prefix_buckets:
while True:
yield []

# Increase weights to ensure an integer number of samples per per-prefix
least_common_prefix_count = math.lcm(
*(bucket.prefix_count for bucket in self.config.prefix_buckets)
)
unnorm_weights = [
least_common_prefix_count * bucket.bucket_weight // bucket.prefix_count
for bucket in self.config.prefix_buckets
]
# Use GCD to reduce the weights to smallest integer ratio
common_divisor = math.gcd(*unnorm_weights)

# Create prefix list maintaining the correct distribution
prefixes = []
for bucket, weight in zip(self.config.prefix_buckets, unnorm_weights):
bucket_prefixes = []
for _ in range(bucket.prefix_count):
start_index = self._rand_start_index(rand)
prompt_tokens = self._create_prompt(bucket.prefix_tokens, start_index)
bucket_prefixes.append(prompt_tokens)
sample_count = weight // common_divisor
prefixes.extend(bucket_prefixes * sample_count)

while True:
yield rand.choice(prefixes)

def _create_prompt(
self, prompt_tokens: int, start_index: int, unique_prefix: Optional[int] = None
) -> list[int]:
if prompt_tokens <= 0:
return ""
return []

left = start_index
right = start_index + 4 * prompt_tokens
start_tokens = [unique_prefix] if unique_prefix else []

while left < right:
mid = (left + right) // 2
test_prompt = self.text_creator.create_text(start_index, mid - start_index)
test_tokens = len(self.processor.tokenize(test_prompt))
test_tokens = start_tokens + self.processor.encode(test_prompt)

if test_tokens == prompt_tokens:
return test_prompt
elif test_tokens < prompt_tokens:
if len(test_tokens) == prompt_tokens:
return test_tokens
elif len(test_tokens) < prompt_tokens:
left = mid + 1
else:
right = mid

return self.text_creator.create_text(start_index, left - start_index)
final_text = self.text_creator.create_text(start_index, left - start_index)
return start_tokens + self.processor.encode(final_text)


class SyntheticDatasetCreator(DatasetCreator):
Expand Down
Empty file added tests/unit/dataset/__init__.py
Empty file.
Loading