Merged
2,465 changes: 1,331 additions & 1,134 deletions pylock.toml

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions pyproject.toml
@@ -13,6 +13,13 @@ include = ["*"]
[tool.pdm]
distribution = true

[[tool.pdm.source]]
name = "torch"
type = "find_links"
#url = "https://download.pytorch.org/whl/cpu/torch_stable.html"
Copilot AI commented on Oct 3, 2025:

Commented-out URL should be removed rather than left as a comment. If this is for documentation purposes, consider adding a proper comment explaining why this specific URL is used.

Suggested change:
-#url = "https://download.pytorch.org/whl/cpu/torch_stable.html"
+# Previous URL for torch wheels (replaced with the current one due to updated structure)

url = "https://download.pytorch.org/whl/cpu/torch/"
include_packages = ["torch"]


# ************************************************
# ********** Project Metadata **********
@@ -64,6 +71,8 @@ dependencies = [
"sanic",
"transformers",
"uvloop>=0.18",
"librosa>=0.11.0",
"torch>=2.8.0",
]

[project.optional-dependencies]
2 changes: 2 additions & 0 deletions src/guidellm/data/deserializers/__init__.py
@@ -25,6 +25,7 @@
SyntheticTextDatasetConfig,
SyntheticTextDatasetDeserializer,
SyntheticTextGenerator,
SyntheticTextPrefixBucketConfig,
)

__all__ = [
@@ -46,6 +47,7 @@
"SyntheticTextDatasetConfig",
"SyntheticTextDatasetDeserializer",
"SyntheticTextGenerator",
"SyntheticTextPrefixBucketConfig",
"TarFileDatasetDeserializer",
"TextFileDatasetDeserializer",
]
96 changes: 91 additions & 5 deletions src/guidellm/data/deserializers/synthetic.py
@@ -1,13 +1,15 @@
from __future__ import annotations

import math
from collections.abc import Iterator
from pathlib import Path
from typing import Any, Callable
from random import Random
from typing import Any, Callable, Self

import yaml
from datasets import Features, IterableDataset, Value
from faker import Faker
from pydantic import Field
from pydantic import ConfigDict, Field, model_validator
from transformers import PreTrainedTokenizerBase

from guidellm.data.deserializers.deserializer import (
@@ -21,10 +23,37 @@
"SyntheticTextDatasetConfig",
"SyntheticTextDatasetDeserializer",
"SyntheticTextGenerator",
"SyntheticTextPrefixBucketConfig",
]


class SyntheticTextPrefixBucketConfig(StandardBaseModel):
bucket_weight: int = Field(
description="Weight of this bucket in the overall distribution.",
gt=0,
default=100,
)
prefix_count: int = Field(
description="The number of unique prefixes to generate for this bucket.",
ge=1,
default=1,
)
prefix_tokens: int = Field(
description="The number of prefix tokens per-prompt for this bucket.",
ge=0,
default=0,
)


class SyntheticTextDatasetConfig(StandardBaseModel):
model_config = ConfigDict(
extra="allow",
)

prefix_buckets: list[SyntheticTextPrefixBucketConfig] | None = Field(
description="Buckets for the prefix tokens distribution.",
default=None,
)
prompt_tokens: int = Field(
description="The average number of text tokens generated for prompts.",
gt=0,
@@ -68,6 +97,26 @@ class SyntheticTextDatasetConfig(StandardBaseModel):
default="data:prideandprejudice.txt.gz",
)

@model_validator(mode="after")
def check_prefix_options(self) -> Self:
prefix_count = self.__pydantic_extra__.get("prefix_count", None) # type: ignore[attr-defined]
prefix_tokens = self.__pydantic_extra__.get("prefix_count", None) # type: ignore[attr-defined]
Copilot AI commented on Oct 3, 2025:

The second call to __pydantic_extra__.get() should retrieve 'prefix_tokens', not 'prefix_count'. This will always return None for prefix_tokens, breaking the backward compatibility feature.

Suggested change:
-        prefix_tokens = self.__pydantic_extra__.get("prefix_count", None)  # type: ignore[attr-defined]
+        prefix_tokens = self.__pydantic_extra__.get("prefix_tokens", None)  # type: ignore[attr-defined]

if prefix_count is not None or prefix_tokens is not None:
if self.prefix_buckets:
raise ValueError(
"prefix_buckets is mutually exclusive"
" with prefix_count and prefix_tokens"
)

self.prefix_buckets = [
SyntheticTextPrefixBucketConfig(
prefix_count=prefix_count or 1,
prefix_tokens=prefix_tokens or 0,
)
]

return self
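
The validator's backward-compatibility behavior, restated as a minimal standalone sketch (plain dicts stand in for the pydantic models, and the prefix_tokens lookup is corrected as the review comment above suggests; this is illustrative, not the actual guidellm code):

from __future__ import annotations

def fold_legacy_prefix_keys(
    extra: dict, prefix_buckets: list[dict] | None
) -> list[dict] | None:
    # Legacy flat keys arrive as pydantic "extra" fields (extra="allow").
    prefix_count = extra.get("prefix_count")
    prefix_tokens = extra.get("prefix_tokens")
    if prefix_count is None and prefix_tokens is None:
        return prefix_buckets
    if prefix_buckets:
        raise ValueError(
            "prefix_buckets is mutually exclusive with prefix_count and prefix_tokens"
        )
    # Fold the legacy keys into a single-bucket configuration.
    return [{"prefix_count": prefix_count or 1, "prefix_tokens": prefix_tokens or 0}]

print(fold_legacy_prefix_keys({"prefix_count": 4, "prefix_tokens": 32}, None))
# [{'prefix_count': 4, 'prefix_tokens': 32}]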


class SyntheticTextGenerator:
def __init__(
@@ -104,20 +153,27 @@ def __iter__(self) -> Iterator[dict[str, Any]]:
)
)

# Create a shared prefix if specified
rand = Random(self.random_seed + 3)
prefix_iter = self._create_prefix_iter(faker, rand)

while True:
prompt_tokens_count = next(prompt_tokens_sampler)
output_tokens_count = next(output_tokens_sampler)

yield {
"prefix": next(prefix_iter),
"prompt": self._create_prompt(
prompt_tokens_count, samples_generated, faker
prompt_tokens_count, faker, f"{samples_generated} "
),
"prompt_tokens_count": prompt_tokens_count,
"output_tokens_count": output_tokens_count,
}
samples_generated += 1

def _create_prompt(self, prompt_tokens_count: int, index: int, faker: Faker) -> str:
def _create_prompt(
self, prompt_tokens_count: int, faker: Faker, unique: str = ""
) -> str:
prompt_token_ids = []
avg_chars_per_token = 5
margin_of_safety = 1.5
@@ -128,13 +184,42 @@ def _create_prompt(self, prompt_tokens_count: int, index: int, faker: Faker) ->
num_chars = (
prompt_tokens_count * avg_chars_per_token * margin_of_safety * attempts
)
text = f"{index} " + faker.text(max_nb_chars=num_chars)
text = unique + faker.text(max_nb_chars=num_chars)
prompt_token_ids = self.processor.encode(text)

return self.processor.decode(
prompt_token_ids[:prompt_tokens_count], skip_special_tokens=True
)

def _create_prefix_iter(self, faker: Faker, rand: Random) -> Iterator[str]:
if not self.config.prefix_buckets:
while True:
yield ""

# Increase weights to ensure an integer number of samples per prefix
least_common_prefix_count = math.lcm(
*(bucket.prefix_count for bucket in self.config.prefix_buckets)
)
unnorm_weights = [
least_common_prefix_count * bucket.bucket_weight // bucket.prefix_count
Copilot AI commented on Oct 3, 2025:

This calculation could result in integer division by zero if bucket.prefix_count is 0, despite the field validation. Consider adding a runtime check or ensuring the validation prevents this case.

for bucket in self.config.prefix_buckets
]
# Use GCD to reduce the weights to smallest integer ratio
common_divisor = math.gcd(*unnorm_weights)

# Create prefix list maintaining the correct distribution
prefixes = []
for bucket, weight in zip(self.config.prefix_buckets, unnorm_weights):
bucket_prefixes = [
self._create_prompt(bucket.prefix_tokens, faker)
for _ in range(bucket.prefix_count)
]
sample_count = weight // common_divisor
prefixes.extend(bucket_prefixes * sample_count)

while True:
yield rand.choice(prefixes)


Copilot AI commented on lines +198 to 223, Oct 3, 2025:

The function returns early when prefix_buckets is None or empty, but the remaining code is not properly indented as an else block. This creates unreachable code and potential runtime errors.

Suggested change:
-        # Increase weights to ensure an integer number of samples per prefix
-        least_common_prefix_count = math.lcm(
-            *(bucket.prefix_count for bucket in self.config.prefix_buckets)
-        )
-        unnorm_weights = [
-            least_common_prefix_count * bucket.bucket_weight // bucket.prefix_count
-            for bucket in self.config.prefix_buckets
-        ]
-        # Use GCD to reduce the weights to smallest integer ratio
-        common_divisor = math.gcd(*unnorm_weights)
-        # Create prefix list maintaining the correct distribution
-        prefixes = []
-        for bucket, weight in zip(self.config.prefix_buckets, unnorm_weights):
-            bucket_prefixes = [
-                self._create_prompt(bucket.prefix_tokens, faker)
-                for _ in range(bucket.prefix_count)
-            ]
-            sample_count = weight // common_divisor
-            prefixes.extend(bucket_prefixes * sample_count)
-        while True:
-            yield rand.choice(prefixes)
+        else:
+            # Increase weights to ensure an integer number of samples per prefix
+            least_common_prefix_count = math.lcm(
+                *(bucket.prefix_count for bucket in self.config.prefix_buckets)
+            )
+            unnorm_weights = [
+                least_common_prefix_count * bucket.bucket_weight // bucket.prefix_count
+                for bucket in self.config.prefix_buckets
+            ]
+            # Use GCD to reduce the weights to smallest integer ratio
+            common_divisor = math.gcd(*unnorm_weights)
+            # Create prefix list maintaining the correct distribution
+            prefixes = []
+            for bucket, weight in zip(self.config.prefix_buckets, unnorm_weights):
+                bucket_prefixes = [
+                    self._create_prompt(bucket.prefix_tokens, faker)
+                    for _ in range(bucket.prefix_count)
+                ]
+                sample_count = weight // common_divisor
+                prefixes.extend(bucket_prefixes * sample_count)
+            while True:
+                yield rand.choice(prefixes)

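To see why the lcm/gcd steps in _create_prefix_iter yield an integer number of copies of each prefix, here is a worked example with illustrative numbers (not part of the PR):

import math

# Two buckets: weight 100 holding 2 unique prefixes, weight 50 holding 3.
buckets = [
    {"bucket_weight": 100, "prefix_count": 2},
    {"bucket_weight": 50, "prefix_count": 3},
]
lcm = math.lcm(*(b["prefix_count"] for b in buckets))  # 6
unnorm_weights = [
    lcm * b["bucket_weight"] // b["prefix_count"] for b in buckets
]  # [300, 100]
common_divisor = math.gcd(*unnorm_weights)  # 100
copies = [w // common_divisor for w in unnorm_weights]  # [3, 1]
# Bucket 0 contributes 2 prefixes x 3 copies = 6 pool entries; bucket 1
# contributes 3 x 1 = 3. Uniform choice over the 9-entry pool therefore hits
# bucket 0 twice as often as bucket 1, matching the 100:50 weights.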
@DatasetDeserializerFactory.register("synthetic_text")
class SyntheticTextDatasetDeserializer(DatasetDeserializer):
@@ -166,6 +251,7 @@ def __call__(
),
features=Features(
{
"prefix": Value("string"),
"prompt": Value("string"),
"prompt_tokens_count": Value("int32"),
"output_tokens_count": Value("int32"),
16 changes: 8 additions & 8 deletions src/guidellm/data/formatters/templates.py
@@ -22,11 +22,7 @@ class JinjaTemplatesRegistry(RegistryMixin[Union[Template, str]]):
textwrap.dedent("""
{% set obj = {
"json_body": {
"prompt": (
text_column[0]
if text_column and text_column|length == 1
else text_column
)
"prompt": prefix_column[0]|default("") + text_column[0]
Copilot AI commented on Oct 3, 2025:

Direct string concatenation without proper spacing could result in malformed prompts. Consider adding a space or newline separator between prefix and text content to ensure proper formatting.

Suggested change:
-            "prompt": prefix_column[0]|default("") + text_column[0]
+            "prompt": prefix_column[0]|default("") ~ " " ~ text_column[0]

}
} %}

@@ -52,6 +48,10 @@ class JinjaTemplatesRegistry(RegistryMixin[Union[Template, str]]):
{% set obj = {
"json_body": {
"messages": [
{
"role": "system",
"content": prefix_column[0]|default("")
},
{
"role": "user",
"content": []
@@ -61,11 +61,11 @@ class JinjaTemplatesRegistry(RegistryMixin[Union[Template, str]]):
} %}

{%- for item in text_column or [] %}
{% do obj["json_body"].messages[0].content.append({"type": "text", "text": item}) %}
{% do obj["json_body"].messages[1].content.append({"type": "text", "text": item}) %}
{%- endfor %}

{%- for item in image_column or [] %}
{% do obj["json_body"].messages[0].content.append({
{% do obj["json_body"].messages[1].content.append({
"type": "image_url",
"image_url": encode_image(
item,
@@ -78,7 +78,7 @@ class JinjaTemplatesRegistry(RegistryMixin[Union[Template, str]]):
{%- endfor %}

{%- for item in video_column or [] %}
{% do obj["json_body"].messages[0].content.append({
{% do obj["json_body"].messages[1].content.append({
"type": "video_url",
"video_url": encode_video(
item,
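A hand-written example of the chat_completions body the updated template is intended to produce (values are illustrative; the prefix becomes a leading system message, so user content now lands in messages[1]):

payload = {
    "json_body": {
        "messages": [
            # prefix_column[0] (defaulting to "") fills the system message
            {"role": "system", "content": "You are a terse assistant."},
            # text/image/video items are appended to the user message content
            {
                "role": "user",
                "content": [{"type": "text", "text": "Summarize the plot."}],
            },
        ]
    }
}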
2 changes: 2 additions & 0 deletions src/guidellm/data/objects.py
@@ -31,6 +31,7 @@
GenerativeDatasetColumnType = Literal[
"prompt_tokens_count_column",
"output_tokens_count_column",
"prefix_column",
"text_column",
"image_column",
"video_column",
@@ -195,6 +196,7 @@ class GenerativeDatasetArgs(StandardBaseDict):
split: str | None = None
prompt_tokens_count_column: str | None = None
output_tokens_count_column: str | None = None
prefix_column: str | None = None
text_column: str | list[str] | None = None
image_column: str | list[str] | None = None
video_column: str | list[str] | None = None
5 changes: 5 additions & 0 deletions src/guidellm/data/utils.py
@@ -80,6 +80,11 @@
DEFAULT_COLUMN_NAMES: dict[str, list[str]] = {
"prompt_tokens_count": ["prompt_tokens_count", "input_tokens_count"],
"output_tokens_count": ["output_tokens_count", "completion_tokens_count"],
"prefix_column": [
"system_prompt",
"system",
"prefix",
],
"text_column": [
"prompt",
"instruction",
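The diff does not show how this alias table is consumed; a hypothetical resolver sketch of the intended first-match lookup (resolve_column and its behavior are assumptions, not guidellm API):

from __future__ import annotations

DEFAULT_COLUMN_NAMES = {
    "prefix_column": ["system_prompt", "system", "prefix"],
    "text_column": ["prompt", "instruction"],
}

def resolve_column(row: dict, column_type: str) -> str | None:
    # Return the first configured alias present in the dataset row, if any.
    for alias in DEFAULT_COLUMN_NAMES.get(column_type, []):
        if alias in row:
            return alias
    return None

print(resolve_column({"system": "Be brief.", "prompt": "Hi"}, "prefix_column"))
# system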
File renamed without changes.
Empty file.