Merged
2,465 changes: 1,331 additions & 1,134 deletions pylock.toml

Large diffs are not rendered by default.

9 changes: 9 additions & 0 deletions pyproject.toml
@@ -13,6 +13,13 @@ include = ["*"]
[tool.pdm]
distribution = true

[[tool.pdm.source]]
name = "torch"
type = "find_links"
#url = "https://download.pytorch.org/whl/cpu/torch_stable.html"
Copilot AI commented on Oct 3, 2025:

Commented-out URL should be removed rather than left as a comment. If this is for documentation purposes, consider adding a proper comment explaining why this specific URL is used.

Suggested change:
-#url = "https://download.pytorch.org/whl/cpu/torch_stable.html"
+# Previous URL for torch wheels (replaced with the current one due to updated structure)

url = "https://download.pytorch.org/whl/cpu/torch/"
include_packages = ["torch"]


# ************************************************
# ********** Project Metadata **********
@@ -64,6 +71,8 @@ dependencies = [
"sanic",
"transformers",
"uvloop>=0.18",
"librosa>=0.11.0",
"torch>=2.8.0",
]

[project.optional-dependencies]
2 changes: 2 additions & 0 deletions src/guidellm/data/deserializers/__init__.py
@@ -25,6 +25,7 @@
SyntheticTextDatasetConfig,
SyntheticTextDatasetDeserializer,
SyntheticTextGenerator,
SyntheticTextPrefixBucketConfig,
)

__all__ = [
@@ -46,6 +47,7 @@
"SyntheticTextDatasetConfig",
"SyntheticTextDatasetDeserializer",
"SyntheticTextGenerator",
"SyntheticTextPrefixBucketConfig",
"TarFileDatasetDeserializer",
"TextFileDatasetDeserializer",
]
96 changes: 91 additions & 5 deletions src/guidellm/data/deserializers/synthetic.py
@@ -1,13 +1,15 @@
from __future__ import annotations

import math
from collections.abc import Iterator
from pathlib import Path
from typing import Any, Callable
from random import Random
from typing import Any, Callable, Self

import yaml
from datasets import Features, IterableDataset, Value
from faker import Faker
from pydantic import Field
from pydantic import ConfigDict, Field, model_validator
from transformers import PreTrainedTokenizerBase

from guidellm.data.deserializers.deserializer import (
@@ -21,10 +23,37 @@
"SyntheticTextDatasetConfig",
"SyntheticTextDatasetDeserializer",
"SyntheticTextGenerator",
"SyntheticTextPrefixBucketConfig",
]


class SyntheticTextPrefixBucketConfig(StandardBaseModel):
bucket_weight: int = Field(
description="Weight of this bucket in the overall distribution.",
gt=0,
default=100,
)
prefix_count: int = Field(
description="The number of unique prefixes to generate for this bucket.",
ge=1,
default=1,
)
prefix_tokens: int = Field(
description="The number of prefix tokens per-prompt for this bucket.",
ge=0,
default=0,
)


class SyntheticTextDatasetConfig(StandardBaseModel):
model_config = ConfigDict(
extra="allow",
)

prefix_buckets: list[SyntheticTextPrefixBucketConfig] | None = Field(
description="Buckets for the prefix tokens distribution.",
default=None,
)
prompt_tokens: int = Field(
description="The average number of text tokens generated for prompts.",
gt=0,
@@ -68,6 +97,26 @@ class SyntheticTextDatasetConfig(StandardBaseModel):
default="data:prideandprejudice.txt.gz",
)

@model_validator(mode="after")
def check_prefix_options(self) -> Self:
prefix_count = self.__pydantic_extra__.get("prefix_count", None) # type: ignore[attr-defined]
prefix_tokens = self.__pydantic_extra__.get("prefix_count", None) # type: ignore[attr-defined]
Copilot AI commented on Oct 3, 2025:

The second call to __pydantic_extra__.get() should retrieve 'prefix_tokens', not 'prefix_count'. This will always return None for prefix_tokens, breaking the backward compatibility feature.

Suggested change:
-        prefix_tokens = self.__pydantic_extra__.get("prefix_count", None)  # type: ignore[attr-defined]
+        prefix_tokens = self.__pydantic_extra__.get("prefix_tokens", None)  # type: ignore[attr-defined]

if prefix_count is not None or prefix_tokens is not None:
if self.prefix_buckets:
raise ValueError(
"prefix_buckets is mutually exclusive"
" with prefix_count and prefix_tokens"
)

self.prefix_buckets = [
SyntheticTextPrefixBucketConfig(
prefix_count=prefix_count or 1,
prefix_tokens=prefix_tokens or 0,
)
]

return self
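
The validator's backward-compatibility behavior, restated as a minimal standalone sketch (plain dicts stand in for the pydantic models, and the prefix_tokens lookup is corrected as the review comment above suggests; this is illustrative, not the actual guidellm code):

from __future__ import annotations

def fold_legacy_prefix_keys(
    extra: dict, prefix_buckets: list[dict] | None
) -> list[dict] | None:
    # Legacy flat keys arrive as pydantic "extra" fields (extra="allow").
    prefix_count = extra.get("prefix_count")
    prefix_tokens = extra.get("prefix_tokens")
    if prefix_count is None and prefix_tokens is None:
        return prefix_buckets
    if prefix_buckets:
        raise ValueError(
            "prefix_buckets is mutually exclusive with prefix_count and prefix_tokens"
        )
    # Fold the legacy keys into a single-bucket configuration.
    return [{"prefix_count": prefix_count or 1, "prefix_tokens": prefix_tokens or 0}]

print(fold_legacy_prefix_keys({"prefix_count": 4, "prefix_tokens": 32}, None))
# [{'prefix_count': 4, 'prefix_tokens': 32}]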


class SyntheticTextGenerator:
def __init__(
@@ -104,20 +153,27 @@ def __iter__(self) -> Iterator[dict[str, Any]]:
)
)

# Create a shared prefix if specified
rand = Random(self.random_seed + 3)
prefix_iter = self._create_prefix_iter(faker, rand)

while True:
prompt_tokens_count = next(prompt_tokens_sampler)
output_tokens_count = next(output_tokens_sampler)

yield {
"prefix": next(prefix_iter),
"prompt": self._create_prompt(
prompt_tokens_count, samples_generated, faker
prompt_tokens_count, faker, f"{samples_generated} "
),
"prompt_tokens_count": prompt_tokens_count,
"output_tokens_count": output_tokens_count,
}
samples_generated += 1

def _create_prompt(self, prompt_tokens_count: int, index: int, faker: Faker) -> str:
def _create_prompt(
self, prompt_tokens_count: int, faker: Faker, unique: str = ""
) -> str:
prompt_token_ids = []
avg_chars_per_token = 5
margin_of_safety = 1.5
@@ -128,13 +184,42 @@ def _create_prompt(self, prompt_tokens_count: int, index: int, faker: Faker) ->
num_chars = (
prompt_tokens_count * avg_chars_per_token * margin_of_safety * attempts
)
text = f"{index} " + faker.text(max_nb_chars=num_chars)
text = unique + faker.text(max_nb_chars=num_chars)
prompt_token_ids = self.processor.encode(text)

return self.processor.decode(
prompt_token_ids[:prompt_tokens_count], skip_special_tokens=True
)

def _create_prefix_iter(self, faker: Faker, rand: Random) -> Iterator[str]:
if not self.config.prefix_buckets:
while True:
yield ""

# Increase weights to ensure an integer number of samples per prefix
least_common_prefix_count = math.lcm(
*(bucket.prefix_count for bucket in self.config.prefix_buckets)
)
unnorm_weights = [
least_common_prefix_count * bucket.bucket_weight // bucket.prefix_count
Copilot AI commented on Oct 3, 2025:

This calculation could result in integer division by zero if bucket.prefix_count is 0, despite the field validation. Consider adding a runtime check or ensuring the validation prevents this case.

for bucket in self.config.prefix_buckets
]
# Use GCD to reduce the weights to smallest integer ratio
common_divisor = math.gcd(*unnorm_weights)

# Create prefix list maintaining the correct distribution
prefixes = []
for bucket, weight in zip(self.config.prefix_buckets, unnorm_weights):
bucket_prefixes = [
self._create_prompt(bucket.prefix_tokens, faker)
for _ in range(bucket.prefix_count)
]
sample_count = weight // common_divisor
prefixes.extend(bucket_prefixes * sample_count)

while True:
yield rand.choice(prefixes)


Copilot AI commented on lines +198 to 223, Oct 3, 2025:

The function returns early when prefix_buckets is None or empty, but the remaining code is not properly indented as an else block. This creates unreachable code and potential runtime errors.

Suggested change:
-        # Increase weights to ensure an integer number of samples per prefix
-        least_common_prefix_count = math.lcm(
-            *(bucket.prefix_count for bucket in self.config.prefix_buckets)
-        )
-        unnorm_weights = [
-            least_common_prefix_count * bucket.bucket_weight // bucket.prefix_count
-            for bucket in self.config.prefix_buckets
-        ]
-        # Use GCD to reduce the weights to smallest integer ratio
-        common_divisor = math.gcd(*unnorm_weights)
-        # Create prefix list maintaining the correct distribution
-        prefixes = []
-        for bucket, weight in zip(self.config.prefix_buckets, unnorm_weights):
-            bucket_prefixes = [
-                self._create_prompt(bucket.prefix_tokens, faker)
-                for _ in range(bucket.prefix_count)
-            ]
-            sample_count = weight // common_divisor
-            prefixes.extend(bucket_prefixes * sample_count)
-        while True:
-            yield rand.choice(prefixes)
+        else:
+            # Increase weights to ensure an integer number of samples per prefix
+            least_common_prefix_count = math.lcm(
+                *(bucket.prefix_count for bucket in self.config.prefix_buckets)
+            )
+            unnorm_weights = [
+                least_common_prefix_count * bucket.bucket_weight // bucket.prefix_count
+                for bucket in self.config.prefix_buckets
+            ]
+            # Use GCD to reduce the weights to smallest integer ratio
+            common_divisor = math.gcd(*unnorm_weights)
+            # Create prefix list maintaining the correct distribution
+            prefixes = []
+            for bucket, weight in zip(self.config.prefix_buckets, unnorm_weights):
+                bucket_prefixes = [
+                    self._create_prompt(bucket.prefix_tokens, faker)
+                    for _ in range(bucket.prefix_count)
+                ]
+                sample_count = weight // common_divisor
+                prefixes.extend(bucket_prefixes * sample_count)
+            while True:
+                yield rand.choice(prefixes)

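To see why the lcm/gcd steps in _create_prefix_iter yield an integer number of copies of each prefix, here is a worked example with illustrative numbers (not part of the PR):

import math

# Two buckets: weight 100 holding 2 unique prefixes, weight 50 holding 3.
buckets = [
    {"bucket_weight": 100, "prefix_count": 2},
    {"bucket_weight": 50, "prefix_count": 3},
]
lcm = math.lcm(*(b["prefix_count"] for b in buckets))  # 6
unnorm_weights = [
    lcm * b["bucket_weight"] // b["prefix_count"] for b in buckets
]  # [300, 100]
common_divisor = math.gcd(*unnorm_weights)  # 100
copies = [w // common_divisor for w in unnorm_weights]  # [3, 1]
# Bucket 0 contributes 2 prefixes x 3 copies = 6 pool entries; bucket 1
# contributes 3 x 1 = 3. Uniform choice over the 9-entry pool therefore hits
# bucket 0 twice as often as bucket 1, matching the 100:50 weights.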
@DatasetDeserializerFactory.register("synthetic_text")
class SyntheticTextDatasetDeserializer(DatasetDeserializer):
@@ -166,6 +251,7 @@ def __call__(
),
features=Features(
{
"prefix": Value("string"),
"prompt": Value("string"),
"prompt_tokens_count": Value("int32"),
"output_tokens_count": Value("int32"),
16 changes: 8 additions & 8 deletions src/guidellm/data/formatters/templates.py
@@ -22,11 +22,7 @@ class JinjaTemplatesRegistry(RegistryMixin[Union[Template, str]]):
textwrap.dedent("""
{% set obj = {
"json_body": {
"prompt": (
text_column[0]
if text_column and text_column|length == 1
else text_column
)
"prompt": prefix_column[0]|default("") + text_column[0]
Copilot AI commented on Oct 3, 2025:

Direct string concatenation without proper spacing could result in malformed prompts. Consider adding a space or newline separator between prefix and text content to ensure proper formatting.

Suggested change:
-            "prompt": prefix_column[0]|default("") + text_column[0]
+            "prompt": prefix_column[0]|default("") ~ " " ~ text_column[0]

}
} %}

@@ -52,6 +48,10 @@ class JinjaTemplatesRegistry(RegistryMixin[Union[Template, str]]):
{% set obj = {
"json_body": {
"messages": [
{
"role": "system",
"content": prefix_column[0]|default("")
},
{
"role": "user",
"content": []
@@ -61,11 +61,11 @@ class JinjaTemplatesRegistry(RegistryMixin[Union[Template, str]]):
} %}

{%- for item in text_column or [] %}
{% do obj["json_body"].messages[0].content.append({"type": "text", "text": item}) %}
{% do obj["json_body"].messages[1].content.append({"type": "text", "text": item}) %}
{%- endfor %}

{%- for item in image_column or [] %}
{% do obj["json_body"].messages[0].content.append({
{% do obj["json_body"].messages[1].content.append({
"type": "image_url",
"image_url": encode_image(
item,
@@ -78,7 +78,7 @@ class JinjaTemplatesRegistry(RegistryMixin[Union[Template, str]]):
{%- endfor %}

{%- for item in video_column or [] %}
{% do obj["json_body"].messages[0].content.append({
{% do obj["json_body"].messages[1].content.append({
"type": "video_url",
"video_url": encode_video(
item,
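A hand-written example of the chat_completions body the updated template is intended to produce (values are illustrative; the prefix becomes a leading system message, so user content now lands in messages[1]):

payload = {
    "json_body": {
        "messages": [
            # prefix_column[0] (defaulting to "") fills the system message
            {"role": "system", "content": "You are a terse assistant."},
            # text/image/video items are appended to the user message content
            {
                "role": "user",
                "content": [{"type": "text", "text": "Summarize the plot."}],
            },
        ]
    }
}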
2 changes: 2 additions & 0 deletions src/guidellm/data/objects.py
@@ -31,6 +31,7 @@
GenerativeDatasetColumnType = Literal[
"prompt_tokens_count_column",
"output_tokens_count_column",
"prefix_column",
"text_column",
"image_column",
"video_column",
@@ -195,6 +196,7 @@ class GenerativeDatasetArgs(StandardBaseDict):
split: str | None = None
prompt_tokens_count_column: str | None = None
output_tokens_count_column: str | None = None
prefix_column: str | None = None
text_column: str | list[str] | None = None
image_column: str | list[str] | None = None
video_column: str | list[str] | None = None
5 changes: 5 additions & 0 deletions src/guidellm/data/utils.py
@@ -80,6 +80,11 @@
DEFAULT_COLUMN_NAMES: dict[str, list[str]] = {
"prompt_tokens_count": ["prompt_tokens_count", "input_tokens_count"],
"output_tokens_count": ["output_tokens_count", "completion_tokens_count"],
"prefix_column": [
"system_prompt",
"system",
"prefix",
],
"text_column": [
"prompt",
"instruction",
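The diff does not show how this alias table is consumed; a hypothetical resolver sketch of the intended first-match lookup (resolve_column and its behavior are assumptions, not guidellm API):

from __future__ import annotations

DEFAULT_COLUMN_NAMES = {
    "prefix_column": ["system_prompt", "system", "prefix"],
    "text_column": ["prompt", "instruction"],
}

def resolve_column(row: dict, column_type: str) -> str | None:
    # Return the first configured alias present in the dataset row, if any.
    for alias in DEFAULT_COLUMN_NAMES.get(column_type, []):
        if alias in row:
            return alias
    return None

print(resolve_column({"system": "Be brief.", "prompt": "Hi"}, "prefix_column"))
# system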
File renamed without changes.
Empty file.