Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/datasets.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ guidellm benchmark \
- `output_tokens_stdev`: Standard deviation for output tokens. If not supplied and min/max are not specified, no deviation is applied. If not supplied and min/max are specified, a uniform distribution is used.
- `output_tokens_min`: Minimum number of tokens in outputs. If unset and `output_tokens_stdev` is set, the minimum is 1.
- `output_tokens_max`: Maximum number of tokens in outputs. If unset and `output_tokens_stdev` is set, the maximum is 5 times the standard deviation.
- `prefix_tokens`: Number of tokens to share as a prefix across all prompts. This value is added on top of the prompt tokens distribution, so each request contains `prefix_tokens + prompt_tokens_sample()` prompt tokens. If unset, defaults to 0.
- `samples`: Number of samples to generate (default: 1000). More samples will increase the time taken to generate the dataset before benchmarking, but will also decrease the likelihood of caching requests.
- `source`: Source text for generation (default: `data:prideandprejudice.txt.gz`). This can be any text file, URL containing a text file, or a compressed text file. The text is used to sample from at a word and punctuation granularity and then combined into a single string of the desired lengths.

Expand Down
2 changes: 2 additions & 0 deletions src/guidellm/dataset/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from .hf_datasets import HFDatasetsCreator
from .in_memory import InMemoryDatasetCreator
from .synthetic import (
PrefixBucketConfig,
SyntheticDatasetConfig,
SyntheticDatasetCreator,
SyntheticTextItemsGenerator,
Expand All @@ -15,6 +16,7 @@
"FileDatasetCreator",
"HFDatasetsCreator",
"InMemoryDatasetCreator",
"PrefixBucketConfig",
"SyntheticDatasetConfig",
"SyntheticDatasetCreator",
"SyntheticTextItemsGenerator",
Expand Down
122 changes: 110 additions & 12 deletions src/guidellm/dataset/synthetic.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import json
import math
import random
from collections.abc import Iterable, Iterator
from itertools import cycle
from pathlib import Path
from typing import Any, Literal, Optional, Union

Expand All @@ -11,20 +13,48 @@
IterableDataset,
IterableDatasetDict,
)
from pydantic import BaseModel, Field
from pydantic import BaseModel, ConfigDict, Field, model_validator
from transformers import PreTrainedTokenizerBase # type: ignore[import]
from typing_extensions import Self

from guidellm.dataset.creator import ColumnInputTypes, DatasetCreator
from guidellm.utils import EndlessTextCreator, IntegerRangeSampler, check_load_processor

__all__ = [
"PrefixBucketConfig",
"SyntheticDatasetConfig",
"SyntheticDatasetCreator",
"SyntheticTextItemsGenerator",
]


class PrefixBucketConfig(BaseModel):
    """Configuration for one bucket of shared prompt prefixes.

    Each bucket contributes ``prefix_count`` unique prefixes of
    ``prefix_tokens`` tokens each; ``bucket_weight`` sets the bucket's
    relative share of the overall prefix distribution.
    """

    bucket_weight: int = Field(
        description="Weight of this bucket in the overall distribution.",
        gt=0,
        default=100,
    )
    prefix_count: int = Field(
        # Fixed typo in user-visible description: "prefixs" -> "prefixes".
        description="The number of unique prefixes to generate for this bucket.",
        ge=1,
        default=1,
    )
    prefix_tokens: int = Field(
        description="The number of prefix tokens per-prompt for this bucket.",
        ge=0,
        default=0,
    )


class SyntheticDatasetConfig(BaseModel):
model_config = ConfigDict(
extra="allow",
)

prefix_buckets: Optional[list[PrefixBucketConfig]] = Field(
description="Buckets for the prefix tokens distribution.",
default=None,
)
prompt_tokens: int = Field(
description="The average number of text tokens generated for prompts.",
gt=0,
Expand Down Expand Up @@ -73,6 +103,26 @@ class SyntheticDatasetConfig(BaseModel):
default="data:prideandprejudice.txt.gz",
)

@model_validator(mode="after")
def check_prefix_options(self) -> Self:
    """Map legacy top-level ``prefix_count``/``prefix_tokens`` extra fields
    onto a single-bucket ``prefix_buckets`` configuration.

    Returns:
        The validated config, with legacy options folded into prefix_buckets.

    Raises:
        ValueError: If the legacy options are combined with prefix_buckets.
    """
    prefix_count = self.__pydantic_extra__.get("prefix_count", None)  # type: ignore[attr-defined]
    # BUG FIX: this previously read "prefix_count" again, silently ignoring
    # a user-supplied top-level prefix_tokens value.
    prefix_tokens = self.__pydantic_extra__.get("prefix_tokens", None)  # type: ignore[attr-defined]
    if prefix_count is not None or prefix_tokens is not None:
        if self.prefix_buckets:
            raise ValueError(
                "prefix_buckets is mutually exclusive"
                " with prefix_count and prefix_tokens"
            )

        self.prefix_buckets = [
            PrefixBucketConfig(
                prefix_count=prefix_count or 1,
                prefix_tokens=prefix_tokens or 0,
            )
        ]

    return self

@staticmethod
def parse_str(data: Union[str, Path]) -> "SyntheticDatasetConfig":
if (
Expand Down Expand Up @@ -162,40 +212,88 @@ def __iter__(
random_seed=self.random_seed + 1, # ensure diff dist from prompts
)
# ensure diff distribution from output tokens
rand = random.Random(self.random_seed + 2) # noqa: S311
rand = random.Random(self.random_seed + 3) # noqa: S311
shared_prefix_iter = self._create_prefix_iter(rand)
unique_prefix_iter = cycle(self.processor.get_vocab().values())

for _, prompt_tokens, output_tokens in zip(
range(self.config.samples),
prompt_tokens_sampler,
output_tokens_sampler,
):
start_index = rand.randint(0, len(self.text_creator.words))
start_index = self._rand_start_index(rand)
prefix_tokens = next(shared_prefix_iter, [])
prompt_text = self.processor.decode(
prefix_tokens
+ self._create_prompt(
prompt_tokens, start_index, next(unique_prefix_iter)
),
skip_special_tokens=True,
)
yield {
"prompt": self._create_prompt(prompt_tokens, start_index),
"prompt_tokens_count": prompt_tokens,
"prompt": prompt_text,
"prompt_tokens_count": len(prefix_tokens) + prompt_tokens,
"output_tokens_count": output_tokens,
}

def _create_prompt(self, prompt_tokens: int, start_index: int) -> str:
def _rand_start_index(self, rand: random.Random) -> int:
"""Generate a random start index for text generation."""
return rand.randint(0, len(self.text_creator.words) - 1)

def _create_prefix_iter(self, rand: random.Random) -> Iterator[list[int]]:
if not self.config.prefix_buckets:
while True:
yield []

# Increase weights to ensure an integer number of samples per per-prefix
least_common_prefix_count = math.lcm(
*(bucket.prefix_count for bucket in self.config.prefix_buckets)
)
unnorm_weights = [
least_common_prefix_count * bucket.bucket_weight // bucket.prefix_count
for bucket in self.config.prefix_buckets
]
# Use GCD to reduce the weights to smallest integer ratio
common_divisor = math.gcd(*unnorm_weights)

# Create prefix list maintaining the correct distribution
prefixes = []
for bucket, weight in zip(self.config.prefix_buckets, unnorm_weights):
bucket_prefixes = []
for _ in range(bucket.prefix_count):
start_index = self._rand_start_index(rand)
prompt_tokens = self._create_prompt(bucket.prefix_tokens, start_index)
bucket_prefixes.append(prompt_tokens)
sample_count = weight // common_divisor
prefixes.extend(bucket_prefixes * sample_count)

while True:
yield rand.choice(prefixes)

def _create_prompt(
self, prompt_tokens: int, start_index: int, unique_prefix: Optional[int] = None
) -> list[int]:
if prompt_tokens <= 0:
return ""
return []

left = start_index
right = start_index + 4 * prompt_tokens
start_tokens = [unique_prefix] if unique_prefix else []

while left < right:
mid = (left + right) // 2
test_prompt = self.text_creator.create_text(start_index, mid - start_index)
test_tokens = len(self.processor.tokenize(test_prompt))
test_tokens = start_tokens + self.processor.encode(test_prompt)

if test_tokens == prompt_tokens:
return test_prompt
elif test_tokens < prompt_tokens:
if len(test_tokens) == prompt_tokens:
return test_tokens
elif len(test_tokens) < prompt_tokens:
left = mid + 1
else:
right = mid

return self.text_creator.create_text(start_index, left - start_index)
final_text = self.text_creator.create_text(start_index, left - start_index)
return start_tokens + self.processor.encode(final_text)


class SyntheticDatasetCreator(DatasetCreator):
Expand Down
Empty file added tests/unit/dataset/__init__.py
Empty file.
Loading