Preprocessing now uses its own config class, which inherits several fields from a new base class shared with the synthetic data config. I did this so that the relevant fields stay shared between the two, lowering complexity.
Signed-off-by: Jared O'Connell <[email protected]>
The `--config` parameter accepts a `PreprocessDatasetConfig` as a JSON string, key=value pairs, or a configuration file path (.json, .yaml, .yml, .config). This configuration is similar to the synthetic data configuration but includes additional fields specific to preprocessing.
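The three accepted input forms can be sketched as follows. This is a hypothetical helper illustrating the behavior described above, not GuideLLM's actual parser (YAML handling is omitted for brevity):

```python
import json
from pathlib import Path

def parse_config(value: str) -> dict:
    """Parse a --config value given as JSON, key=value pairs, or a file path."""
    value = value.strip()
    if value.startswith("{"):
        # JSON string form, e.g. '{"prompt_tokens": 256}'
        return json.loads(value)
    if value.endswith((".json", ".yaml", ".yml", ".config")):
        # Configuration file path form (only JSON shown in this sketch)
        return json.loads(Path(value).read_text())
    # key=value pairs form, e.g. "prompt_tokens=256,output_tokens=128"
    config = {}
    for pair in value.split(","):
        key, _, raw = pair.partition("=")
        raw = raw.strip()
        config[key.strip()] = int(raw) if raw.isdigit() else raw
    return config
```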
**PreprocessDatasetConfig Options:**
- `prompt_tokens`: Average number of tokens in prompts. If nothing else is specified, all prompts will be resized to this number of tokens.
- `prompt_tokens_stdev`: Standard deviation for prompt tokens. If not supplied and min/max are not specified, no deviation is applied. If not supplied and min/max are specified, a uniform distribution is used.
- `prompt_tokens_min`: Minimum number of tokens in prompts. If unset and `prompt_tokens_stdev` is set, the minimum is 1.
- `prompt_tokens_max`: Maximum number of tokens in prompts. If unset and `prompt_tokens_stdev` is set, the maximum is 5 times the standard deviation.
- `output_tokens`: Average number of tokens in outputs. If nothing else is specified, all outputs will have this number of tokens.
- `output_tokens_stdev`: Standard deviation for output tokens. If not supplied and min/max are not specified, no deviation is applied. If not supplied and min/max are specified, a uniform distribution is used.
- `output_tokens_min`: Minimum number of tokens in outputs. If unset and `output_tokens_stdev` is set, the minimum is 1.
- `output_tokens_max`: Maximum number of tokens in outputs. If unset and `output_tokens_stdev` is set, the maximum is 5 times the standard deviation.
- `prefix_tokens_max`: Maximum number of prefix tokens to keep. If set, prefixes will be trimmed to this maximum length. If not set, prefixes are kept as-is (unless `--include-prefix-in-token-count` is used, which disables prefix trimming).
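The stdev/min/max rules above can be sketched as follows. This is an illustrative helper only; the actual GuideLLM sampling code may differ in details such as rounding and RNG handling:

```python
import random

def sample_token_count(avg, stdev=None, min_=None, max_=None, rng=random):
    """Sample a target token count per the documented distribution rules."""
    if stdev is None:
        if min_ is None and max_ is None:
            return avg                          # no deviation applied
        lo = avg if min_ is None else min_
        hi = avg if max_ is None else max_
        return rng.randint(lo, hi)              # uniform over [min, max]
    lo = 1 if min_ is None else min_            # default minimum is 1
    hi = 5 * stdev if max_ is None else max_    # default maximum is 5 * stdev
    value = round(rng.gauss(avg, stdev))        # normal around the average
    return max(lo, min(hi, value))              # clamp into [min, max]
```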
The `--processor` argument specifies the tokenizer to use for calculating token counts. This is required because the preprocessing command needs to tokenize prompts to ensure they match the target token sizes. For information about using processors, including Hugging Face model IDs, local paths, and processor arguments, see the [Data Arguments Overview](../datasets.md#data-arguments-overview) section.
|`--data-args <JSON>`| JSON string of arguments to pass to dataset loading. See [Data Arguments Overview](../datasets.md#data-arguments-overview) for details. |
|`--prefix-tokens <NUMBER>`| Single prefix token count (alternative to `prefix_tokens` in config). |
|`--include-prefix-in-token-count`| Include prefix tokens in the prompt token count calculation (flag). When enabled, prefix trimming is disabled and the prefix is kept as-is. |
|`--random-seed <NUMBER>`| Random seed for reproducible token sampling (default: 42). |
|`--push-to-hub`| Push the processed dataset to Hugging Face Hub (flag). |
- The `--config` parameter accepts a `PreprocessDatasetConfig` which includes all token count fields (prompt_tokens, output_tokens, etc.) plus `prefix_tokens_max` for controlling prefix length. See the [Configuration and Processor Options](#configuration-and-processor-options) section above for all available parameters.
- The processor/tokenizer is required because the preprocessing command needs to tokenize prompts to ensure they match target token sizes. See the [Data Arguments Overview](../datasets.md#data-arguments-overview) for processor usage details.
- Column mappings are only needed when your dataset uses non-standard column names. GuideLLM will automatically try common column names if no mapping is provided.
- When using `--short-prompt-strategy concatenate`, ensure your dataset has enough samples to concatenate, or some prompts may be skipped.
- The output format is determined by the file extension of `OUTPUT_PATH` (e.g., `.jsonl`, `.csv`, `.parquet`).
- The prefix handling only trims prefixes. It doesn't expand them. Use `prefix_tokens_max` in the config to set a maximum prefix length, which will trim prefixes that exceed this limit.
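The trim-only behavior can be sketched as follows (hypothetical names, not GuideLLM internals):

```python
def trim_prefix(prefix_token_ids, prefix_tokens_max=None):
    """Trim a tokenized prefix to at most prefix_tokens_max tokens.

    Prefixes are only ever shortened; a prefix already under the limit
    (or with no limit set) is returned unchanged, never padded or expanded.
    """
    if prefix_tokens_max is None:
        return prefix_token_ids                  # no limit: keep as-is
    return prefix_token_ids[:prefix_tokens_max]  # trim, never expand
```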