
Commit 5c06a8a

Fix data package type and quality errors
Signed-off-by: Jared O'Connell <[email protected]>
1 parent 2cbb7c9 commit 5c06a8a

13 files changed: 143 additions & 101 deletions

src/guidellm/data/deserializers/deserializer.py

Lines changed: 52 additions & 32 deletions
@@ -4,7 +4,7 @@
 from collections.abc import Callable
 from typing import Any, Protocol, Union, runtime_checkable
 
-from datasets import Dataset, IterableDataset
+from datasets import Dataset, DatasetDict, IterableDataset, IterableDatasetDict
 from transformers import PreTrainedTokenizerBase
 
 from guidellm.data.utils import resolve_dataset_split
@@ -29,7 +29,7 @@ def __call__(
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]: ...
+    ) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict: ...
 
 
 class DatasetDeserializerFactory(
@@ -47,40 +47,19 @@ def deserialize(
         remove_columns: list[str] | None = None,
         **data_kwargs: dict[str, Any],
     ) -> Dataset | IterableDataset:
-        dataset = None
+        dataset: Dataset | None = None
 
         if type_ is None:
-            errors = []
-            # Note: There is no priority order for the deserializers, so all deserializers
-            # must be mutually exclusive to ensure deterministic behavior.
-            for name, deserializer in cls.registry.items():
-                deserializer_fn: DatasetDeserializer = (
-                    deserializer() if isinstance(deserializer, type) else deserializer
-                )
-
-                try:
-                    with contextlib.suppress(DataNotSupportedError):
-                        dataset = deserializer_fn(
-                            data=data,
-                            processor_factory=processor_factory,
-                            random_seed=random_seed,
-                            **data_kwargs,
-                        )
-                except Exception as e:
-                    errors.append(e)
-
-                if dataset is not None:
-                    break  # Found one that works. Continuing could overwrite it.
-
-            if dataset is None and len(errors) > 0:
-                raise DataNotSupportedError(f"data deserialization failed; {len(errors)} errors occurred while "
-                                            f"attempting to deserialize data {data}: {errors}")
-
-        elif deserializer := cls.get_registered_object(type_) is not None:
-            deserializer_fn: DatasetDeserializer = (
-                deserializer() if isinstance(deserializer, type) else deserializer
+            dataset = cls._deserialize_with_registered_deserializers(
+                data, processor_factory, random_seed, **data_kwargs
             )
 
+        elif (deserializer_from_type := cls.get_registered_object(type_)) is not None:
+            if isinstance(deserializer_from_type, type):
+                deserializer_fn = deserializer_from_type()
+            else:
+                deserializer_fn = deserializer_from_type
+
             dataset = deserializer_fn(
                 data=data,
                 processor_factory=processor_factory,
@@ -107,3 +86,44 @@ def deserialize(
             dataset = dataset.remove_columns(remove_columns)
 
         return dataset
+
+    @classmethod
+    def _deserialize_with_registered_deserializers(
+        cls,
+        data: Any,
+        processor_factory: Callable[[], PreTrainedTokenizerBase],
+        random_seed: int = 42,
+        **data_kwargs: dict[str, Any],
+    ) -> Dataset:
+        if cls.registry is None:
+            raise RuntimeError("registry is None; cannot deserialize dataset")
+        dataset: Dataset | None = None
+
+        errors = []
+        # Note: There is no priority order for the deserializers, so all deserializers
+        # must be mutually exclusive to ensure deterministic behavior.
+        for _name, deserializer in cls.registry.items():
+            deserializer_fn: DatasetDeserializer = (
+                deserializer() if isinstance(deserializer, type) else deserializer
+            )
+
+            try:
+                with contextlib.suppress(DataNotSupportedError):
+                    dataset = deserializer_fn(
+                        data=data,
+                        processor_factory=processor_factory,
+                        random_seed=random_seed,
+                        **data_kwargs,
+                    )
+            except Exception as e:  # noqa: BLE001 # The exceptions are saved.
+                errors.append(e)
+
+            if dataset is not None:
+                break  # Found one that works. Continuing could overwrite it.
+
+        if dataset is None and len(errors) > 0:
+            raise DataNotSupportedError(
+                f"data deserialization failed; {len(errors)} errors occurred while "
+                f"attempting to deserialize data {data}: {errors}"
+            )
+        return dataset
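
Note on the deserializer.py fix: in the removed branch, `elif deserializer := cls.get_registered_object(type_) is not None:` binds `deserializer` to the result of the `is not None` comparison, because `:=` has lower precedence than `is`. The rewritten branch parenthesizes the assignment so the registered object itself is captured. A minimal sketch of the pitfall, using a hypothetical `lookup` stand-in rather than the real registry:

    # `lookup` is a hypothetical stand-in for cls.get_registered_object.
    def lookup(key: str) -> str | None:
        return {"csv": "CSVDeserializer"}.get(key)

    # Without parentheses, `:=` binds the whole comparison...
    if obj := lookup("csv") is not None:
        print(obj)  # True -- the boolean, not the deserializer

    # ...with parentheses, as in the fixed code, `obj` holds the object.
    if (obj := lookup("csv")) is not None:
        print(obj)  # CSVDeserializer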

src/guidellm/data/deserializers/file.py

Lines changed: 14 additions & 14 deletions
@@ -34,11 +34,11 @@ def __call__(
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)  # Ignore unused args format errors
 
         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() not in {".txt", ".text"}
@@ -62,10 +62,10 @@ def __call__(
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)
         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() != ".csv"
@@ -86,10 +86,10 @@ def __call__(
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)
         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() not in {".json", ".jsonl"}
@@ -110,10 +110,10 @@ def __call__(
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)
         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() != ".parquet"
@@ -134,10 +134,10 @@ def __call__(
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)
         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() != ".arrow"
@@ -158,10 +158,10 @@ def __call__(
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)
         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() not in {".hdf5", ".h5"}
@@ -185,7 +185,7 @@ def __call__(
     ) -> dict[str, list]:
         _ = (processor_factory, random_seed)
         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() != ".db"
@@ -209,7 +209,7 @@ def __call__(
     ) -> dict[str, list]:
         _ = (processor_factory, random_seed)
         if (
-            not isinstance(data, (str, Path))
+            not isinstance(data, str | Path)
             or not (path := Path(data)).exists()
             or not path.is_file()
             or path.suffix.lower() != ".tar"
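
Note on the file.py changes: `isinstance(data, (str, Path))` and `isinstance(data, str | Path)` are equivalent; PEP 604 union objects built with `|` are accepted directly by `isinstance` on Python 3.10+, which this rewrite implies the codebase targets. A quick check of that assumption:

    from pathlib import Path

    # PEP 604 unions (Python 3.10+) work directly in isinstance checks,
    # so the tuple form and the union form behave identically here.
    assert isinstance("data.csv", str | Path)
    assert isinstance(Path("data.csv"), str | Path)
    assert not isinstance(42, str | Path)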

src/guidellm/data/deserializers/huggingface.py

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@ def __call__(
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict:
         _ = (processor_factory, random_seed)
 
         if isinstance(
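
Note on the huggingface.py change: the widened return type matches how the datasets library actually behaves; `load_dataset` returns a `DatasetDict` (or `IterableDatasetDict` when streaming) if no split is requested, and a single `Dataset`/`IterableDataset` if one is. A sketch, assuming network access to the public "squad" dataset:

    from datasets import Dataset, DatasetDict, load_dataset

    # No split requested: a mapping of split name -> Dataset comes back.
    ds = load_dataset("squad")
    assert isinstance(ds, DatasetDict)

    # A specific split requested: a single Dataset comes back,
    # hence the union return type above.
    train = load_dataset("squad", split="train")
    assert isinstance(train, Dataset)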

src/guidellm/data/deserializers/memory.py

Lines changed: 20 additions & 18 deletions
@@ -33,7 +33,7 @@ def __call__(
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)  # Ignore unused args format errors
 
         if (
@@ -67,7 +67,7 @@ def __call__(
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)  # Ignore unused args format errors
 
         if (
@@ -81,9 +81,9 @@ def __call__(
                 f"expected list of dicts, got {data}"
             )
 
-        data: list[dict[str, Any]] = cast("list[dict[str, Any]]", data)
-        first_keys = set(data[0].keys())
-        for index, item in enumerate(data):
+        typed_data: list[dict[str, Any]] = cast("list[dict[str, Any]]", data)
+        first_keys = set(typed_data[0].keys())
+        for index, item in enumerate(typed_data):
             if set(item.keys()) != first_keys:
                 raise DataNotSupportedError(
                     f"All dictionaries must have the same keys. "
@@ -92,8 +92,8 @@ def __call__(
             )
 
         # Convert list of dicts to dict of lists
-        result_dict = {key: [] for key in first_keys}
-        for item in data:
+        result_dict: dict = {key: [] for key in first_keys}
+        for item in typed_data:
             for key, value in item.items():
                 result_dict[key].append(value)
 
@@ -108,7 +108,7 @@ def __call__(
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         _ = (processor_factory, random_seed)  # Ignore unused args format errors
 
         primitive_types = (str, int, float, bool, type(None))
@@ -135,7 +135,7 @@ def __call__(
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         if (
             isinstance(data, str)
             and (json_str := data.strip())
@@ -145,16 +145,18 @@ def __call__(
             )
         ):
             with contextlib.suppress(Exception):
-                parsed = json.loads(data)
+                parsed_data = json.loads(data)
 
-            for deserializer in [
-                InMemoryDictDatasetDeserializer,
-                InMemoryDictListDatasetDeserializer,
-                InMemoryItemListDatasetDeserializer,
-            ]:
+            deserializers = [
+                InMemoryDictDatasetDeserializer(),
+                InMemoryDictListDatasetDeserializer(),
+                InMemoryItemListDatasetDeserializer(),
+            ]
+
+            for deserializer in deserializers:
                 with contextlib.suppress(DataNotSupportedError):
-                    return deserializer()(
-                        parsed, data_kwargs, processor_factory, random_seed
+                    return deserializer(
+                        parsed_data, processor_factory, random_seed, **data_kwargs
                     )
 
         raise DataNotSupportedError(
@@ -171,7 +173,7 @@ def __call__(
         processor_factory: Callable[[], PreTrainedTokenizerBase],
         random_seed: int,
         **data_kwargs: dict[str, Any],
-    ) -> dict[str, list]:
+    ) -> Dataset:
         if (
             isinstance(data, str)
             and (csv_str := data.strip())
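
Note on the memory.py change: beyond the renames, the JSON path fixes a real call bug; the old loop invoked `deserializer()(parsed, data_kwargs, processor_factory, random_seed)`, passing four positional arguments to a protocol that accepts three plus keyword extras. A minimal sketch with a hypothetical function mirroring the protocol's signature:

    from typing import Any

    # Hypothetical stand-in mirroring the deserializer __call__ signature.
    def deserialize(data: Any, processor_factory: Any, random_seed: int, **kw: Any):
        return data, random_seed, kw

    options = {"split": "train"}

    # Old call shape: a fourth positional argument, so this raises TypeError.
    try:
        deserialize("payload", options, None, 42)
    except TypeError as err:
        print(err)

    # Fixed call shape: unpack the extra options as keywords instead.
    assert deserialize("payload", None, 42, **options) == ("payload", 42, options)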

src/guidellm/data/deserializers/synthetic.py

Lines changed: 16 additions & 12 deletions
@@ -99,21 +99,25 @@ class SyntheticTextDatasetConfig(StandardBaseModel):
 
     @model_validator(mode="after")
     def check_prefix_options(self) -> SyntheticTextDatasetConfig:
-        prefix_count = self.__pydantic_extra__.get("prefix_count", None)  # type: ignore[attr-defined]
-        prefix_tokens = self.__pydantic_extra__.get("prefix_tokens", None)  # type: ignore[attr-defined]
-        if prefix_count is not None or prefix_tokens is not None:
-            if self.prefix_buckets:
+        prefix_count: Any | None = None
+        prefix_tokens: Any | None = None
+        if self.__pydantic_extra__ is not None:
+            prefix_count = self.__pydantic_extra__.get("prefix_count", None)  # type: ignore[attr-defined]
+            prefix_tokens = self.__pydantic_extra__.get("prefix_tokens", None)  # type: ignore[attr-defined]
+
+        if (prefix_count is not None or prefix_tokens is not None
+                and self.prefix_buckets):
                 raise ValueError(
                     "prefix_buckets is mutually exclusive"
                     " with prefix_count and prefix_tokens"
                 )
 
-            self.prefix_buckets = [
-                SyntheticTextPrefixBucketConfig(
-                    prefix_count=prefix_count or 1,
-                    prefix_tokens=prefix_tokens or 0,
-                )
-            ]
+        self.prefix_buckets = [
+            SyntheticTextPrefixBucketConfig(
+                prefix_count=prefix_count or 1,
+                prefix_tokens=prefix_tokens or 0,
+            )
+        ]
 
         return self
 
@@ -174,14 +178,14 @@ def __iter__(self) -> Iterator[dict[str, Any]]:
     def _create_prompt(
         self, prompt_tokens_count: int, faker: Faker, unique: str = ""
     ) -> str:
-        prompt_token_ids = []
+        prompt_token_ids: list[int] = []
         avg_chars_per_token = 5
         margin_of_safety = 1.5
         attempts = 0
 
         while len(prompt_token_ids) < prompt_tokens_count:
             attempts += 1
-            num_chars = (
+            num_chars = math.ceil(
                 prompt_tokens_count * avg_chars_per_token * margin_of_safety * attempts
            )
             text = unique + faker.text(max_nb_chars=num_chars)
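
Note on the synthetic.py change: `margin_of_safety = 1.5` makes the character budget a float, while Faker's `text(max_nb_chars=...)` is typed to take an int, so the new `math.ceil` keeps the safety margin while yielding an integer. A small sketch, assuming the faker package is installed:

    import math

    from faker import Faker

    faker = Faker()

    # Ceiling the float budget gives Faker the int it expects.
    num_chars = math.ceil(128 * 5 * 1.5)  # 960 as an int, not 960.0
    print(faker.text(max_nb_chars=num_chars)[:60])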
