Skip to content

Commit 93836bb

Browse files
committed
Address review comment regarding the err messages
Signed-off-by: Jared O'Connell <[email protected]>
1 parent 81817f4 commit 93836bb

File tree

1 file changed

+49
-34
lines changed

1 file changed

+49
-34
lines changed

src/guidellm/data/deserializers/deserializer.py

Lines changed: 49 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
from __future__ import annotations
22

3-
import contextlib
43
from collections.abc import Callable
54
from typing import Any, Protocol, Union, runtime_checkable
65

@@ -47,30 +46,16 @@ def deserialize(
4746
remove_columns: list[str] | None = None,
4847
**data_kwargs: dict[str, Any],
4948
) -> Dataset | IterableDataset:
50-
dataset: Dataset | None = None
49+
dataset: Dataset
5150

5251
if type_ is None:
5352
dataset = cls._deserialize_with_registered_deserializers(
5453
data, processor_factory, random_seed, **data_kwargs
5554
)
5655

57-
elif (deserializer_from_type := cls.get_registered_object(type_)) is not None:
58-
if isinstance(deserializer_from_type, type):
59-
deserializer_fn = deserializer_from_type()
60-
else:
61-
deserializer_fn = deserializer_from_type
62-
63-
dataset = deserializer_fn(
64-
data=data,
65-
processor_factory=processor_factory,
66-
random_seed=random_seed,
67-
**data_kwargs,
68-
)
69-
70-
if dataset is None:
71-
raise DataNotSupportedError(
72-
f"No suitable deserializer found for data {data} "
73-
f"with kwargs {data_kwargs} and deserializer type {type_}."
56+
else:
57+
dataset = cls._deserialize_with_specified_deserializer(
58+
data, type_, processor_factory, random_seed, **data_kwargs
7459
)
7560

7661
if resolve_split:
@@ -99,7 +84,7 @@ def _deserialize_with_registered_deserializers(
9984
raise RuntimeError("registry is None; cannot deserialize dataset")
10085
dataset: Dataset | None = None
10186

102-
errors = []
87+
errors: dict[str, Exception] = {}
10388
# Note: There is no priority order for the deserializers, so all deserializers
10489
# must be mutually exclusive to ensure deterministic behavior.
10590
for _name, deserializer in cls.registry.items():
@@ -108,22 +93,52 @@ def _deserialize_with_registered_deserializers(
10893
)
10994

11095
try:
111-
with contextlib.suppress(DataNotSupportedError):
112-
dataset = deserializer_fn(
113-
data=data,
114-
processor_factory=processor_factory,
115-
random_seed=random_seed,
116-
**data_kwargs,
117-
)
96+
dataset = deserializer_fn(
97+
data=data,
98+
processor_factory=processor_factory,
99+
random_seed=random_seed,
100+
**data_kwargs,
101+
)
118102
except Exception as e: # noqa: BLE001 # The exceptions are saved.
119-
errors.append(e)
103+
errors[_name] = e
120104

121105
if dataset is not None:
122-
break # Found one that works. Continuing could overwrite it.
123-
124-
if dataset is None and len(errors) > 0:
125-
raise DataNotSupportedError(
126-
f"data deserialization failed; {len(errors)} errors occurred while "
127-
f"attempting to deserialize data {data}: {errors}"
106+
return dataset # Success
107+
108+
if len(errors) > 0:
109+
err_msgs = ""
110+
def sort_key(item):
111+
return (isinstance(item[1], DataNotSupportedError), item[0])
112+
for key, err in sorted(errors.items(), key=sort_key):
113+
err_msgs += f"\n - Deserializer '{key}': ({type(err).__name__}) {err}"
114+
raise ValueError(
115+
"Data deserialization failed, likely because the input doesn't "
116+
f"match any of the input formats. See the {len(errors)} error(s) that "
117+
f"occurred while attempting to deserialize the data {data}:{err_msgs}"
128118
)
129119
return dataset
120+
121+
@classmethod
122+
def _deserialize_with_specified_deserializer(
123+
cls,
124+
data: Any,
125+
type_: str,
126+
processor_factory: Callable[[], PreTrainedTokenizerBase],
127+
random_seed: int = 42,
128+
**data_kwargs: dict[str, Any],
129+
) -> Dataset:
130+
deserializer_from_type = cls.get_registered_object(type_)
131+
if deserializer_from_type is None:
132+
raise ValueError(f"Deserializer type '{type_}' is not registered.")
133+
if isinstance(deserializer_from_type, type):
134+
deserializer_fn = deserializer_from_type()
135+
else:
136+
deserializer_fn = deserializer_from_type
137+
138+
return deserializer_fn(
139+
data=data,
140+
processor_factory=processor_factory,
141+
random_seed=random_seed,
142+
**data_kwargs,
143+
)
144+

0 commit comments

Comments
 (0)