11from __future__ import annotations
22
3- import contextlib
43from collections .abc import Callable
54from typing import Any , Protocol , Union , runtime_checkable
65
@@ -47,30 +46,16 @@ def deserialize(
4746 remove_columns : list [str ] | None = None ,
4847 ** data_kwargs : dict [str , Any ],
4948 ) -> Dataset | IterableDataset :
50- dataset : Dataset | None = None
49+ dataset : Dataset
5150
5251 if type_ is None :
5352 dataset = cls ._deserialize_with_registered_deserializers (
5453 data , processor_factory , random_seed , ** data_kwargs
5554 )
5655
57- elif (deserializer_from_type := cls .get_registered_object (type_ )) is not None :
58- if isinstance (deserializer_from_type , type ):
59- deserializer_fn = deserializer_from_type ()
60- else :
61- deserializer_fn = deserializer_from_type
62-
63- dataset = deserializer_fn (
64- data = data ,
65- processor_factory = processor_factory ,
66- random_seed = random_seed ,
67- ** data_kwargs ,
68- )
69-
70- if dataset is None :
71- raise DataNotSupportedError (
72- f"No suitable deserializer found for data { data } "
73- f"with kwargs { data_kwargs } and deserializer type { type_ } ."
56+ else :
57+ dataset = cls ._deserialize_with_specified_deserializer (
58+ data , type_ , processor_factory , random_seed , ** data_kwargs
7459 )
7560
7661 if resolve_split :
@@ -99,7 +84,7 @@ def _deserialize_with_registered_deserializers(
9984 raise RuntimeError ("registry is None; cannot deserialize dataset" )
10085 dataset : Dataset | None = None
10186
102- errors = []
87+ errors : dict [ str , Exception ] = {}
10388 # Note: There is no priority order for the deserializers, so all deserializers
10489 # must be mutually exclusive to ensure deterministic behavior.
10590 for _name , deserializer in cls .registry .items ():
@@ -108,22 +93,52 @@ def _deserialize_with_registered_deserializers(
10893 )
10994
11095 try :
111- with contextlib .suppress (DataNotSupportedError ):
112- dataset = deserializer_fn (
113- data = data ,
114- processor_factory = processor_factory ,
115- random_seed = random_seed ,
116- ** data_kwargs ,
117- )
96+ dataset = deserializer_fn (
97+ data = data ,
98+ processor_factory = processor_factory ,
99+ random_seed = random_seed ,
100+ ** data_kwargs ,
101+ )
118102 except Exception as e : # noqa: BLE001 # The exceptions are saved.
119- errors . append ( e )
103+ errors [ _name ] = e
120104
121105 if dataset is not None :
122- break # Found one that works. Continuing could overwrite it.
123-
124- if dataset is None and len (errors ) > 0 :
125- raise DataNotSupportedError (
126- f"data deserialization failed; { len (errors )} errors occurred while "
127- f"attempting to deserialize data { data } : { errors } "
106+ return dataset # Success
107+
108+ if len (errors ) > 0 :
109+ err_msgs = ""
110+ def sort_key (item ):
111+ return (isinstance (item [1 ], DataNotSupportedError ), item [0 ])
112+ for key , err in sorted (errors .items (), key = sort_key ):
113+ err_msgs += f"\n - Deserializer '{ key } ': ({ type (err ).__name__ } ) { err } "
114+ raise ValueError (
115+ "Data deserialization failed, likely because the input doesn't "
116+ f"match any of the input formats. See the { len (errors )} error(s) that "
117+ f"occurred while attempting to deserialize the data { data } :{ err_msgs } "
128118 )
129119 return dataset
120+
121+ @classmethod
122+ def _deserialize_with_specified_deserializer (
123+ cls ,
124+ data : Any ,
125+ type_ : str ,
126+ processor_factory : Callable [[], PreTrainedTokenizerBase ],
127+ random_seed : int = 42 ,
128+ ** data_kwargs : dict [str , Any ],
129+ ) -> Dataset :
130+ deserializer_from_type = cls .get_registered_object (type_ )
131+ if deserializer_from_type is None :
132+ raise ValueError (f"Deserializer type '{ type_ } ' is not registered." )
133+ if isinstance (deserializer_from_type , type ):
134+ deserializer_fn = deserializer_from_type ()
135+ else :
136+ deserializer_fn = deserializer_from_type
137+
138+ return deserializer_fn (
139+ data = data ,
140+ processor_factory = processor_factory ,
141+ random_seed = random_seed ,
142+ ** data_kwargs ,
143+ )
144+
0 commit comments