@@ -34,11 +34,11 @@ def __call__(
3434 processor_factory : Callable [[], PreTrainedTokenizerBase ],
3535 random_seed : int ,
3636 ** data_kwargs : dict [str , Any ],
37- ) -> dict [ str , list ] :
37+ ) -> Dataset :
3838 _ = (processor_factory , random_seed ) # Ignore unused args format errors
3939
4040 if (
41- not isinstance (data , ( str , Path ) )
41+ not isinstance (data , str | Path )
4242 or not (path := Path (data )).exists ()
4343 or not path .is_file ()
4444 or path .suffix .lower () not in {".txt" , ".text" }
@@ -62,10 +62,10 @@ def __call__(
6262 processor_factory : Callable [[], PreTrainedTokenizerBase ],
6363 random_seed : int ,
6464 ** data_kwargs : dict [str , Any ],
65- ) -> dict [ str , list ] :
65+ ) -> Dataset :
6666 _ = (processor_factory , random_seed )
6767 if (
68- not isinstance (data , ( str , Path ) )
68+ not isinstance (data , str | Path )
6969 or not (path := Path (data )).exists ()
7070 or not path .is_file ()
7171 or path .suffix .lower () != ".csv"
@@ -86,10 +86,10 @@ def __call__(
8686 processor_factory : Callable [[], PreTrainedTokenizerBase ],
8787 random_seed : int ,
8888 ** data_kwargs : dict [str , Any ],
89- ) -> dict [ str , list ] :
89+ ) -> Dataset :
9090 _ = (processor_factory , random_seed )
9191 if (
92- not isinstance (data , ( str , Path ) )
92+ not isinstance (data , str | Path )
9393 or not (path := Path (data )).exists ()
9494 or not path .is_file ()
9595 or path .suffix .lower () not in {".json" , ".jsonl" }
@@ -110,10 +110,10 @@ def __call__(
110110 processor_factory : Callable [[], PreTrainedTokenizerBase ],
111111 random_seed : int ,
112112 ** data_kwargs : dict [str , Any ],
113- ) -> dict [ str , list ] :
113+ ) -> Dataset :
114114 _ = (processor_factory , random_seed )
115115 if (
116- not isinstance (data , ( str , Path ) )
116+ not isinstance (data , str | Path )
117117 or not (path := Path (data )).exists ()
118118 or not path .is_file ()
119119 or path .suffix .lower () != ".parquet"
@@ -134,10 +134,10 @@ def __call__(
134134 processor_factory : Callable [[], PreTrainedTokenizerBase ],
135135 random_seed : int ,
136136 ** data_kwargs : dict [str , Any ],
137- ) -> dict [ str , list ] :
137+ ) -> Dataset :
138138 _ = (processor_factory , random_seed )
139139 if (
140- not isinstance (data , ( str , Path ) )
140+ not isinstance (data , str | Path )
141141 or not (path := Path (data )).exists ()
142142 or not path .is_file ()
143143 or path .suffix .lower () != ".arrow"
@@ -158,10 +158,10 @@ def __call__(
158158 processor_factory : Callable [[], PreTrainedTokenizerBase ],
159159 random_seed : int ,
160160 ** data_kwargs : dict [str , Any ],
161- ) -> dict [ str , list ] :
161+ ) -> Dataset :
162162 _ = (processor_factory , random_seed )
163163 if (
164- not isinstance (data , ( str , Path ) )
164+ not isinstance (data , str | Path )
165165 or not (path := Path (data )).exists ()
166166 or not path .is_file ()
167167 or path .suffix .lower () not in {".hdf5" , ".h5" }
@@ -185,7 +185,7 @@ def __call__(
185185 ) -> dict [str , list ]:
186186 _ = (processor_factory , random_seed )
187187 if (
188- not isinstance (data , ( str , Path ) )
188+ not isinstance (data , str | Path )
189189 or not (path := Path (data )).exists ()
190190 or not path .is_file ()
191191 or path .suffix .lower () != ".db"
@@ -209,7 +209,7 @@ def __call__(
209209 ) -> dict [str , list ]:
210210 _ = (processor_factory , random_seed )
211211 if (
212- not isinstance (data , ( str , Path ) )
212+ not isinstance (data , str | Path )
213213 or not (path := Path (data )).exists ()
214214 or not path .is_file ()
215215 or path .suffix .lower () != ".tar"
0 commit comments