@@ -177,15 +177,20 @@ def _find_files_without_glob(
177
177
178
178
179
179
def _find_files_with_glob (
180
- folder : epath .Path , globs : list [str ], file_names : list [str ]
180
+ folder : epath .Path ,
181
+ globs : list [str ],
182
+ file_names : list [str ],
181
183
) -> Iterator [epath .Path ]:
182
184
"""Finds files matching any of the given globs and given file names."""
183
185
for glob in globs :
186
+ found_files = folder .glob (glob )
184
187
try :
185
- for file in folder . glob ( glob ) :
188
+ for file in found_files :
186
189
if file .name in file_names :
187
190
yield file
188
- except OSError :
191
+ except (
192
+ OSError ,
193
+ ):
189
194
# If permission was denied on any subfolder, then the glob fails. Manually
190
195
# iterate through the subfolders instead to be more robust against this.
191
196
yield from _find_files_without_glob (folder , globs , file_names )
@@ -197,6 +202,7 @@ def _find_references_with_glob(
197
202
is_dataset_dir : bool ,
198
203
namespace : str | None = None ,
199
204
include_old_tfds_version : bool = True ,
205
+ glob_suffixes : Sequence [str ] = ('json' ,),
200
206
) -> Iterator [naming .DatasetReference ]:
201
207
"""Yields all dataset references in the given folder.
202
208
@@ -208,6 +214,8 @@ def _find_references_with_glob(
208
214
namespace: Optional namespace to which the found datasets belong.
209
215
include_old_tfds_version: include datasets that have been generated with
210
216
TFDS before 4.0.0.
217
+ glob_suffixes: list of file suffixes to use to create the glob for
218
+ interesting TFDS files. Defaults to json files.
211
219
212
220
Yields:
213
221
all dataset references in the given folder.
@@ -220,16 +228,26 @@ def _find_references_with_glob(
220
228
if is_data_dir :
221
229
data_dir = folder
222
230
dataset_name = None
223
- globs = ['*/*/*/*.json ' , '*/*/*.json ' ]
231
+ stars = ['*/*/*/*' , '*/*/*' ]
224
232
else :
225
233
data_dir = folder .parent
226
234
dataset_name = folder .name
227
- globs = ['*/*/*.json' , '*/*.json' ]
235
+ stars = ['*/*/*' , '*/*' ]
236
+
237
+ globs = [f'{ star } .{ suffix } ' for star in stars for suffix in glob_suffixes ] # pylint:disable=g-complex-comprehension
228
238
229
239
# Check files matching the globs and are files we are interested in.
230
240
matched_files_per_folder = collections .defaultdict (set )
231
- file_names = [constants .FEATURES_FILENAME , constants .DATASET_INFO_FILENAME ]
232
- for file in _find_files_with_glob (folder , globs = globs , file_names = file_names ):
241
+ file_names = [
242
+ constants .FEATURES_FILENAME ,
243
+ constants .DATASET_INFO_FILENAME ,
244
+ ]
245
+
246
+ for file in _find_files_with_glob (
247
+ folder ,
248
+ globs = globs ,
249
+ file_names = file_names ,
250
+ ):
233
251
matched_files_per_folder [file .parent ].add (file .name )
234
252
235
253
for data_folder , matched_files in matched_files_per_folder .items ():
@@ -284,6 +302,7 @@ def _find_references_with_glob(
284
302
dataset_name = dataset_name ,
285
303
config = config ,
286
304
version = version ,
305
+ info_filenames = matched_files ,
287
306
)
288
307
289
308
@@ -292,6 +311,7 @@ def list_dataset_variants(
292
311
namespace : str | None = None ,
293
312
include_versions : bool = True ,
294
313
include_old_tfds_version : bool = False ,
314
+ glob_suffixes : Sequence [str ] = ('json' ,),
295
315
) -> Iterator [naming .DatasetReference ]:
296
316
"""Yields all variants (config + version) found in `dataset_dir`.
297
317
@@ -301,6 +321,8 @@ def list_dataset_variants(
301
321
include_versions: whether to list what versions are available.
302
322
include_old_tfds_version: include datasets that have been generated with
303
323
TFDS before 4.0.0.
324
+ glob_suffixes: list of file suffixes to use to create the glob for
325
+ interesting TFDS files. Defaults to json files.
304
326
305
327
Yields:
306
328
all variants of the given dataset.
@@ -313,6 +335,7 @@ def list_dataset_variants(
313
335
is_dataset_dir = True ,
314
336
namespace = namespace ,
315
337
include_old_tfds_version = include_old_tfds_version ,
338
+ glob_suffixes = glob_suffixes ,
316
339
):
317
340
if include_versions :
318
341
key = f'{ reference .dataset_name } /{ reference .config } :{ reference .version } '
0 commit comments