@@ -105,8 +105,16 @@ class ShardInstruction:
   out_path: epath.Path
   config: ConvertConfig

-  def convert(self) -> None:
-    """Converts the shard to the desired file format."""
+  def convert(self) -> epath.Path | None:
+    """Converts the shard to the desired file format.
+
+    Returns:
+      The path of the converted shard or `None` if the shard was not converted.
+
+    Raises:
+      Exception: if the shard conversion failed and `config.fail_on_error` is
+        `True`, else logs the error.
+    """

     def read_in() -> Iterator[type_utils.KeySerializedExample]:
       in_dataset = self.config.in_file_adapter.make_tf_data(
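A side note on the new return annotation: `epath.Path | None` is PEP 604 union syntax, which can only be evaluated at definition time on Python 3.10+. Presumably the module relies on postponed evaluation of annotations; that import is not visible in this diff, so treat the following as an assumption, not part of the change:

```python
# Sketch only (assumed context, not in this diff): with the future import,
# annotations are stored as strings and never evaluated at function
# definition time, so `epath.Path | None` also works on Python 3.9.
from __future__ import annotations

from etils import epath


def convert() -> epath.Path | None:
  """Returns the converted shard's path, or None if it was not converted."""
  return None
```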
@@ -127,12 +135,13 @@ def read_in() -> Iterator[type_utils.KeySerializedExample]:
       self.config.out_file_adapter.write_examples(
           path=tmp_file, iterator=read_in()
       )
+      return self.out_path
     except Exception as e:  # pylint: disable=broad-except
       if self.config.fail_on_error:
         raise e
       else:
         logging.exception(
-            'Failed to convert shard %s (format=%s) to %s (format=%s!',
+            'Failed to convert shard %s (format=%s) to %s (format=%s)!',
             self.in_path,
             self.config.in_file_adapter.FILE_SUFFIX,
             self.out_path,
@@ -220,45 +229,87 @@ def _get_root_data_dir(
   return epath.Path(re.sub(rf'{relative_data_dir}/?$', '', in_dir))


+class ConvertMetadataFn(beam.DoFn):
+  """Beam DoFn to convert metadata for a single dataset version."""
+
+  def process(
+      self,
+      count,
+      in_dir: epath.Path,
+      info: dataset_info_pb2.DatasetInfo,
+      out_path: epath.Path,
+      convert_config: ConvertConfig,
+  ):
+    # This is necessary because `beam.combiners.Count.Globally()` sometimes
+    # returns a plain integer rather than a PCollection.
+    if not isinstance(count, int):
+      count = beam.pvalue.AsSingleton(count)
+    convert_metadata(
+        in_dir=in_dir,
+        info=info,
+        out_path=out_path,
+        convert_config=convert_config,
+        num_converted_shards=count,
+    )
+
+
 def convert_metadata(
     in_dir: epath.Path,
     info: dataset_info_pb2.DatasetInfo,
-    out_file_format: file_adapters.FileFormat,
     out_path: epath.Path,
+    convert_config: ConvertConfig,
+    num_converted_shards: int | None = None,
 ) -> None:
   """Converts all metadata to the converted dataset.

   Args:
     in_dir: folder that contains the dataset to convert.
     info: dataset info of the dataset to convert.
-    out_file_format: the format to which the dataset should be converted to.
     out_path: folder where the converted dataset should be stored.
+    convert_config: configuration for the conversion.
+    num_converted_shards: number of shards that were successfully converted,
+      which is used to check that the conversion was successful. If part of a
+      beam pipeline, this comes from `beam.combiners.Count.Globally()`.
   """
   splits_dict = dataset_info_lib.get_split_dict_from_proto(
       dataset_info_proto=info,
       data_dir=in_dir,
-      file_format=out_file_format,
+      file_format=convert_config.out_file_format,
   )

   missing_shards_per_split = {}
   for split_info in splits_dict.values():
-    available_shards = split_info.get_available_shards(
-        out_path, file_format=out_file_format
+    num_available_shards = len(
+        split_info.get_available_shards(
+            out_path, file_format=convert_config.out_file_format
+        )
     )
-    if len(available_shards) < split_info.num_shards:
+    if num_converted_shards != num_available_shards:
+      logging.warning(
+          'Number of shards converted (%d) does not match the number of'
+          ' available shards in the data dir (%d) for split %s.',
+          num_converted_shards, num_available_shards, split_info.name,
+      )
+    if num_available_shards < split_info.num_shards:
       missing_shards_per_split[split_info.name] = (
-          len(available_shards),
+          num_available_shards,
           split_info.num_shards,
       )
-      logging.warning(
-          'Found %d shards for split %s, but expected %d shards.',
-          len(available_shards),
-          split_info.name,
-          split_info.num_shards,
+      error_message = (
+          (
+              f'Found {num_available_shards} shards for split'
+              f' {split_info.name}, but expected'
+              f' {split_info.num_shards} shards.'
+          )
       )
-    elif len(available_shards) > split_info.num_shards:
+      if convert_config.fail_on_error:
+        raise ValueError(error_message)
+      else:
+        logging.warning(error_message)
+
+    elif num_available_shards > split_info.num_shards:
       raise ValueError(
-          f'Found more shards ({len(available_shards)}) for split'
+          f'Found more shards ({num_available_shards}) for split'
           f' {split_info.name}, but expected only'
           f' {split_info.num_shards} shards.'
       )
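To make the accounting above concrete: the new code performs two independent checks per split — converted-vs-available (warn only) and available-vs-expected (warn or raise depending on `fail_on_error`, and always raise on surplus shards). A self-contained sketch of that decision table, with hypothetical names rather than the TFDS API:

```python
import logging


def check_split(
    num_converted: int,
    num_available: int,
    num_expected: int,
    fail_on_error: bool,
) -> None:
  """Mirrors the per-split checks in convert_metadata (sketch only)."""
  if num_converted != num_available:
    # Mismatch between the pipeline's own count and what is on disk.
    logging.warning(
        'Converted %d shards, but %d are on disk.', num_converted, num_available
    )
  if num_available < num_expected:
    message = f'Found {num_available} shards, expected {num_expected}.'
    if fail_on_error:
      raise ValueError(message)
    logging.warning(message)
  elif num_available > num_expected:
    # Surplus shards always abort: the output dir is in an unexpected state.
    raise ValueError(f'Found more shards ({num_available}) than expected.')


# E.g. 9 of 10 expected shards converted and present: logs a warning, or
# raises when fail_on_error=True.
check_split(9, 9, 10, fail_on_error=False)
```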
@@ -278,14 +329,14 @@ def convert_metadata(

     # File format was added to an existing dataset.
     # Add the file format to `alternative_file_formats` field.
-    if out_file_format not in info.alternative_file_formats:
-      info.alternative_file_formats.append(out_file_format.value)
+    if convert_config.out_file_format not in info.alternative_file_formats:
+      info.alternative_file_formats.append(convert_config.out_file_format.value)
       dataset_info_lib.write_dataset_info_proto(info, dataset_info_dir=out_path)
     else:
       logging.info(
           'File format %s is already an alternative file format of the dataset'
           ' in %s. Skipping updating metadata..',
-          out_file_format.value,
+          convert_config.out_file_format.value,
           os.fspath(in_dir),
       )
       return
@@ -318,7 +369,7 @@ def convert_metadata(
       dataset_info_proto=info,
       dataset_reference=in_dataset_reference,
   )
-  info.file_format = out_file_format.value
+  info.file_format = convert_config.out_file_format.value
   dataset_info_lib.write_dataset_info_proto(info, dataset_info_dir=out_path)


@@ -359,21 +410,46 @@ def _convert_dataset(
   logging.info('Found %d shards to convert.', len(shard_instructions))

   if pipeline is not None:
-    _ = (
+    converted_shards = (
         pipeline
         | f'CreateShardInstructions for {dataset_dir}'
         >> beam.Create(shard_instructions)
         | f'ConvertShards for {dataset_dir}'
         >> beam.Map(lambda shard_instruction: shard_instruction.convert())
+        | f'Filter out shards that were not successfully converted for {dataset_dir}'
+        >> beam.Filter(lambda out_path: out_path is not None)
+    )
+    count_shards = (
+        converted_shards
+        | f'CountConvertedShards for {dataset_dir}'
+        >> beam.combiners.Count.Globally()
+    )
+    _ = count_shards | f'ConvertMetadata for {dataset_dir}' >> beam.ParDo(
+        ConvertMetadataFn(),
+        in_dir=dataset_dir,
+        info=info,
+        out_path=out_dir,
+        convert_config=convert_config,
     )

   else:
+    converted_shards = 0
     for shard_instruction in tqdm.tqdm(
         shard_instructions,
         unit=' shards',
         desc=f'Shards in {os.fspath(dataset_dir)}',
     ):
-      shard_instruction.convert()
+      result = shard_instruction.convert()
+      if result is not None:
+        converted_shards += 1
+    logging.info('Converting metadata in %s.', dataset_dir)
+    convert_metadata(
+        in_dir=dataset_dir,
+        info=info,
+        out_path=out_dir,
+        convert_config=convert_config,
+        num_converted_shards=converted_shards,
+    )


 def _remove_incomplete_files(path: epath.Path) -> None:
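The pipeline wiring above leans on a common Beam idiom: `Count.Globally()` yields a single-element PCollection, so piping it into the metadata `ParDo` both supplies `num_converted_shards` and guarantees that metadata conversion runs only after every shard has been processed. A toy, runnable sketch of the same shape (plain strings stand in for the TFDS types; all names here are hypothetical):

```python
from __future__ import annotations

import apache_beam as beam


def _convert(name: str) -> str | None:
  # Stand-in for ShardInstruction.convert(): None signals a failed shard.
  return None if name == 'bad-shard' else f'/out/{name}'


with beam.Pipeline() as pipeline:  # DirectRunner by default
  converted = (
      pipeline
      | 'CreateShards' >> beam.Create(['shard-0', 'bad-shard', 'shard-1'])
      | 'ConvertShards' >> beam.Map(_convert)
      | 'DropFailures' >> beam.Filter(lambda path: path is not None)
  )
  count = converted | 'CountConverted' >> beam.combiners.Count.Globally()
  # The single-element count PCollection gates the final step.
  _ = count | 'ConvertMetadata' >> beam.Map(
      lambda n: print(f'{n} shards converted; now writing metadata.')
  )
```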
@@ -521,21 +597,9 @@ def _process_get_infos(from_to_dir):
       out_dir=out_dir,
   )

-  logging.info('All shards have been converted. Now converting metadata.')
-  for dataset_dir, info in tqdm.tqdm(
-      found_dataset_versions.items(), unit=' datasets'
-  ):
-    out_dir = from_to_dirs[dataset_dir]
-    logging.info('Converting metadata in %s.', dataset_dir)
-    convert_metadata(
-        in_dir=dataset_dir,
-        info=info,
-        out_file_format=convert_config.out_file_format,
-        out_path=out_dir,
-    )
-
   logging.info(
-      'All metadata has been converted. Now removing incomplete files.'
+      'All metadata and shards have been converted. Now removing incomplete'
+      ' files.'
   )
   for out_dir in from_to_dirs.values():
     logging.info('Removing incomplete files in %s.', out_dir)