33
33
from __future__ import annotations
34
34
35
35
import abc
36
- from collections .abc import Iterable
36
+ from collections .abc import Iterable , Sequence
37
37
import dataclasses
38
38
import json
39
39
import os
@@ -194,6 +194,9 @@ def __init__(
194
194
license : str | None = None , # pylint: disable=redefined-builtin
195
195
redistribution_info : Optional [dict [str , str ]] = None ,
196
196
split_dict : Optional [splits_lib .SplitDict ] = None ,
197
+ alternative_file_formats : (
198
+ Sequence [str | file_adapters .FileFormat ] | None
199
+ ) = None ,
197
200
# LINT.ThenChange(:setstate)
198
201
):
199
202
# pyformat: disable
@@ -238,6 +241,8 @@ def __init__(
238
241
subfield will automatically be written to a LICENSE file stored with the
239
242
dataset.
240
243
split_dict: information about the splits in this dataset.
244
+ alternative_file_formats: alternative file formats that are available for
245
+ this dataset.
241
246
"""
242
247
# pyformat: enable
243
248
self ._builder_or_identity = builder
@@ -246,6 +251,13 @@ def __init__(
246
251
else :
247
252
self ._identity = DatasetIdentity .from_builder (builder )
248
253
254
+ self ._alternative_file_formats : list [file_adapters .FileFormat ] = []
255
+ if alternative_file_formats :
256
+ for file_format in alternative_file_formats :
257
+ if isinstance (file_format , str ):
258
+ file_format = file_adapters .FileFormat .from_value (file_format )
259
+ self .add_alternative_file_format (file_format )
260
+
249
261
self ._info_proto = dataset_info_pb2 .DatasetInfo (
250
262
name = self ._identity .name ,
251
263
description = utils .dedent (description ),
@@ -260,6 +272,9 @@ def __init__(
260
272
redistribution_info = _create_redistribution_info_proto (
261
273
license = license , redistribution_info = redistribution_info
262
274
),
275
+ alternative_file_formats = [
276
+ f .value for f in self ._alternative_file_formats
277
+ ],
263
278
)
264
279
265
280
if homepage :
@@ -328,6 +343,7 @@ def from_proto(
328
343
repeated_split_infos = proto .splits ,
329
344
filename_template = filename_template ,
330
345
),
346
+ alternative_file_formats = proto .alternative_file_formats ,
331
347
)
332
348
333
349
@property
@@ -415,6 +431,10 @@ def download_size(self, size):
415
431
def features (self ):
416
432
return self ._features
417
433
434
@property
def alternative_file_formats(self) -> Sequence[file_adapters.FileFormat]:
  """Returns the alternative file formats recorded on this object.

  The returned object is the internal list itself, not a copy.
  """
  registered_formats = self._alternative_file_formats
  return registered_formats
437
+
418
438
@property
419
439
def metadata (self ) -> Metadata | None :
420
440
return self ._metadata
@@ -444,6 +464,7 @@ def set_file_format(
444
464
self ,
445
465
file_format : None | str | file_adapters .FileFormat ,
446
466
override : bool = False ,
467
+ override_if_initialized : bool = False ,
447
468
) -> None :
448
469
"""Internal function to define the file format.
449
470
@@ -454,6 +475,8 @@ def set_file_format(
454
475
file_format: The file format.
455
476
override: Whether the file format should be overridden if it is already
456
477
set.
478
+ override_if_initialized: Whether the file format should be overridden if
479
+ the DatasetInfo is already fully initialized.
457
480
458
481
Raises:
459
482
ValueError: if the file format was already set and the `override`
@@ -474,12 +497,39 @@ def set_file_format(
474
497
raise ValueError (
475
498
f"File format is already set to { self .file_format } . Got { file_format } "
476
499
)
477
- if override and self ._fully_initialized :
500
+ if override and self ._fully_initialized and not override_if_initialized :
478
501
raise RuntimeError (
479
- "Cannot override the file format "
480
- "when the DatasetInfo is already fully initialized!"
502
+ "Cannot override the file format when the DatasetInfo is already "
503
+ " fully initialized!"
481
504
)
482
505
self ._info_proto .file_format = file_format .value
506
+ if override_if_initialized :
507
+ # Update the splits to point to the new file format.
508
+ updated_split_infos = []
509
+ for split_info in self .splits .values ():
510
+ if split_info .filename_template is None :
511
+ continue
512
+ updated_split_info = split_info .replace (
513
+ filename_template = split_info .filename_template .replace (
514
+ filetype_suffix = file_format .value
515
+ )
516
+ )
517
+ updated_split_infos .append (updated_split_info )
518
+ self ._splits = splits_lib .SplitDict (updated_split_infos )
519
+
520
def add_alternative_file_format(
    self,
    file_format: str | file_adapters.FileFormat,
) -> None:
  """Registers one more alternative file format on this dataset info.

  Args:
    file_format: the format to register. A string is first converted with
      `file_adapters.FileFormat.from_value`; any other value is used as is.

  Raises:
    ValueError: if the format is already among the alternative file formats.
  """
  resolved_format = (
      file_adapters.FileFormat.from_value(file_format)
      if isinstance(file_format, str)
      else file_format
  )
  already_registered = resolved_format in self.alternative_file_formats
  if already_registered:
    message = f"Alternative file format {resolved_format} is already present."
    raise ValueError(message)
  # Record the format both on the Python-side list and on the proto so the two
  # stay in sync.
  self._alternative_file_formats.append(resolved_format)
  self.as_proto.alternative_file_formats.append(resolved_format.value)
483
533
484
534
@property
485
535
def splits (self ) -> splits_lib .SplitDict :
@@ -882,6 +932,7 @@ def __getstate__(self):
882
932
"metadata" : self .metadata ,
883
933
"license" : self .redistribution_info .license ,
884
934
"split_dict" : self .splits ,
935
+ "alternative_file_formats" : self .alternative_file_formats ,
885
936
}
886
937
def __setstate__ (self , state ):
887
938
# LINT.IfChange(setstate)
@@ -896,6 +947,7 @@ def __setstate__(self, state):
896
947
metadata = state ["metadata" ],
897
948
license = state ["license" ],
898
949
split_dict = state ["split_dict" ],
950
+ alternative_file_formats = state ["alternative_file_formats" ],
899
951
)
900
952
# LINT.ThenChange(:dataset_info_args)
901
953
0 commit comments