18
18
from __future__ import annotations
19
19
20
20
import abc
21
+ from collections .abc import Iterable
21
22
import dataclasses
22
23
import functools
23
24
import itertools
26
27
import os
27
28
import re
28
29
import typing
29
- from typing import Any , Dict , Iterable , List , Optional , Union
30
+ from typing import Any , Union
30
31
31
32
from absl import logging
32
33
from etils import epath
@@ -73,7 +74,7 @@ class _AbsoluteInstruction:
73
74
from_ : int # uint (starting index).
74
75
to : int # uint (ending index).
75
76
76
def to_absolute(self, split_infos) -> list['_AbsoluteInstruction']:
  """Returns a single-element list containing this instruction.

  Args:
    split_infos: Accepted for signature compatibility; not used here.

  Returns:
    `[self]`.
  """
  del split_infos  # unused
  return [self]
79
80
@@ -94,9 +95,9 @@ class SplitInfo:
94
95
"""
95
96
96
97
name : str
97
- shard_lengths : List [int ]
98
+ shard_lengths : list [int ]
98
99
num_bytes : int
99
- filename_template : Optional [ naming .ShardedFileTemplate ] = None
100
+ filename_template : naming .ShardedFileTemplate | None = None
100
101
statistics : statistics_pb2 .DatasetFeatureStatistics = dataclasses .field (
101
102
default_factory = statistics_pb2 .DatasetFeatureStatistics ,
102
103
)
@@ -163,7 +164,7 @@ def __repr__(self) -> str:
163
164
)
164
165
165
166
@property
166
- def file_instructions (self ) -> List [shard_utils .FileInstruction ]:
167
+ def file_instructions (self ) -> list [shard_utils .FileInstruction ]:
167
168
"""Returns the list of dict(filename, take, skip).
168
169
169
170
This allows for creating your own `tf.data.Dataset` using the low-level
@@ -199,7 +200,7 @@ def file_instructions(self) -> List[shard_utils.FileInstruction]:
199
200
)
200
201
201
202
@property
202
- def filenames (self ) -> List [str ]:
203
+ def filenames (self ) -> list [str ]:
203
204
"""Returns the list of filenames."""
204
205
if not self .filename_template :
205
206
raise ValueError ('No filename templates available.' )
@@ -208,7 +209,7 @@ def filenames(self) -> List[str]:
208
209
)
209
210
210
211
@property
211
- def filepaths (self ) -> List [epath .Path ]:
212
+ def filepaths (self ) -> list [epath .Path ]:
212
213
"""All the paths for all the files that are part of this split."""
213
214
if not self .filename_template :
214
215
raise ValueError ('No filename templates available.' )
@@ -228,9 +229,9 @@ class MultiSplitInfo(SplitInfo):
228
229
This should only be used to read data and not when producing data.
229
230
"""
230
231
231
- split_infos : List [SplitInfo ] = dataclasses .field (default_factory = list )
232
+ split_infos : list [SplitInfo ] = dataclasses .field (default_factory = list )
232
233
233
- def __init__ (self , name : str , split_infos : List [SplitInfo ]):
234
+ def __init__ (self , name : str , split_infos : list [SplitInfo ]):
234
235
if not split_infos :
235
236
raise ValueError ('Need to pass a non-empty list of SplitInfos' )
236
237
object .__setattr__ (self , 'split_infos' , split_infos )
@@ -262,22 +263,22 @@ def __repr__(self) -> str:
262
263
)
263
264
264
265
@property
265
def file_instructions(self) -> list[shard_utils.FileInstruction]:
  """Returns the file instructions of all `split_infos`, concatenated in order."""
  return [
      file_instruction
      for split_info in self.split_infos
      for file_instruction in split_info.file_instructions
  ]
270
271
271
272
@property
272
def filenames(self) -> list[str]:
  """Returns the filenames of all `split_infos`, concatenated in order."""
  per_split_filenames = (
      split_info.filenames for split_info in self.split_infos
  )
  return list(itertools.chain.from_iterable(per_split_filenames))
278
279
279
280
@property
280
- def filepaths (self ) -> List [epath .Path ]:
281
+ def filepaths (self ) -> list [epath .Path ]:
281
282
"""All the paths for all the files that are part of this split."""
282
283
result = []
283
284
for split_info in self .split_infos :
@@ -301,10 +302,10 @@ class SubSplitInfo:
301
302
"""
302
303
303
304
name : str
304
- file_instructions : List [shard_utils .FileInstruction ]
305
+ file_instructions : list [shard_utils .FileInstruction ]
305
306
306
307
@property
307
def shard_lengths(self) -> list[int]:
  """Returns the `take` value of each file instruction, in order."""
  return [instruction.take for instruction in self.file_instructions]
309
310
310
311
@property
@@ -321,12 +322,12 @@ def num_shards(self) -> int:
321
322
return len (self .file_instructions )
322
323
323
324
@property
324
def filenames(self) -> list[str]:
  """Returns the sorted base names of the files in `file_instructions`."""
  basenames = [
      os.path.basename(instruction.filename)
      for instruction in self.file_instructions
  ]
  basenames.sort()
  return basenames
327
328
328
329
@property
329
def filepaths(self) -> list[epath.Path]:
  """Returns the sorted `epath.Path` of each file in `file_instructions`."""
  paths = [
      epath.Path(instruction.filename)
      for instruction in self.file_instructions
  ]
  paths.sort()
  return paths
332
333
@@ -384,7 +385,7 @@ def __init__(
384
385
split_infos : Iterable [SplitInfo ],
385
386
* ,
386
387
# TODO(b/216470058): remove this parameter
387
- dataset_name : Optional [ str ] = None , # deprecated, please don't use
388
+ dataset_name : str | None = None , # deprecated, please don't use
388
389
):
389
390
super (SplitDict , self ).__init__ (
390
391
{split_info .name : split_info for split_info in split_infos },
@@ -401,7 +402,7 @@ def __getitem__(self, key):
401
402
if not self :
402
403
raise KeyError (
403
404
f'Trying to access `splits[{ key !r} ]` but `splits` is empty. '
404
- 'This likely indicate the dataset has not been generated yet.'
405
+ 'This likely indicates the dataset has not been generated yet.'
405
406
)
406
407
# 1st case: The key exists: `info.splits['train']`
407
408
elif str (key ) in self .keys ():
@@ -435,11 +436,11 @@ def to_proto(self):
435
436
436
437
@property
437
438
def total_num_examples(self):
  """Returns the sum of `num_examples` over all values of this dict."""
  total = 0
  for split_info in self.values():
    total += split_info.num_examples
  return total
440
441
441
442
@classmethod
442
- def merge_multiple (cls , split_dicts : List ['SplitDict' ]) -> 'SplitDict' :
443
+ def merge_multiple (cls , split_dicts : list ['SplitDict' ]) -> 'SplitDict' :
443
444
info_per_split = []
444
445
for split in set (itertools .chain (* split_dicts )):
445
446
infos_of_split = []
@@ -461,7 +462,7 @@ def merge_multiple(cls, split_dicts: List['SplitDict']) -> 'SplitDict':
461
462
def _make_absolute_instructions (
462
463
split_infos : Iterable [SplitInfo ],
463
464
instruction : SplitArg ,
464
- ) -> List [_AbsoluteInstruction ]:
465
+ ) -> list [_AbsoluteInstruction ]:
465
466
if isinstance (instruction , str ):
466
467
instruction = AbstractSplit .from_spec (instruction )
467
468
@@ -473,7 +474,7 @@ def _make_absolute_instructions(
473
474
def _file_instructions_for_split (
474
475
instruction : _AbsoluteInstruction ,
475
476
split_info : SplitInfo ,
476
- ) -> List [shard_utils .FileInstruction ]:
477
+ ) -> list [shard_utils .FileInstruction ]:
477
478
"""Returns the file instructions from the given instruction applied to the given split info."""
478
479
if not split_info .num_examples :
479
480
logging .warning (
@@ -491,9 +492,9 @@ def _file_instructions_for_split(
491
492
492
493
493
494
def _make_file_instructions (
494
- split_infos : List [SplitInfo ],
495
+ split_infos : list [SplitInfo ],
495
496
instruction : SplitArg ,
496
- ) -> List [shard_utils .FileInstruction ]:
497
+ ) -> list [shard_utils .FileInstruction ]:
497
498
"""Returns file instructions by applying the given instruction on the given splits.
498
499
499
500
Args:
@@ -566,7 +567,7 @@ def from_spec(cls, spec: SplitArg) -> 'AbstractSplit':
566
567
return functools .reduce (operator .add , instructions )
567
568
568
569
@abc .abstractmethod
569
- def to_absolute (self , split_infos ) -> List [_AbsoluteInstruction ]:
570
+ def to_absolute (self , split_infos ) -> list [_AbsoluteInstruction ]:
570
571
"""Translate instruction into a list of absolute instructions.
571
572
572
573
Those absolute instructions are then to be added together.
@@ -603,7 +604,7 @@ class _SplitAdd(AbstractSplit):
603
604
def __repr__ (self ):
604
605
return f'{ self .left !r} +{ self .right !r} '
605
606
606
- def to_absolute (self , split_infos ) -> List [_AbsoluteInstruction ]:
607
+ def to_absolute (self , split_infos ) -> list [_AbsoluteInstruction ]:
607
608
# Merge instructions from left and right
608
609
return self .left .to_absolute (split_infos ) + self .right .to_absolute (
609
610
split_infos
@@ -613,7 +614,7 @@ def to_absolute(self, split_infos) -> List[_AbsoluteInstruction]:
613
614
class _SplitAll (AbstractSplit ):
614
615
"""Union of all splits of the dataset."""
615
616
616
- def to_absolute (self , split_infos ) -> List [_AbsoluteInstruction ]:
617
+ def to_absolute (self , split_infos ) -> list [_AbsoluteInstruction ]:
617
618
# Create the union of all splits
618
619
split_names = split_infos .keys ()
619
620
split = AbstractSplit .from_spec ('+' .join (split_names ))
@@ -645,8 +646,8 @@ class ReadInstruction(AbstractSplit):
645
646
646
647
split_name : str
647
648
# TODO(py3.10): Add `_ = dataclasses.KW_ONLY`
648
- from_ : Optional [ int | float ] = None
649
- to : Optional [ int | float ] = None
649
+ from_ : int | float | None = None
650
+ to : int | float | None = None
650
651
unit : str = 'abs'
651
652
rounding : str = 'closest'
652
653
@@ -681,7 +682,7 @@ def __repr__(self) -> str:
681
682
rounding = f', rounding={ self .rounding !r} ' if self .unit == '%' else ''
682
683
return f"ReadInstruction('{ self .split_name } { slice_str } '{ rounding } )"
683
684
684
def to_absolute(self, split_infos) -> list[_AbsoluteInstruction]:
  """Returns a one-element list with the result of `_rel_to_abs_instr`.

  Args:
    split_infos: Forwarded unchanged to `_rel_to_abs_instr` together with
      this instruction.
  """
  absolute_instruction = _rel_to_abs_instr(self, split_infos)
  return [absolute_instruction]
686
687
687
688
@@ -763,7 +764,7 @@ def _pct_to_abs_closest(boundary, num_examples: int) -> int:
763
764
764
765
def _rel_to_abs_instr (
765
766
rel_instr : ReadInstruction ,
766
- split_infos : Dict [str , SplitInfo ],
767
+ split_infos : dict [str , SplitInfo ],
767
768
) -> _AbsoluteInstruction :
768
769
"""Returns _AbsoluteInstruction instance for given RelativeInstruction.
769
770
0 commit comments