
Commit 1a2a924

fineguy authored and The TensorFlow Datasets Authors committed
Add get_file_spec method to DatasetBuilder.
PiperOrigin-RevId: 671362394
1 parent 099144b commit 1a2a924

File tree: 3 files changed, +24 −11 lines changed


tensorflow_datasets/core/dataset_builder.py

Lines changed: 7 additions & 4 deletions
@@ -553,6 +553,11 @@ def get_reference(
         data_dir=self.data_dir_root,
     )
 
+  def get_file_spec(self, split: str) -> str:
+    """Returns the file spec of the split."""
+    split_info: splits_lib.SplitInfo = self.info.splits[split]
+    return split_info.file_spec(self.info.file_format)
+
   def is_prepared(self) -> bool:
     """Returns whether this dataset is already downloaded and prepared."""
     return self.data_path.exists()
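
For context, a quick usage sketch of the new method. The dataset name is illustrative; any prepared builder works, and get_file_spec is the method added by this commit:

import tensorflow_datasets as tfds

# "mnist" is only an example dataset; substitute any prepared dataset.
builder = tfds.builder("mnist")
builder.download_and_prepare()

# Returns a sharded file spec such as ".../mnist-train.tfrecord@N",
# where N is the split's shard count (see the test below).
print(builder.get_file_spec("train"))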
@@ -1082,7 +1087,7 @@ def _should_cache_ds(self, split, shuffle_files, read_config) -> bool:
     # shuffling is enabled, as this would effectively disable shuffling.
     # An exception is for single shard (as shuffling is a no-op).
     # Another exception is if reshuffle is disabled (shuffling already cached)
-    num_shards = len(self.info.splits[split].file_instructions)
+    num_shards = self.info.splits[split].num_shards
     if (
         shuffle_files
         and
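
The splits.py hunk further down replaces len(self.shard_lengths) with the same num_shards property, so the shard count is presumably derived directly from shard_lengths rather than by materializing the full file_instructions list. A minimal sketch of that assumption (not the actual SplitInfo implementation):

import dataclasses

@dataclasses.dataclass
class SplitInfoSketch:
  """Illustrative stand-in for splits_lib.SplitInfo."""

  shard_lengths: list[int]  # number of examples written to each shard

  @property
  def num_shards(self) -> int:
    # One entry in shard_lengths per shard on disk.
    return len(self.shard_lengths)

print(SplitInfoSketch(shard_lengths=[100, 100, 57]).num_shards)  # -> 3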
@@ -1658,9 +1663,7 @@ def _get_filename_template(
         split=split_name,
         dataset_name=self.name,
         data_dir=self.data_path,
-        filetype_suffix=file_adapters.ADAPTER_FOR_FORMAT[
-            self.info.file_format
-        ].FILE_SUFFIX,
+        filetype_suffix=self.info.file_format.file_suffix,  # pytype: disable=attribute-error
     )
 
   def _generate_splits(
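
The new call site assumes the file format object exposes a file_suffix property directly (hence the pytype suppression). A self-contained sketch of how such a property could wrap the old ADAPTER_FOR_FORMAT lookup; the enum and adapter below are illustrative, not the real file_adapters definitions:

import enum

class _TfRecordAdapterSketch:
  FILE_SUFFIX = "tfrecord"

class FileFormatSketch(enum.Enum):
  TFRECORD = "tfrecord"

  @property
  def file_suffix(self) -> str:
    # Delegate to the per-format adapter, as the removed lines did inline.
    return ADAPTER_FOR_FORMAT_SKETCH[self].FILE_SUFFIX

ADAPTER_FOR_FORMAT_SKETCH = {FileFormatSketch.TFRECORD: _TfRecordAdapterSketch}

print(FileFormatSketch.TFRECORD.file_suffix)  # -> "tfrecord"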

tensorflow_datasets/core/dataset_builder_test.py

Lines changed: 14 additions & 0 deletions
@@ -634,6 +634,20 @@ def test_get_reference(self):
     )
     self.assertEqual(reference, expected_reference)
 
+  def test_get_file_spec(self):
+    builder = DummyDatasetWithConfigs(
+        config="plus1", data_dir=self.get_temp_dir()
+    )
+    builder.download_and_prepare()
+    self.assertEndsWith(
+        builder.get_file_spec("train"),
+        "dummy_dataset_with_configs/plus1/0.0.1/dummy_dataset_with_configs-train.tfrecord@1",
+    )
+    self.assertEndsWith(
+        builder.get_file_spec("test"),
+        "dummy_dataset_with_configs/plus1/0.0.1/dummy_dataset_with_configs-test.tfrecord@1",
+    )
+
   def test_load_as_data_source(self):
     data_dir = self.get_temp_dir()
     builder = DummyDatasetWithConfigs(

tensorflow_datasets/core/splits.py

Lines changed: 3 additions & 7 deletions
@@ -205,18 +205,14 @@ def filenames(self) -> list[str]:
     """Returns the list of filenames."""
     if not self.filename_template:
       raise ValueError('No filename templates available.')
-    return sorted(
-        self.filename_template.sharded_filenames(len(self.shard_lengths))
-    )
+    return sorted(self.filename_template.sharded_filenames(self.num_shards))
 
   @property
   def filepaths(self) -> list[epath.Path]:
     """All the paths for all the files that are part of this split."""
     if not self.filename_template:
       raise ValueError('No filename templates available.')
-    return sorted(
-        self.filename_template.sharded_filepaths(len(self.shard_lengths))
-    )
+    return sorted(self.filename_template.sharded_filepaths(self.num_shards))
 
   def replace(self, **kwargs: Any) -> SplitInfo:
     """Returns a copy of the `SplitInfo` with updated attributes."""
@@ -421,7 +417,7 @@ def __init__(
     )
     self._dataset_name = dataset_name  # deprecated, please don't use
 
-  def __getitem__(self, key):
+  def __getitem__(self, key) -> SplitInfo | SubSplitInfo:
     if not self:
       raise KeyError(
           f'Trying to access `splits[{key!r}]` but `splits` is empty. '
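
The added return annotation reflects that indexing with a plain split name yields a SplitInfo, while the sub-split slicing syntax yields a SubSplitInfo. A usage sketch; "mnist" is only an example dataset:

import tensorflow_datasets as tfds

builder = tfds.builder("mnist")
builder.download_and_prepare()

train_info = builder.info.splits["train"]       # SplitInfo
half_info = builder.info.splits["train[:50%]"]  # SubSplitInfo

print(train_info.num_examples, half_info.num_examples)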
