
Commit 762a908

marcenacp authored and The TensorFlow Datasets Authors committed

Remove dead code.

PiperOrigin-RevId: 689486450

1 parent b59103b · commit 762a908

File tree: 4 files changed, +0 −186 lines

tensorflow_datasets/core/dataset_info.py

Lines changed: 0 additions & 44 deletions
@@ -47,7 +47,6 @@
 from etils import epy
 from tensorflow_datasets.core import constants
 from tensorflow_datasets.core import file_adapters
-from tensorflow_datasets.core import lazy_imports_lib
 from tensorflow_datasets.core import naming
 from tensorflow_datasets.core import splits as splits_lib
 from tensorflow_datasets.core import utils
@@ -1079,49 +1078,6 @@ def _populate_shape(shape_or_dict, prefix, schema_features):
   prefix.pop()


-def get_dataset_feature_statistics(builder, split):
-  """Calculate statistics for the specified split."""
-  tfdv = lazy_imports_lib.lazy_imports.tensorflow_data_validation
-  # TODO(epot): Avoid hardcoding file format.
-  filetype_suffix = "tfrecord"
-  if filetype_suffix not in ["tfrecord", "csv"]:
-    raise ValueError(
-        "Cannot generate statistics for filetype {}".format(filetype_suffix)
-    )
-  filename_template = naming.ShardedFileTemplate(
-      data_dir=builder.data_dir,
-      dataset_name=builder.name,
-      split=split,
-      filetype_suffix=filetype_suffix,
-  )
-  filepattern = filename_template.sharded_filepaths_pattern()
-  # Avoid generating a large number of buckets in rank histogram
-  # (default is 1000).
-  stats_options = tfdv.StatsOptions(
-      num_top_values=10,
-      num_rank_histogram_buckets=10,
-      use_sketch_based_topk_uniques=False,
-  )
-  if filetype_suffix == "csv":
-    statistics = tfdv.generate_statistics_from_csv(
-        filepattern, stats_options=stats_options
-    )
-  else:
-    statistics = tfdv.generate_statistics_from_tfrecord(
-        filepattern, stats_options=stats_options
-    )
-  schema = tfdv.infer_schema(statistics)
-  schema_features = {feature.name: feature for feature in schema.feature}
-  # Override shape in the schema.
-  for feature_name, feature in builder.info.features.items():
-    _populate_shape(feature.shape, [feature_name], schema_features)
-
-  # Remove legacy field.
-  if getattr(schema, "generate_legacy_feature_spec", None) is not None:
-    schema.ClearField("generate_legacy_feature_spec")
-  return statistics.datasets[0], schema
-
-
 def get_dataset_info_json(
     dataset_info_proto: dataset_info_pb2.DatasetInfo,
 ) -> str:
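
Note: the deleted get_dataset_feature_statistics was a thin wrapper around TensorFlow Data Validation, so callers that still need split statistics can call TFDV directly. A minimal sketch of the equivalent direct calls, assuming tensorflow_data_validation is installed; the shard pattern below is illustrative, not a path the library guarantees:

import tensorflow_data_validation as tfdv

# Hypothetical shard pattern for an already-prepared split.
filepattern = "/data/foo/1.0.0/foo-train.tfrecord*"
# Keep rank histograms small (TFDV's default is 1000 buckets).
stats_options = tfdv.StatsOptions(
    num_top_values=10,
    num_rank_histogram_buckets=10,
    use_sketch_based_topk_uniques=False,
)
statistics = tfdv.generate_statistics_from_tfrecord(
    filepattern, stats_options=stats_options
)
schema = tfdv.infer_schema(statistics)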

tensorflow_datasets/core/dataset_utils.py

Lines changed: 0 additions & 12 deletions
@@ -200,15 +200,3 @@ def as_numpy(dataset: Tree[TensorflowElem]) -> Tree[NumpyElem]:
     return tree.map_structure(_elem_to_numpy_eager, dataset)
   else:
     return _nested_to_numpy_graph(dataset)
-
-
-def dataset_shape_is_fully_defined(ds):
-  output_shapes = tf.compat.v1.data.get_output_shapes(ds)
-  return all([ts.is_fully_defined() for ts in tf.nest.flatten(output_shapes)])
-
-
-def features_shape_is_fully_defined(features):
-  return all([
-      tf.TensorShape(info.shape).is_fully_defined()
-      for info in tf.nest.flatten(features.get_tensor_info())
-  ])
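
Note: the two deleted predicates were self-contained, so any external caller can inline the same logic. A minimal sketch, assuming ds is an ordinary tf.data.Dataset:

import tensorflow as tf

def dataset_shape_is_fully_defined(ds) -> bool:
  # True iff every component shape of the dataset is static.
  output_shapes = tf.compat.v1.data.get_output_shapes(ds)
  return all(ts.is_fully_defined() for ts in tf.nest.flatten(output_shapes))

ds = tf.data.Dataset.from_tensor_slices(tf.zeros([4, 28, 28]))
assert dataset_shape_is_fully_defined(ds)  # elements have static shape [28, 28]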

tensorflow_datasets/core/naming.py

Lines changed: 0 additions & 81 deletions
@@ -413,18 +413,6 @@ def snake_to_camelcase(name: str) -> str:
   return ''.join(n.capitalize() for n in name.split('_'))


-def filename_prefix_for_name(name: str) -> str:
-  if os.path.basename(name) != name:
-    raise ValueError('Should be a dataset name, not a path: %s' % name)
-  return camelcase_to_snakecase(name)
-
-
-def filename_prefix_for_split(name: str, split: str) -> str:
-  if os.path.basename(name) != name:
-    raise ValueError('Should be a dataset name, not a path: %s' % name)
-  return '%s-%s' % (filename_prefix_for_name(name), split)
-
-
 def _strip_encoding_suffix(path: str) -> str:
   """Strips the encoding suffix from the path."""
   if '%' not in path:
@@ -710,75 +698,6 @@ def replace(self, **kwargs: Any) -> 'ShardedFileTemplate':
     return dataclasses.replace(self, **kwargs)


-def filepattern_for_dataset_split(
-    *,
-    dataset_name: str,
-    split: str,
-    data_dir: str,
-    filetype_suffix: str | None = None,
-    num_shards: int | None = None,
-) -> str:
-  """Returns the file pattern for the given dataset.
-
-  TODO(tfds): remove this by start using ShardedFileTemplate
-
-  Args:
-    dataset_name: Name of the dataset
-    split: Name of the requested split
-    data_dir: The base folder that contains the dataset.
-    filetype_suffix: Optional suffix, e.g. tfrecord
-    num_shards: Optional argument. If specified, will return file@num_shards
-      notation, otherwise file*.
-  """
-  template = ShardedFileTemplate(
-      data_dir=epath.Path(data_dir),
-      dataset_name=dataset_name,
-      split=split,
-      filetype_suffix=filetype_suffix,
-  )
-  return os.fspath(template.sharded_filepaths_pattern(num_shards=num_shards))
-
-
-def filenames_for_dataset_split(
-    dataset_name: str,
-    split: str,
-    num_shards: int,
-    filetype_suffix: str,
-    data_dir: epath.PathLike | None = None,
-) -> list[str]:
-  """Returns the list of filenames for the given dataset and split."""
-  # TODO(tfds): remove this by start using ShardedFileTemplate
-  template = ShardedFileTemplate(
-      dataset_name=dataset_name,
-      split=split,
-      filetype_suffix=filetype_suffix,
-      data_dir=epath.Path(data_dir),
-  )
-  return [
-      os.fspath(fp) for fp in template.sharded_filenames(num_shards=num_shards)
-  ]
-
-
-def filepaths_for_dataset_split(
-    dataset_name: str,
-    split: str,
-    num_shards: int,
-    data_dir: str,
-    filetype_suffix: str,
-) -> list[str]:
-  """File paths of a given dataset split."""
-  # TODO(tfds): remove this by start using ShardedFileTemplate
-  template = ShardedFileTemplate(
-      dataset_name=dataset_name,
-      split=split,
-      filetype_suffix=filetype_suffix,
-      data_dir=epath.Path(data_dir),
-  )
-  return [
-      os.fspath(fp) for fp in template.sharded_filepaths(num_shards=num_shards)
-  ]
-
-
 def _get_filename_template(
     filename: str, filename_template: ShardedFileTemplate | None
 ) -> ShardedFileTemplate:
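
Note: all three deleted wrappers carried a TODO to migrate to ShardedFileTemplate, which stays in naming.py. A minimal sketch of the equivalent template calls; the dataset name, split, and directory are illustrative:

import os
from etils import epath
from tensorflow_datasets.core import naming

template = naming.ShardedFileTemplate(
    data_dir=epath.Path('/tmp/bar'),
    dataset_name='foo',
    split='train',
    filetype_suffix='tfrecord',
)
# Was filepattern_for_dataset_split(...): a glob pattern over all shards.
pattern = os.fspath(template.sharded_filepaths_pattern())
# Was filenames_for_dataset_split(...): bare shard filenames.
names = [os.fspath(f) for f in template.sharded_filenames(num_shards=2)]
# Was filepaths_for_dataset_split(...): full shard paths under data_dir.
paths = [os.fspath(f) for f in template.sharded_filepaths(num_shards=2)]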

tensorflow_datasets/core/naming_test.py

Lines changed: 0 additions & 49 deletions
@@ -21,7 +21,6 @@
 import pytest
 from tensorflow_datasets import testing
 from tensorflow_datasets.core import naming
-from tensorflow_datasets.core import splits

 _FILENAME_TEMPLATE_DEFAULT = naming.ShardedFileTemplate(data_dir='.')
 _FILENAME_TEMPLATE_MNIST_DEFAULT = naming.ShardedFileTemplate(
@@ -108,54 +107,6 @@ def test_encryption_suffix(self):
         [path_template % s + encryption_suffix for s in shards],
     )

-  @parameterized.parameters(
-      ('foo', 'foo-train'),
-      ('Foo', 'foo-train'),
-      ('FooBar', 'foo_bar-train'),
-  )
-  def test_filename_prefix_for_split(self, prefix, expected):
-    split = splits.Split.TRAIN
-    self.assertEqual(expected, naming.filename_prefix_for_split(prefix, split))
-
-  def test_filenames_for_dataset_split(self):
-    actual = naming.filenames_for_dataset_split(
-        dataset_name='foo',
-        split=splits.Split.TRAIN,
-        num_shards=2,
-        filetype_suffix='bar',
-        data_dir='/path',
-    )
-    self.assertEqual(
-        actual, ['foo-train.bar-00000-of-00002', 'foo-train.bar-00001-of-00002']
-    )
-
-  def test_filepaths_for_dataset_split(self):
-    actual = naming.filepaths_for_dataset_split(
-        dataset_name='foo',
-        split=splits.Split.TRAIN,
-        num_shards=2,
-        data_dir='/tmp/bar/',
-        filetype_suffix='bar',
-    )
-    self.assertEqual(
-        actual,
-        [
-            '/tmp/bar/foo-train.bar-00000-of-00002',
-            '/tmp/bar/foo-train.bar-00001-of-00002',
-        ],
-    )
-
-  def test_filepattern_for_dataset_split(self):
-    self.assertEqual(
-        '/tmp/bar/foo-test.bar*',
-        naming.filepattern_for_dataset_split(
-            dataset_name='foo',
-            split=splits.Split.TEST,
-            data_dir='/tmp/bar/',
-            filetype_suffix='bar',
-        ),
-    )


 @pytest.mark.parametrize(
     ('tfds_name', 'expected'),
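
Note: the deleted tests covered only the removed wrappers; the same filename expectations can be asserted against ShardedFileTemplate directly. A sketch reusing the expected values from the deleted test_filenames_for_dataset_split, assuming the template defaults match what the wrappers passed:

import os
from etils import epath
from tensorflow_datasets.core import naming

def test_sharded_filenames_for_split():
  template = naming.ShardedFileTemplate(
      data_dir=epath.Path('/path'),
      dataset_name='foo',
      split='train',
      filetype_suffix='bar',
  )
  assert [os.fspath(f) for f in template.sharded_filenames(num_shards=2)] == [
      'foo-train.bar-00000-of-00002',
      'foo-train.bar-00001-of-00002',
  ]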

0 commit comments