Skip to content

Commit a1f762a

Browse files
fineguy authored and The TensorFlow Datasets Authors committed
Add publish option to build_croissant command.
PiperOrigin-RevId: 631379406
1 parent cdfab7f commit a1f762a

File tree

7 files changed

+519
-321
lines changed

7 files changed

+519
-321
lines changed

tensorflow_datasets/core/utils/file_utils.py

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -417,3 +417,22 @@ def expand_glob(path: epath.PathLike) -> Sequence[epath.Path]:
417417
)
418418
return [path]
419419
return list(epath.Path('/').glob(path_str[1:]))
420+
421+
422+
def publish_data(
    from_data_dir: epath.Path,
    to_data_dir: epath.Path,
    overwrite: bool = False,
) -> None:
  """Publishes the data from the given `from_data_dir` to `to_data_dir`.

  `to_data_dir` is created (together with any missing parents) if it does not
  exist yet. Every entry returned by `from_data_dir.iterdir()` is then copied
  into `to_data_dir` under the same name.

  Args:
    from_data_dir: the folder whose data needs to be published.
    to_data_dir: folder where the data should be published. Should include
      config and version.
    overwrite: whether to overwrite existing data in `to_data_dir` if it
      exists. The flag is forwarded unchanged to each per-entry copy.
  """
  # `exist_ok=True` makes publishing into an already existing folder valid;
  # whether existing files may be replaced is decided by `overwrite` below.
  to_data_dir.mkdir(parents=True, exist_ok=True)
  for filepath in from_data_dir.iterdir():
    filepath.copy(dst=to_data_dir / filepath.name, overwrite=overwrite)

tensorflow_datasets/core/utils/file_utils_test.py

Lines changed: 141 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -130,5 +130,146 @@ def test_list_dataset_variants_without_configs(mock_fs: testing.MockFs):
130130
]
131131

132132

133+
def test_list_datasets_in_data_dir(mock_fs: testing.MockFs):
  """Checks which dataset references are listed for a mocked data dir.

  The mocked tree contains two configs of `ds1` (one of them with two
  versions), a config-less `ds2`, and three malformed entries that are
  expected to be absent from the result.
  """
  data_dir = epath.Path('/a')
  mock_fs.add_file(data_dir / 'ds1/config1/1.0.0/dataset_info.json')
  mock_fs.add_file(data_dir / 'ds1/config1/1.0.0/features.json')
  mock_fs.add_file(data_dir / 'ds1/config1/2.0.0/dataset_info.json')
  mock_fs.add_file(data_dir / 'ds1/config1/2.0.0/features.json')
  mock_fs.add_file(data_dir / 'ds1/config2/1.0.0/dataset_info.json')
  mock_fs.add_file(data_dir / 'ds1/config2/1.0.0/features.json')
  mock_fs.add_file(data_dir / 'ds2/1.0.0/dataset_info.json')
  mock_fs.add_file(data_dir / 'ds2/1.0.0/features.json')

  # The following are problematic and should thus be ignored.
  mock_fs.add_file(
      os.path.join(data_dir, 'invalid-name/1.0.0/features.json'), content='x'
  )
  mock_fs.add_file(
      os.path.join(data_dir, 'invalid_version1/1.a.b/features.json'),
      content='x',
  )
  mock_fs.add_file(
      os.path.join(data_dir, 'invalid_version2/1.2.3.4/features.json'),
      content='x',
  )

  # `data_dir` is already an `epath.Path`, so it is passed through directly.
  references = sorted(file_utils.list_datasets_in_data_dir(data_dir=data_dir))
  assert references == [
      naming.DatasetReference(
          dataset_name='ds1',
          config='config1',
          version='1.0.0',
          data_dir=data_dir,
      ),
      naming.DatasetReference(
          dataset_name='ds1',
          config='config1',
          version='2.0.0',
          data_dir=data_dir,
      ),
      naming.DatasetReference(
          dataset_name='ds1',
          config='config2',
          version='1.0.0',
          data_dir=data_dir,
      ),
      naming.DatasetReference(
          dataset_name='ds2', version='1.0.0', data_dir=data_dir
      ),
  ]
184+
185+
186+
def test_list_datasets_in_data_dir_with_namespace(mock_fs: testing.MockFs):
  """Checks that the given namespace is set on the listed references."""
  namespace = 'ns'
  data_dir = epath.Path('/a')
  mock_fs.add_file(data_dir / 'ds1/config1/1.0.0/dataset_info.json')
  mock_fs.add_file(data_dir / 'ds1/config1/1.0.0/features.json')

  # `data_dir` is already an `epath.Path`, so it is passed through directly.
  references = sorted(
      file_utils.list_datasets_in_data_dir(
          data_dir=data_dir,
          namespace=namespace,
          include_configs=True,
          include_versions=True,
      )
  )
  assert references == [
      naming.DatasetReference(
          dataset_name='ds1',
          namespace=namespace,
          config='config1',
          version='1.0.0',
          data_dir=data_dir,
      ),
  ]
210+
211+
212+
def test_find_files_without_glob(mock_fs: testing.MockFs):
  """Checks `_find_files_without_glob` against a small mocked tree.

  Only files named `x` are requested, with the globs `*/*` and `*/*/*`; the
  entries marked below are expected to be left out of the result.
  """
  root = epath.Path('/')
  tree = (
      ('a', 'b', 'x'),
      ('a', 'c', 'x'),
      ('b', 'd', 'x'),
      ('b', 'd', 'y'),  # Should be ignored.
      ('b', '.config', 'x'),  # Should be ignored.
      ('b', 'x'),
      ('b', 'y'),  # Should be ignored.
  )
  for parts in tree:
    mock_fs.add_file(root.joinpath(*parts))

  found = file_utils._find_files_without_glob(
      root, globs=['*/*', '*/*/*'], file_names=['x']
  )
  found_paths = sorted(os.fspath(p) for p in found)
  assert found_paths == ['/a/b/x', '/a/c/x', '/b/d/x', '/b/x']
226+
227+
228+
@pytest.mark.parametrize(
    'filename, result',
    [
        ('abc', False),
        ('dataset_info.json', True),
        ('features.json', True),
        ('mnist-test.tfrecord-00000-of-00001', True),
        ('mnist-test.arrayrecord-00000-of-00001', True),
    ],
)
def test_looks_like_a_tfds_file(filename, result):
  """Checks `_looks_like_a_tfds_file` on metadata, shard and other names."""
  looks_like_tfds = file_utils._looks_like_a_tfds_file(filename)
  assert looks_like_tfds == result
240+
241+
242+
@pytest.mark.parametrize(
    'path, glob_result, expected',
    [
        ('/a/*', ['/a/b', '/a/c'], ['/a/b', '/a/c']),
        ('/a/b', None, ['/a/b']),
        ('a/*', None, ['a/*']),
        ('/a/b@*', None, ['/a/b@*']),
    ],
)
def test_expand_glob(path, glob_result, expected):
  """Checks `expand_glob` with `epath.Path` replaced by a mock.

  A `glob_result` of `None` marks the cases in which `glob` must not be called
  at all and the input path is expected back unchanged.
  """
  with mock.patch.object(epath, 'Path') as mock_epath:
    # Every `epath.Path(...)` call yields this same mocked instance.
    fake_path = mock_epath.return_value
    fake_path.expanduser.return_value = path
    fake_path.glob.return_value = glob_result

    expanded = file_utils.expand_glob(path)

    if glob_result is None:
      fake_path.glob.assert_not_called()
    else:
      # The glob is expected to receive the path without its leading character.
      fake_path.glob.assert_called_once_with(path[1:])
    expanded = [os.fspath(entry) for entry in expanded]
    assert expanded == expected
262+
263+
264+
def test_publish_data(mock_fs: testing.MockFs):
  """Checks that `publish_data` copies a file into the target folder."""
  source_dir = epath.Path('/tmp') / 'dummy_mnist/3.0.1'
  target_dir = epath.Path('/a/b')
  mock_fs.add_file(path=source_dir / 'dataset_info.json', content='a')

  file_utils.publish_data(from_data_dir=source_dir, to_data_dir=target_dir)

  published = mock_fs.read_file(target_dir / 'dataset_info.json')
  assert published == 'a'
272+
273+
133274
# Run the tests through `testing.test_main()` when executed as a script.
if __name__ == '__main__':
  testing.test_main()

0 commit comments

Comments
 (0)