
Commit d041311

fix save_infos (#7639)
1 parent: 161f99d

3 files changed: +8 -41 lines


src/datasets/builder.py

Lines changed: 0 additions & 9 deletions
```diff
@@ -1050,15 +1050,6 @@ def _save_info(self):
         with file_lock:
             self.info.write_to_directory(self._output_dir, storage_options=self._fs.storage_options)
 
-    def _save_infos(self):
-        file_lock = (
-            FileLock(self._output_dir + "_infos.lock")
-            if not is_remote_filesystem(self._fs)
-            else contextlib.nullcontext()
-        )
-        with file_lock:
-            DatasetInfosDict(**{self.config.name: self.info}).write_to_directory(self.get_imported_module_dir())
-
     def _make_split_generators_kwargs(self, prepare_split_kwargs):
         """Get kwargs for `self._split_generators()` from `prepare_split_kwargs`."""
         del prepare_split_kwargs
```

src/datasets/commands/test.py

Lines changed: 8 additions & 28 deletions
```diff
@@ -2,13 +2,13 @@
 import os
 from argparse import ArgumentParser
 from collections.abc import Generator
-from pathlib import Path
-from shutil import copyfile, rmtree
+from shutil import rmtree
 
 import datasets.config
 from datasets.builder import DatasetBuilder
 from datasets.commands import BaseDatasetsCLICommand
 from datasets.download.download_manager import DownloadMode
+from datasets.info import DatasetInfosDict
 from datasets.load import dataset_module_factory, get_dataset_builder_class
 from datasets.utils.info_utils import VerificationMode
 from datasets.utils.logging import ERROR, get_logger
@@ -157,35 +157,15 @@ def get_builders() -> Generator[DatasetBuilder, None, None]:
                 num_proc=self._num_proc,
             )
             builder.as_dataset()
-            if self._save_infos:
-                builder._save_infos()
 
-            # If save_infos=True, the dataset card (README.md) is created next to the loaded module file.
+            # If save_infos=True, we create the dataset card (README.md)
             # The dataset_infos are saved in the YAML part of the README.md
-
-            # Let's move it to the original directory of the dataset, to allow the user to
-            # upload them on HF at the same time afterwards.
+            # This is to allow the user to upload them on HF afterwards.
             if self._save_infos:
-                dataset_readme_path = os.path.join(
-                    builder_cls.get_imported_module_dir(), datasets.config.REPOCARD_FILENAME
-                )
-                name = Path(path).name + ".py"
-                combined_path = os.path.join(path, name)
-                if os.path.isfile(path):
-                    dataset_dir = os.path.dirname(path)
-                elif os.path.isfile(combined_path):
-                    dataset_dir = path
-                elif os.path.isdir(path):  # for local directories containing only data files
-                    dataset_dir = path
-                else:  # in case of a remote dataset
-                    dataset_dir = None
-                print(f"Dataset card saved at {dataset_readme_path}")
-
-                # Move dataset_info back to the user
-                if dataset_dir is not None:
-                    user_dataset_readme_path = os.path.join(dataset_dir, datasets.config.REPOCARD_FILENAME)
-                    copyfile(dataset_readme_path, user_dataset_readme_path)
-                    print(f"Dataset card saved at {user_dataset_readme_path}")
+                save_infos_dir = os.path.basename(path) if not os.path.isdir(path) else path
+                os.makedirs(save_infos_dir, exist_ok=True)
+                DatasetInfosDict(**{builder.config.name: builder.info}).write_to_directory(save_infos_dir)
+                print(f"Dataset card saved at {os.path.join(save_infos_dir, datasets.config.REPOCARD_FILENAME)}")
 
             # If clear_cache=True, the download folder and the dataset builder cache directory are deleted
             if self._clear_cache:
```
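
For reference, the new save path can be reproduced outside the CLI. The sketch below is not part of the commit; the dataset name and output directory are hypothetical, and it simply mirrors the `DatasetInfosDict(...).write_to_directory(...)` call added above.

```python
import os

from datasets import load_dataset_builder
from datasets.info import DatasetInfosDict

# Hypothetical dataset name and output directory, used only for illustration.
builder = load_dataset_builder("squad")
builder.download_and_prepare()

save_infos_dir = "./squad_card"
os.makedirs(save_infos_dir, exist_ok=True)

# Same call as the new code in commands/test.py: the dataset infos are written
# into the YAML header of README.md in the target directory.
DatasetInfosDict(**{builder.config.name: builder.info}).write_to_directory(save_infos_dir)

print(f"Dataset card saved at {os.path.join(save_infos_dir, 'README.md')}")
```

When the `test` command is run with info saving enabled, this same write is what produces the README.md card next to the dataset.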

src/datasets/load.py

Lines changed: 0 additions & 4 deletions
```diff
@@ -1283,8 +1283,6 @@ def load_dataset(
             Whether to copy the dataset in-memory. If `None`, the dataset
             will not be copied in-memory unless explicitly enabled by setting `datasets.config.IN_MEMORY_MAX_SIZE` to
             nonzero. See more details in the [improve performance](../cache#improve-performance) section.
-        save_infos (`bool`, defaults to `False`):
-            Save the dataset information (checksums/size/splits/...).
         revision ([`Version`] or `str`, *optional*):
             Version of the dataset to load.
             As datasets have their own git repository on the Datasets Hub, the default version "main" corresponds to their "main" branch.
@@ -1428,8 +1426,6 @@ def load_dataset(
         keep_in_memory if keep_in_memory is not None else is_small_dataset(builder_instance.info.dataset_size)
     )
     ds = builder_instance.as_dataset(split=split, verification_mode=verification_mode, in_memory=keep_in_memory)
-    if save_infos:
-        builder_instance._save_infos()
 
     return ds
 
```