Skip to content

Commit 02ee330

Browse files
authored
Fewer API calls when resolving data_files (#7805)
Fewer API calls when resolving data_files
1 parent cfcdfce commit 02ee330

File tree

1 file changed

+12
-0
lines changed

1 file changed

+12
-0
lines changed

src/datasets/data_files.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -503,6 +503,18 @@ def _get_origin_metadata(
503503
max_workers: Optional[int] = None,
504504
) -> list[SingleOriginMetadata]:
505505
max_workers = max_workers if max_workers is not None else config.HF_DATASETS_MULTITHREADING_MAX_WORKERS
506+
if all("hf://" in data_file for data_file in data_files):
507+
# No need for multithreading here since the origin metadata of HF files
508+
# is (repo_id, revision) and is cached after first .info() call.
509+
return [
510+
_get_single_origin_metadata(data_file, download_config=download_config)
511+
for data_file in hf_tqdm(
512+
data_files,
513+
desc="Resolving data files",
514+
# set `disable=None` rather than `disable=False` by default to disable progress bar when no TTY attached
515+
disable=len(data_files) <= 16 or None,
516+
)
517+
]
506518
return thread_map(
507519
partial(_get_single_origin_metadata, download_config=download_config),
508520
data_files,

0 commit comments

Comments
 (0)