diff --git a/docs/source/repository_structure.mdx b/docs/source/repository_structure.mdx
index c527a3ca730..1b5600558d9 100644
--- a/docs/source/repository_structure.mdx
+++ b/docs/source/repository_structure.mdx
@@ -277,3 +277,27 @@ my_dataset_repository/
 ├── shard_0.csv
 └── shard_1.csv
 ```
+
+#### Automatic Subset Grouping
+
+When using folder-based datasets, `datasets` automatically groups files into the same subset when their names differ only by trailing digits or by standard sharding patterns (e.g. `-00000-of-00003`).
+
+For example:
+
+```bash
+train0.jsonl
+train1.jsonl
+train2.jsonl
+animals.jsonl
+metadata.jsonl
+```
+
+will be grouped into:
+
+* `"train"` subset → `train0.jsonl`, `train1.jsonl`, `train2.jsonl`
+* `"animals"` subset → `animals.jsonl`
+* `"metadata"` subset → `metadata.jsonl`
+
+This lets you provide multiple logical subsets per split without needing a nested folder structure. It is especially useful for datasets that are sharded or organized by topic.
+
+This grouping is enabled by default in all builders that inherit from `FolderBasedBuilder`.
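+
+A quick way to preview how a set of file names will be grouped is to call the helper that the folder-based builders use under the hood:
+
+```python
+from datasets.data_files import group_files_by_subset
+
+group_files_by_subset(["train0.jsonl", "train1.jsonl", "train2.jsonl", "animals.jsonl", "metadata.jsonl"])
+# {'train': ['train0.jsonl', 'train1.jsonl', 'train2.jsonl'],
+#  'animals': ['animals.jsonl'],
+#  'metadata': ['metadata.jsonl']}
+```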
diff --git a/src/datasets/data_files.py b/src/datasets/data_files.py
index 087e037a186..be6b69700e1 100644
--- a/src/datasets/data_files.py
+++ b/src/datasets/data_files.py
@@ -19,7 +19,30 @@
 from .utils import tqdm as hf_tqdm
 from .utils.file_utils import _prepare_path_and_storage_options, is_local_path, is_relative_path, xbasename, xjoin
 from .utils.py_utils import string_to_dict
+import re
+from collections import defaultdict
+from pathlib import Path
 
 
+def group_files_by_subset(filepaths):
+    """
+    Group files by subset according to a heuristic:
+
+    - Files whose names differ only by trailing digits or shard suffixes are grouped together.
+    - All other files are placed in their own group.
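+
+    Example:
+
+        >>> group_files_by_subset(["data-00000-of-00002.jsonl", "data-00001-of-00002.jsonl", "extra.jsonl"])
+        {'data': ['data-00000-of-00002.jsonl', 'data-00001-of-00002.jsonl'], 'extra': ['extra.jsonl']}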
"metadata.jsonl", + "train0.jsonl", + "train1.jsonl", + "train2.jsonl", + ] + groups = group_files_by_subset(files) + assert "train" in groups + assert set(groups["train"]) == {"train0.jsonl", "train1.jsonl", "train2.jsonl"} + assert "animals" in groups + assert "trees" in groups + assert "metadata" in groups + + def mock_fs(file_paths: List[str]): """ Set up a mock filesystem for fsspec containing the provided files