huggingface
diff --git a/‎src/datasets/builder.py‎
Lines changed: 40 additions & 0 deletions b/‎src/datasets/builder.py‎
Lines changed: 40 additions & 0 deletions
diff --git a/‎src/datasets/packaged_modules/arrow/arrow.py‎
Lines changed: 6 additions & 9 deletions b/‎src/datasets/packaged_modules/arrow/arrow.py‎
Lines changed: 6 additions & 9 deletions
diff --git a/‎src/datasets/packaged_modules/cache/cache.py‎
Lines changed: 3 additions & 0 deletions b/‎src/datasets/packaged_modules/cache/cache.py‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎src/datasets/packaged_modules/csv/csv.py‎
Lines changed: 27 additions & 20 deletions b/‎src/datasets/packaged_modules/csv/csv.py‎
Lines changed: 27 additions & 20 deletions
diff --git a/‎src/datasets/packaged_modules/eval/eval.py‎
Lines changed: 24 additions & 10 deletions b/‎src/datasets/packaged_modules/eval/eval.py‎
Lines changed: 24 additions & 10 deletions
diff --git a/‎src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py‎
Lines changed: 28 additions & 12 deletions b/‎src/datasets/packaged_modules/folder_based_builder/folder_based_builder.py‎
Lines changed: 28 additions & 12 deletions
diff --git a/‎src/datasets/packaged_modules/hdf5/hdf5.py‎
Lines changed: 6 additions & 9 deletions b/‎src/datasets/packaged_modules/hdf5/hdf5.py‎
Lines changed: 6 additions & 9 deletions
@@ -1335,6 +1335,26 @@ class GeneratorBasedBuilder(DatasetBuilder):
     (`_split_generators`). See the method docstrings for details.
     """
 
+    def _generate_shards(self, **kwargs) -> Iterator[Union[str, dict[str, Any]]]:
+        """Default function generating the shard paths for each `SplitGenerator`.
+
+        This function lists the original shards the data comes from, before it is
+        either converted to Arrow or streamed to an IterableDataset.
+
+        This is optional and only used by certain utilities, not by Dataset nor
+        IterableDataset: e.g. to map original shard files to Parquet files in the
+        Dataset Viewer after conversion. The default raises `NotImplementedError`.
+
+        Args:
+            **kwargs (additional keyword arguments):
+                Arguments forwarded from the `SplitGenerator.gen_kwargs`.
+
+        Yields:
+            shard: generally a string representing the shard path, or a dict
+                describing the shard when it spans part of a file or several files.
+        """
+        raise NotImplementedError()
+
     @abc.abstractmethod
     def _generate_examples(self, **kwargs) -> Iterator[tuple[Key, dict[str, Any]]]:
         """Default function generating examples for each `SplitGenerator`.
@@ -1624,6 +1644,26 @@ def _get_examples_iterable_for_split(self, split_generator: SplitGenerator) -> E
 class ArrowBasedBuilder(DatasetBuilder):
     """Base class for datasets with data generation based on Arrow loading functions (CSV/JSON/Parquet)."""
 
+    def _generate_shards(self, **kwargs) -> Iterator[Union[str, dict[str, Any]]]:
+        """Default function generating the shard paths for each `SplitGenerator`.
+
+        This function lists the original shards the data comes from, before it is
+        either converted to Arrow or streamed to an IterableDataset.
+
+        This is optional and only used by certain utilities, not by Dataset nor
+        IterableDataset: e.g. to map original shard files to Parquet files in the
+        Dataset Viewer after conversion. The default raises `NotImplementedError`.
+
+        Args:
+            **kwargs (additional keyword arguments):
+                Arguments forwarded from the `SplitGenerator.gen_kwargs`.
+
+        Yields:
+            shard: generally a string representing the shard path, or a dict
+                describing the shard when it spans part of a file or several files.
+        """
+        raise NotImplementedError()
+
     @abc.abstractmethod
     def _generate_tables(self, **kwargs) -> Iterator[tuple[Key, pa.Table]]:
         """Default function generating examples for each `SplitGenerator`.
 
@@ -1,4 +1,3 @@
-import itertools
 from dataclasses import dataclass
 from typing import Optional
 
@@ -32,17 +31,12 @@ def _split_generators(self, dl_manager):
         """We handle string, list and dicts in datafiles"""
         if not self.config.data_files:
             raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}")
-        dl_manager.download_config.extract_on_the_fly = True
-        data_files = dl_manager.download_and_extract(self.config.data_files)
+        data_files = dl_manager.download(self.config.data_files)
         splits = []
         for split_name, files in data_files.items():
-            if isinstance(files, str):
-                files = [files]
-            # Use `dl_manager.iter_files` to skip hidden files in an extracted archive
-            files = [dl_manager.iter_files(file) for file in files]
             # Infer features if they are stored in the arrow schema
             if self.info.features is None:
-                for file in itertools.chain.from_iterable(files):
+                for file in files:
                     with open(file, "rb") as f:
                         try:
                             reader = pa.ipc.open_stream(f)
@@ -60,8 +54,11 @@ def _cast_table(self, pa_table: pa.Table) -> pa.Table:
             pa_table = table_cast(pa_table, self.info.features.arrow_schema)
         return pa_table
 
+    def _generate_shards(self, files):
+        yield from files  # one shard per data file, in the order `_generate_tables` reads them
+
     def _generate_tables(self, files):
-        for file_idx, file in enumerate(itertools.chain.from_iterable(files)):
+        for file_idx, file in enumerate(files):
             with open(file, "rb") as f:
                 try:
                     try:
 
@@ -176,6 +176,9 @@ def _split_generators(self, dl_manager):
             for split_info in split_infos
         ]
 
+    def _generate_shards(self, files):
+        yield from files  # one shard per file, in the order `_generate_tables` streams them
+
     def _generate_tables(self, files):
         # used to stream from cache
         for file_idx, file in enumerate(files):
 
@@ -1,4 +1,3 @@
-import itertools
 from dataclasses import dataclass
 from typing import Any, Callable, Optional, Union
 
@@ -154,13 +153,17 @@ def _split_generators(self, dl_manager):
         if not self.config.data_files:
             raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}")
         dl_manager.download_config.extract_on_the_fly = True
-        data_files = dl_manager.download_and_extract(self.config.data_files)
+        base_data_files = dl_manager.download(self.config.data_files)
+        extracted_data_files = dl_manager.extract(base_data_files)
         splits = []
-        for split_name, files in data_files.items():
-            if isinstance(files, str):
-                files = [files]
-            files = [dl_manager.iter_files(file) for file in files]
-            splits.append(datasets.SplitGenerator(name=split_name, gen_kwargs={"files": files}))
+        for split_name, extracted_files in extracted_data_files.items():
+            files_iterables = [dl_manager.iter_files(extracted_file) for extracted_file in extracted_files]
+            splits.append(
+                datasets.SplitGenerator(
+                    name=split_name,
+                    gen_kwargs={"files_iterables": files_iterables, "base_files": base_data_files[split_name]},
+                )
+            )
         return splits
 
     def _cast_table(self, pa_table: pa.Table) -> pa.Table:
@@ -174,7 +177,10 @@ def _cast_table(self, pa_table: pa.Table) -> pa.Table:
                 pa_table = table_cast(pa_table, schema)
         return pa_table
 
-    def _generate_tables(self, files):
+    def _generate_shards(self, base_files, files_iterables):
+        yield from base_files  # shards are the downloaded files, not the files extracted from them
+
+    def _generate_tables(self, base_files, files_iterables):
         schema = self.config.features.arrow_schema if self.config.features else None
         # dtype allows reading an int column as str
         dtype = (
@@ -185,15 +191,16 @@ def _generate_tables(self, files):
             if schema is not None
             else None
         )
-        for file_idx, file in enumerate(itertools.chain.from_iterable(files)):
-            csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.pd_read_csv_kwargs)
-            try:
-                for batch_idx, df in enumerate(csv_file_reader):
-                    pa_table = pa.Table.from_pandas(df)
-                    # Uncomment for debugging (will print the Arrow table size and elements)
-                    # logger.warning(f"pa_table: {pa_table} num rows: {pa_table.num_rows}")
-                    # logger.warning('\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows)))
-                    yield Key(file_idx, batch_idx), self._cast_table(pa_table)
-            except ValueError as e:
-                logger.error(f"Failed to read file '{file}' with error {type(e)}: {e}")
-                raise
+        for shard_idx, files_iterable in enumerate(files_iterables):
+            for file in files_iterable:
+                csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.pd_read_csv_kwargs)
+                try:
+                    for batch_idx, df in enumerate(csv_file_reader):
+                        pa_table = pa.Table.from_pandas(df)
+                        # Uncomment for debugging (will print the Arrow table size and elements)
+                        # logger.warning(f"pa_table: {pa_table} num rows: {pa_table.num_rows}")
+                        # logger.warning('\n'.join(str(pa_table.slice(i, 1).to_pydict()) for i in range(pa_table.num_rows)))
+                        yield Key(shard_idx, batch_idx), self._cast_table(pa_table)
+                except ValueError as e:
+                    logger.error(f"Failed to read file '{file}' with error {type(e)}: {e}")
+                    raise
@@ -1,6 +1,7 @@
 import json
 import os
 from itertools import islice
+from typing import Iterable
 
 import pyarrow as pa
 
@@ -22,16 +23,26 @@ def _split_generators(self, dl_manager):
         if not self.config.data_files:
             raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}")
         dl_manager.download_config.extract_on_the_fly = True
-        data_files = dl_manager.download_and_extract(self.config.data_files)
+        base_data_files = dl_manager.download(self.config.data_files)
+        extracted_data_files = dl_manager.extract(base_data_files)
         splits = []
-        for split_name, logs in data_files.items():
-            if isinstance(logs, str):
-                logs = [logs]
-            logs_files = [dl_manager.iter_files(log) for log in logs]
-            splits.append(datasets.SplitGenerator(name=split_name, gen_kwargs={"logs_files": logs_files}))
+        for split_name, logs in extracted_data_files.items():
+            logs_files_iterables = [dl_manager.iter_files(log) for log in logs]
+            splits.append(
+                datasets.SplitGenerator(
+                    name=split_name,
+                    gen_kwargs={
+                        "logs_files_iterables": logs_files_iterables,
+                        "base_files": base_data_files[split_name],
+                    },
+                )
+            )
         if not self.info.features:
             first_examples = list(
-                islice(self._iter_samples_from_log_files(logs_files[0]), self.NUM_EXAMPLES_FOR_FEATURES_INFERENCE)
+                islice(
+                    self._iter_samples_from_log_files(logs_files_iterables[0]),
+                    self.NUM_EXAMPLES_FOR_FEATURES_INFERENCE,
+                )
             )
             pa_tables = [pa.Table.from_pylist([example]) for example in first_examples]
             inferred_arrow_schema = pa.concat_tables(pa_tables, promote_options="default").schema
@@ -44,7 +55,7 @@ def _sort_samples_key(self, sample_path: str):
         (sample_idx_str, epoch_idx_str) = os.path.splitext(os.path.basename(sample_path))[0].split("_epoch_")
         return (int(epoch_idx_str), int(sample_idx_str))
 
-    def _iter_samples_from_log_files(self, log_files: list[str]):
+    def _iter_samples_from_log_files(self, log_files: Iterable[str]):
         sample_files = [log_file for log_file in log_files if os.path.basename(os.path.dirname(log_file)) == "samples"]
         sample_files.sort(key=self._sort_samples_key)
         for sample_file in sample_files:
@@ -57,7 +68,10 @@ def _iter_samples_from_log_files(self, log_files: list[str]):
                         sample[field] = [json.dumps(x) for x in sample[field]]
                 yield sample
 
-    def _generate_examples(self, logs_files):
-        for file_idx, log_files in enumerate(logs_files):
+    def _generate_shards(self, base_files, logs_files_iterables):
+        yield from base_files  # shards are the downloaded files, not the log files extracted from them
+
+    def _generate_examples(self, base_files, logs_files_iterables):
+        for file_idx, log_files in enumerate(logs_files_iterables):
             for sample_idx, sample in enumerate(self._iter_samples_from_log_files(log_files)):
                 yield Key(file_idx, sample_idx), sample
@@ -112,7 +112,7 @@ def analyze(files_or_archives, downloaded_files_or_dirs, split):
                                 labels.add(os.path.basename(os.path.dirname(downloaded_dir_file)))
                                 path_depths.add(count_path_segments(downloaded_dir_file))
                         elif os.path.basename(downloaded_dir_file) in metadata_filenames:
-                            metadata_files[split].add((None, downloaded_dir_file))
+                            metadata_files[split].add((None, downloaded_dir, downloaded_dir_file))
                         else:
                             archive_file_name = os.path.basename(archive)
                             original_file_name = os.path.basename(downloaded_dir_file)
@@ -123,8 +123,6 @@ def analyze(files_or_archives, downloaded_files_or_dirs, split):
         data_files = self.config.data_files
         splits = []
         for split_name, files in data_files.items():
-            if isinstance(files, str):
-                files = [files]
             files, archives = self._split_files_and_archives(files)
             downloaded_files = dl_manager.download(files)
             downloaded_dirs = dl_manager.download_and_extract(archives)
@@ -156,12 +154,17 @@ def analyze(files_or_archives, downloaded_files_or_dirs, split):
             else:
                 add_labels, add_metadata, metadata_files = False, False, {}
 
+            # files info (original_file, downloaded_file)
+            files = tuple(zip(files, downloaded_files))
+            # dirs info (original_file, downloaded_dir, downloaded_files)
+            files += tuple(
+                (None, downloaded_dir, dl_manager.iter_files(downloaded_dir)) for downloaded_dir in downloaded_dirs
+            )
             splits.append(
                 datasets.SplitGenerator(
                     name=split_name,
                     gen_kwargs={
-                        "files": tuple(zip(files, downloaded_files))
-                        + tuple((None, dl_manager.iter_files(downloaded_dir)) for downloaded_dir in downloaded_dirs),
+                        "files": files,
                         "metadata_files": metadata_files.get(split_name, []),
                         "add_labels": add_labels,
                         "add_metadata": add_metadata,
@@ -267,7 +270,7 @@ def _split_files_and_archives(self, data_files):
                 files.append(data_file)
             elif os.path.basename(data_file) in metadata_filenames:
                 files.append(data_file)
-            else:
+            elif data_file_ext.lower() == ".zip":
                 archives.append(data_file)
         return files, archives
 
@@ -354,6 +357,14 @@ def _read_metadata(self, metadata_file: str, metadata_ext: str = "") -> Iterator
                 ):
                     yield pa.Table.from_batches([record_batch])
 
+    def _generate_shards(self, files, metadata_files, add_metadata, add_labels):
+        """Yield each shard's downloaded path: the metadata files if `add_metadata`, else the data files."""
+        # Entries are (original_file, downloaded_file) pairs or, for extracted archives,
+        # (None, downloaded_dir, ...) triples: index 1 is the downloaded file or directory in
+        # both shapes, whereas unpacking every entry into two names fails on the triples.
+        for shard_info in metadata_files if add_metadata else files:
+            yield shard_info[1]
+
     def _generate_examples(self, files, metadata_files, add_metadata, add_labels):
         if add_metadata:
             feature_paths = []
@@ -365,7 +376,11 @@ def find_feature_path(feature, feature_path):
 
             _visit_with_path(self.info.features, find_feature_path)
 
-            for shard_idx, (original_metadata_file, downloaded_metadata_file) in enumerate(metadata_files):
+            for shard_idx, metadata_file_info in enumerate(metadata_files):
+                if len(metadata_file_info) == 2:
+                    original_metadata_file, downloaded_metadata_file = metadata_file_info
+                else:
+                    original_metadata_file, downloaded_metadata_dir, downloaded_metadata_file = metadata_file_info
                 metadata_ext = os.path.splitext(original_metadata_file or downloaded_metadata_file)[-1]
                 downloaded_metadata_dir = os.path.dirname(downloaded_metadata_file)
 
@@ -395,12 +410,13 @@ def set_feature(item, feature_path: _VisitPath):
                     if isinstance(self.config.filters, list)
                     else self.config.filters
                 )
-            for shard_idx, (original_file, downloaded_file_or_dir) in enumerate(files):
-                downloaded_files = [downloaded_file_or_dir] if original_file else downloaded_file_or_dir
+            for shard_idx, file_or_dir_info in enumerate(files):
+                if len(file_or_dir_info) == 2:
+                    original_file, downloaded_file = file_or_dir_info
+                    downloaded_files = [downloaded_file]
+                else:
+                    original_file, downloaded_dir, downloaded_files = file_or_dir_info
                 for sample_idx, downloaded_file in enumerate(downloaded_files):
-                    original_file_ext = os.path.splitext(original_file or downloaded_file)[-1]
-                    if original_file_ext.lower() not in self.EXTENSIONS:
-                        continue
                     sample = {self.BASE_COLUMN_NAME: downloaded_file}
                     if add_labels:
                         sample["label"] = os.path.basename(os.path.dirname(original_file or downloaded_file))
 
@@ -1,4 +1,3 @@
-import itertools
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Optional
 
@@ -51,29 +50,27 @@ def _split_generators(self, dl_manager):
 
         if not self.config.data_files:
             raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}")
-        dl_manager.download_config.extract_on_the_fly = True
-        data_files = dl_manager.download_and_extract(self.config.data_files)
+        data_files = dl_manager.download(self.config.data_files)
         splits = []
         for split_name, files in data_files.items():
-            if isinstance(files, str):
-                files = [files]
-
-            files = [dl_manager.iter_files(file) for file in files]
             # Infer features from first file
             if self.info.features is None:
-                for first_file in itertools.chain.from_iterable(files):
+                for first_file in files:
                     with open(first_file, "rb") as f:
                         with h5py.File(f, "r") as h5:
                             self.info.features = _recursive_infer_features(h5)
                     break
             splits.append(datasets.SplitGenerator(name=split_name, gen_kwargs={"files": files}))
         return splits
 
+    def _generate_shards(self, files):
+        yield from files  # one shard per downloaded file, in the order `_generate_tables` reads them
+
     def _generate_tables(self, files):
         import h5py
 
         batch_size_cfg = self.config.batch_size
-        for file_idx, file in enumerate(itertools.chain.from_iterable(files)):
+        for file_idx, file in enumerate(files):
             try:
                 with open(file, "rb") as f:
                     with h5py.File(f, "r") as h5:
Original file line number	Diff line number	Diff line change
`@@ -176,6 +176,9 @@ def _split_generators(self, dl_manager):`
`176`	`176`	`for split_info in split_infos`
`177`	`177`	`]`
`178`	`178`
	`179`	`+ def _generate_shards(self, files):`
	`180`	`+ yield from files`
	`181`	`+`
`179`	`182`	`def _generate_tables(self, files):`
`180`	`183`	`# used to stream from cache`
`181`	`184`	`for file_idx, file in enumerate(files):`