|
1 | 1 | import io |
2 | 2 | import itertools |
3 | 3 | from dataclasses import dataclass |
4 | | -from typing import Optional |
| 4 | +from typing import List, Optional |
5 | 5 |
|
6 | 6 | import pandas as pd |
7 | 7 | import pyarrow as pa |
@@ -70,12 +70,18 @@ def _info(self): |
70 | 70 | raise ValueError("The JSON loader parameter `newlines_in_values` is no longer supported") |
71 | 71 | return datasets.DatasetInfo(features=self.config.features) |
72 | 72 |
|
73 | | - def _split_generators(self, dl_manager): |
| 73 | + def _available_splits(self) -> Optional[List[str]]: |
# Returns the keys of `self.config.data_files`, each converted with `str()`, when
# `data_files` is a dict. For any other value (str, list, None, ...) the
# `isinstance(..., dict)` check fails and the method returns None.
# NOTE(review): presumably the keys are split names and None means "splits are not
# known up front" -- confirm against the caller of this hook.
| 74 | + return [str(split) for split in self.config.data_files] if isinstance(self.config.data_files, dict) else None |
| 75 | + |
| 76 | + def _split_generators(self, dl_manager, splits: Optional[List[str]] = None): |
74 | 77 | """We handle string, list and dicts in datafiles""" |
75 | 78 | if not self.config.data_files: |
76 | 79 | raise ValueError(f"At least one data file must be specified, but got data_files={self.config.data_files}") |
77 | 80 | dl_manager.download_config.extract_on_the_fly = True |
78 | | - data_files = dl_manager.download_and_extract(self.config.data_files) |
| 81 | + data_files = self.config.data_files |
| 82 | + if splits and isinstance(data_files, dict): |
| 83 | + data_files = {split: data_files[split] for split in splits} |
| 84 | + data_files = dl_manager.download_and_extract(data_files) |
79 | 85 | splits = [] |
80 | 86 | for split_name, files in data_files.items(): |
81 | 87 | if isinstance(files, str): |
|
0 commit comments