
Commit 06c3235

update load_dataset doctring (#7301)
* update load_dataset doctring
* style
* minor
* drop python 3.8
1 parent 17f17b3 commit 06c3235

10 files changed: +111 −99 lines


.github/workflows/ci.yml

Lines changed: 3 additions & 3 deletions
@@ -21,7 +21,7 @@ jobs:
       - name: Set up Python
         uses: actions/setup-python@v5
         with:
-          python-version: "3.8"
+          python-version: "3.9"
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
@@ -44,10 +44,10 @@ jobs:
       - uses: actions/checkout@v4
         with:
          fetch-depth: 0
-      - name: Set up Python 3.8
+      - name: Set up Python 3.9
        uses: actions/setup-python@v5
        with:
-          python-version: "3.8"
+          python-version: "3.9"
      - name: Upgrade pip
        run: python -m pip install --upgrade pip
      - name: Pin setuptools-scm

.github/workflows/release-conda.yml

Lines changed: 1 addition & 1 deletion
@@ -25,7 +25,7 @@ jobs:
           auto-update-conda: true
           auto-activate-base: false
           activate-environment: "build-datasets"
-          python-version: 3.8
+          python-version: 3.9
           channels: huggingface

       - name: Setup conda env

setup.py

Lines changed: 1 addition & 2 deletions
@@ -251,7 +251,7 @@
         "datasets.utils.resources": ["*.json", "*.yaml", "*.tsv"],
     },
     entry_points={"console_scripts": ["datasets-cli=datasets.commands.datasets_cli:main"]},
-    python_requires=">=3.8.0",
+    python_requires=">=3.9.0",
     install_requires=REQUIRED_PKGS,
     extras_require=EXTRAS_REQUIRE,
     classifiers=[
@@ -262,7 +262,6 @@
         "License :: OSI Approved :: Apache Software License",
         "Operating System :: OS Independent",
         "Programming Language :: Python :: 3",
-        "Programming Language :: Python :: 3.8",
         "Programming Language :: Python :: 3.9",
         "Programming Language :: Python :: 3.10",
         "Programming Language :: Python :: 3.11",

src/datasets/arrow_dataset.py

Lines changed: 1 addition & 1 deletion
@@ -372,7 +372,7 @@ def to_tf_dataset(
                 a small buffer of batches for training. Improves performance by allowing data to be loaded in the
                 background while the model is training.
             num_workers (`int`, defaults to `0`):
-                Number of workers to use for loading the dataset. Only supported on Python versions >= 3.8.
+                Number of workers to use for loading the dataset.
             num_test_batches (`int`, defaults to `20`):
                 Number of batches to use to infer the output signature of the dataset.
                 The higher this number, the more accurate the signature will be, but the longer it will take to
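For context, `num_workers` controls how many parallel processes feed the resulting `tf.data.Dataset`. A minimal sketch of how the updated parameter is typically used; the mapped feature and parameter values are illustrative, not from this commit:

from datasets import load_dataset

ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train")
ds = ds.map(lambda ex: {"length": len(ex["text"])})  # add a numeric feature for the sketch

tf_ds = ds.to_tf_dataset(
    columns=["length"],    # numeric model inputs (real pipelines usually pass tokenized features)
    label_cols=["label"],
    batch_size=16,
    shuffle=True,
    num_workers=2,         # parallel loader processes; the docstring above drops the old Python caveat
)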

src/datasets/load.py

Lines changed: 59 additions & 62 deletions
@@ -242,9 +242,11 @@ def __reduce__(self): # to make dynamically created class pickable, see _Initia
 def get_dataset_builder_class(
     dataset_module: "DatasetModule", dataset_name: Optional[str] = None
 ) -> Type[DatasetBuilder]:
-    with lock_importable_file(
-        dataset_module.importable_file_path
-    ) if dataset_module.importable_file_path else nullcontext():
+    with (
+        lock_importable_file(dataset_module.importable_file_path)
+        if dataset_module.importable_file_path
+        else nullcontext()
+    ):
         builder_cls = import_main_class(dataset_module.module_path)
         if dataset_module.builder_configs_parameters.builder_configs:
             dataset_name = dataset_name or dataset_module.builder_kwargs.get("dataset_name")
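The old form chained a conditional expression directly onto the `with` keyword; the new form wraps it in parentheses for readability. In isolation, the underlying idiom of taking a lock only when there is something to lock and falling back to `nullcontext()` looks like this (the lock and flag below are illustrative stand-ins, not the library's objects):

from contextlib import nullcontext
from threading import Lock

lock = Lock()    # stand-in for lock_importable_file(...)
use_lock = True  # stand-in for "is there an importable file path?"

# Acquire the lock only when needed; otherwise enter a no-op context
with lock if use_lock else nullcontext():
    pass  # critical section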
@@ -1751,42 +1753,36 @@ def load_dataset_builder(
     _require_default_config_name=True,
     **config_kwargs,
 ) -> DatasetBuilder:
-    """Load a dataset builder from the Hugging Face Hub, or a local dataset. A dataset builder can be used to inspect general information that is required to build a dataset (cache directory, config, dataset info, etc.)
-    without downloading the dataset itself.
-
-    You can find the list of datasets on the [Hub](https://huggingface.co/datasets) or with [`huggingface_hub.list_datasets`].
+    """Load a dataset builder which can be used to:

-    A dataset is a directory that contains:
+    - Inspect general information that is required to build a dataset (cache directory, config, dataset info, features, data files, etc.)
+    - Download and prepare the dataset as Arrow files in the cache
+    - Get a streaming dataset without downloading or caching anything

-    - some data files in generic formats (JSON, CSV, Parquet, text, etc.)
-    - and optionally a dataset script, if it requires some code to read the data files. This is used to load any kind of formats or structures.
+    You can find the list of datasets on the [Hub](https://huggingface.co/datasets) or with [`huggingface_hub.list_datasets`].

-    Note that dataset scripts can also download and read data files from anywhere - in case your data files already exist online.
+    A dataset is a directory that contains some data files in generic formats (JSON, CSV, Parquet, etc.) and possibly
+    in a generic structure (Webdataset, ImageFolder, AudioFolder, VideoFolder, etc.)

     Args:

         path (`str`):
             Path or name of the dataset.
-            Depending on `path`, the dataset builder that is used comes from a generic dataset script (JSON, CSV, Parquet, text etc.) or from the dataset script (a python file) inside the dataset directory.

-            For local datasets:
+            - if `path` is a dataset repository on the HF hub (list all available datasets with [`huggingface_hub.list_datasets`])
+              -> load the dataset builder from supported files in the repository (csv, json, parquet, etc.)
+              e.g. `'username/dataset_name'`, a dataset repository on the HF hub containing the data files.

-            - if `path` is a local directory (containing data files only)
-              -> load a generic dataset builder (csv, json, text etc.) based on the content of the directory
+            - if `path` is a local directory
+              -> load the dataset builder from supported files in the directory (csv, json, parquet, etc.)
               e.g. `'./path/to/directory/with/my/csv/data'`.
-            - if `path` is a local dataset script or a directory containing a local dataset script (if the script has the same name as the directory)
-              -> load the dataset builder from the dataset script
-              e.g. `'./dataset/squad'` or `'./dataset/squad/squad.py'`.
-
-            For datasets on the Hugging Face Hub (list all available datasets with [`huggingface_hub.list_datasets`])

-            - if `path` is a dataset repository on the HF hub (containing data files only)
-              -> load a generic dataset builder (csv, text etc.) based on the content of the repository
-              e.g. `'username/dataset_name'`, a dataset repository on the HF hub containing your data files.
-            - if `path` is a dataset repository on the HF hub with a dataset script (if the script has the same name as the directory)
-              -> load the dataset builder from the dataset script in the dataset repository
-              e.g. `glue`, `squad`, `'username/dataset_name'`, a dataset repository on the HF hub containing a dataset script `'dataset_name.py'`.
+            - if `path` is the name of a dataset builder and `data_files` or `data_dir` is specified
+              (available builders are "json", "csv", "parquet", "arrow", "text", "xml", "webdataset", "imagefolder", "audiofolder", "videofolder")
+              -> load the dataset builder from the files in `data_files` or `data_dir`
+              e.g. `'parquet'`.

+            It can also point to a local dataset script but this is not recommended.
         name (`str`, *optional*):
             Defining the name of the dataset configuration.
         data_dir (`str`, *optional*):
@@ -1837,7 +1833,7 @@ def load_dataset_builder(

     ```py
     >>> from datasets import load_dataset_builder
-    >>> ds_builder = load_dataset_builder('rotten_tomatoes')
+    >>> ds_builder = load_dataset_builder('cornell-movie-review-data/rotten_tomatoes')
     >>> ds_builder.info.features
     {'label': ClassLabel(num_classes=2, names=['neg', 'pos'], id=None),
      'text': Value(dtype='string', id=None)}
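Beyond inspecting `info.features`, the rewritten docstring notes that a builder can also materialize or stream the data. A brief sketch of that flow using the standard `DatasetBuilder` methods (the split name follows the rotten_tomatoes dataset; details of the cache location are left to defaults):

from datasets import load_dataset_builder

builder = load_dataset_builder("cornell-movie-review-data/rotten_tomatoes")

# Download and prepare the dataset as Arrow files in the local cache
builder.download_and_prepare()
train_ds = builder.as_dataset(split="train")

# Or get a streaming dataset without downloading or caching anything
streamed = builder.as_streaming_dataset(split="train")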
@@ -1931,61 +1927,55 @@ def load_dataset(

     You can find the list of datasets on the [Hub](https://huggingface.co/datasets) or with [`huggingface_hub.list_datasets`].

-    A dataset is a directory that contains:
-
-    - some data files in generic formats (JSON, CSV, Parquet, text, etc.).
-    - and optionally a dataset script, if it requires some code to read the data files. This is used to load any kind of formats or structures.
-
-    Note that dataset scripts can also download and read data files from anywhere - in case your data files already exist online.
+    A dataset is a directory that contains some data files in generic formats (JSON, CSV, Parquet, etc.) and possibly
+    in a generic structure (Webdataset, ImageFolder, AudioFolder, VideoFolder, etc.)

     This function does the following under the hood:

-        1. Download and import in the library the dataset script from `path` if it's not already cached inside the library.
+        1. Load a dataset builder:

-            If the dataset has no dataset script, then a generic dataset script is imported instead (JSON, CSV, Parquet, text, etc.)
+            * Find the most common data format in the dataset and pick its associated builder (JSON, CSV, Parquet, Webdataset, ImageFolder, AudioFolder, etc.)
+            * Find which file goes into which split (e.g. train/test) based on file and directory names or on the YAML configuration
+            * It is also possible to specify `data_files` manually, and which dataset builder to use (e.g. "parquet").

-            Dataset scripts are small python scripts that define dataset builders. They define the citation, info and format of the dataset,
-            contain the path or URL to the original data files and the code to load examples from the original data files.
+        2. Run the dataset builder:

-            You can find the complete list of datasets in the Datasets [Hub](https://huggingface.co/datasets).
+            In the general case:

-        2. Run the dataset script which will:
-
-            * Download the dataset file from the original URL (see the script) if it's not already available locally or cached.
+            * Download the data files from the dataset if they are not already available locally or cached.
             * Process and cache the dataset in typed Arrow tables for caching.

             Arrow table are arbitrarily long, typed tables which can store nested objects and be mapped to numpy/pandas/python generic types.
             They can be directly accessed from disk, loaded in RAM or even streamed over the web.

+            In the streaming case:
+
+            * Don't download or cache anything. Instead, the dataset is lazily loaded and will be streamed on-the-fly when iterating on it.
+
         3. Return a dataset built from the requested splits in `split` (default: all).

-    It also allows to load a dataset from a local directory or a dataset repository on the Hugging Face Hub without dataset script.
-    In this case, it automatically loads all the data files from the directory or the dataset repository.
+    It can also use a custom dataset builder if the dataset contains a dataset script, but this feature is mostly for backward compatibility.
+    In this case the dataset script file must be named after the dataset repository or directory and end with ".py".

     Args:

         path (`str`):
             Path or name of the dataset.
-            Depending on `path`, the dataset builder that is used comes from a generic dataset script (JSON, CSV, Parquet, text etc.) or from the dataset script (a python file) inside the dataset directory.

-            For local datasets:
+            - if `path` is a dataset repository on the HF hub (list all available datasets with [`huggingface_hub.list_datasets`])
+              -> load the dataset from supported files in the repository (csv, json, parquet, etc.)
+              e.g. `'username/dataset_name'`, a dataset repository on the HF hub containing the data files.

-            - if `path` is a local directory (containing data files only)
-              -> load a generic dataset builder (csv, json, text etc.) based on the content of the directory
+            - if `path` is a local directory
+              -> load the dataset from supported files in the directory (csv, json, parquet, etc.)
               e.g. `'./path/to/directory/with/my/csv/data'`.
-            - if `path` is a local dataset script or a directory containing a local dataset script (if the script has the same name as the directory)
-              -> load the dataset builder from the dataset script
-              e.g. `'./dataset/squad'` or `'./dataset/squad/squad.py'`.

-            For datasets on the Hugging Face Hub (list all available datasets with [`huggingface_hub.list_datasets`])
-
-            - if `path` is a dataset repository on the HF hub (containing data files only)
-              -> load a generic dataset builder (csv, text etc.) based on the content of the repository
-              e.g. `'username/dataset_name'`, a dataset repository on the HF hub containing your data files.
-            - if `path` is a dataset repository on the HF hub with a dataset script (if the script has the same name as the directory)
-              -> load the dataset builder from the dataset script in the dataset repository
-              e.g. `glue`, `squad`, `'username/dataset_name'`, a dataset repository on the HF hub containing a dataset script `'dataset_name.py'`.
+            - if `path` is the name of a dataset builder and `data_files` or `data_dir` is specified
+              (available builders are "json", "csv", "parquet", "arrow", "text", "xml", "webdataset", "imagefolder", "audiofolder", "videofolder")
+              -> load the dataset from the files in `data_files` or `data_dir`
+              e.g. `'parquet'`.

+            It can also point to a local dataset script but this is not recommended.
         name (`str`, *optional*):
             Defining the name of the dataset configuration.
         data_dir (`str`, *optional*):
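To make the three `path` behaviours above concrete, here is a short illustrative sketch; the repository, directory, and file names are placeholders taken from the docstring, and only "parquet" is a real builder name:

from datasets import load_dataset

# 1. A dataset repository on the Hub: the builder and splits are inferred from the repo contents
ds = load_dataset("username/dataset_name", split="train")

# 2. A local directory: the builder is inferred from the files it contains
ds = load_dataset("./path/to/directory/with/my/csv/data")

# 3. An explicit builder name plus data_files (or data_dir)
ds = load_dataset("parquet", data_files={"train": "train.parquet"})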
@@ -2072,11 +2062,18 @@ def load_dataset(

     ```py
     >>> from datasets import load_dataset
-    >>> ds = load_dataset('rotten_tomatoes', split='train')
+    >>> ds = load_dataset('cornell-movie-review-data/rotten_tomatoes', split='train')

-    # Map data files to splits
+    # Load a subset or dataset configuration (here 'sst2')
+    >>> from datasets import load_dataset
+    >>> ds = load_dataset('nyu-mll/glue', 'sst2', split='train')
+
+    # Manual mapping of data files to splits
     >>> data_files = {'train': 'train.csv', 'test': 'test.csv'}
     >>> ds = load_dataset('namespace/your_dataset_name', data_files=data_files)
+
+    # Manual selection of a directory to load
+    >>> ds = load_dataset('namespace/your_dataset_name', data_dir='folder_name')
     ```

     Load a local dataset:
@@ -2090,7 +2087,7 @@ def load_dataset(
     >>> from datasets import load_dataset
     >>> ds = load_dataset('json', data_files='path/to/local/my_dataset.json')

-    # Load from a local loading script
+    # Load from a local loading script (not recommended)
     >>> from datasets import load_dataset
     >>> ds = load_dataset('path/to/local/loading_script/loading_script.py', split='train')
     ```
@@ -2099,7 +2096,7 @@ def load_dataset(

     ```py
     >>> from datasets import load_dataset
-    >>> ds = load_dataset('rotten_tomatoes', split='train', streaming=True)
+    >>> ds = load_dataset('cornell-movie-review-data/rotten_tomatoes', split='train', streaming=True)
     ```

     Load an image dataset with the `ImageFolder` dataset builder:
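As the updated docstring describes, the streaming path downloads nothing up front. A small sketch of iterating such a dataset; `take()` is standard `IterableDataset` API, and the printed fields are specific to rotten_tomatoes:

from datasets import load_dataset

ds = load_dataset("cornell-movie-review-data/rotten_tomatoes", split="train", streaming=True)

# Nothing is cached; examples are fetched lazily while iterating
for example in ds.take(3):
    print(example["label"], example["text"][:40])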

tests/test_arrow_dataset.py

Lines changed: 24 additions & 17 deletions
@@ -2717,9 +2717,11 @@ def test_format_vectors(self, in_memory):
         import tensorflow as tf
         import torch

-        with tempfile.TemporaryDirectory() as tmp_dir, self._create_dummy_dataset(
-            in_memory, tmp_dir
-        ) as dset, dset.map(lambda ex, i: {"vec": np.ones(3) * i}, with_indices=True) as dset:
+        with (
+            tempfile.TemporaryDirectory() as tmp_dir,
+            self._create_dummy_dataset(in_memory, tmp_dir) as dset,
+            dset.map(lambda ex, i: {"vec": np.ones(3) * i}, with_indices=True) as dset,
+        ):
             columns = dset.column_names

             self.assertIsNotNone(dset[0])
@@ -2770,9 +2772,11 @@ def test_format_ragged_vectors(self, in_memory):
         import tensorflow as tf
         import torch

-        with tempfile.TemporaryDirectory() as tmp_dir, self._create_dummy_dataset(
-            in_memory, tmp_dir
-        ) as dset, dset.map(lambda ex, i: {"vec": np.ones(3 + i) * i}, with_indices=True) as dset:
+        with (
+            tempfile.TemporaryDirectory() as tmp_dir,
+            self._create_dummy_dataset(in_memory, tmp_dir) as dset,
+            dset.map(lambda ex, i: {"vec": np.ones(3 + i) * i}, with_indices=True) as dset,
+        ):
             columns = dset.column_names

             self.assertIsNotNone(dset[0])
@@ -2830,9 +2834,11 @@ def test_format_nested(self, in_memory):
         import tensorflow as tf
         import torch

-        with tempfile.TemporaryDirectory() as tmp_dir, self._create_dummy_dataset(
-            in_memory, tmp_dir
-        ) as dset, dset.map(lambda ex: {"nested": [{"foo": np.ones(3)}] * len(ex["filename"])}, batched=True) as dset:
+        with (
+            tempfile.TemporaryDirectory() as tmp_dir,
+            self._create_dummy_dataset(in_memory, tmp_dir) as dset,
+            dset.map(lambda ex: {"nested": [{"foo": np.ones(3)}] * len(ex["filename"])}, batched=True) as dset,
+        ):
             self.assertDictEqual(
                 dset.features, Features({"filename": Value("string"), "nested": {"foo": Sequence(Value("float64"))}})
             )
@@ -3224,11 +3230,11 @@ def test_concatenate_mixed_memory_and_disk(self):
         info1 = DatasetInfo(description="Dataset1")
         info2 = DatasetInfo(description="Dataset2")
         with tempfile.TemporaryDirectory() as tmp_dir:
-            with Dataset.from_dict(data1, info=info1).map(
-                cache_file_name=os.path.join(tmp_dir, "d1.arrow")
-            ) as dset1, Dataset.from_dict(data2, info=info2).map(
-                cache_file_name=os.path.join(tmp_dir, "d2.arrow")
-            ) as dset2, Dataset.from_dict(data3) as dset3:
+            with (
+                Dataset.from_dict(data1, info=info1).map(cache_file_name=os.path.join(tmp_dir, "d1.arrow")) as dset1,
+                Dataset.from_dict(data2, info=info2).map(cache_file_name=os.path.join(tmp_dir, "d2.arrow")) as dset2,
+                Dataset.from_dict(data3) as dset3,
+            ):
                 with concatenate_datasets([dset1, dset2, dset3]) as concatenated_dset:
                     self.assertEqual(len(concatenated_dset), len(dset1) + len(dset2) + len(dset3))
                     self.assertListEqual(concatenated_dset["id"], dset1["id"] + dset2["id"] + dset3["id"])
@@ -4130,9 +4136,10 @@ def test_dataset_to_json(dataset, tmp_path):
 )
 def test_pickle_dataset_after_transforming_the_table(in_memory, method_and_params, arrow_file):
     method, args, kwargs = method_and_params
-    with Dataset.from_file(arrow_file, in_memory=in_memory) as dataset, Dataset.from_file(
-        arrow_file, in_memory=in_memory
-    ) as reference_dataset:
+    with (
+        Dataset.from_file(arrow_file, in_memory=in_memory) as dataset,
+        Dataset.from_file(arrow_file, in_memory=in_memory) as reference_dataset,
+    ):
         out = getattr(dataset, method)(*args, **kwargs)
         dataset = out if out is not None else dataset
         pickled_dataset = pickle.dumps(dataset)
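All of the test changes above swap multi-context `with` statements that had to wrap through call parentheses for the parenthesized multi-context form, which the Python 3.8 parser rejects but 3.9+ accepts (it became officially documented grammar in 3.10). A standalone sketch with illustrative context managers, not taken from the test suite:

import tempfile
from contextlib import nullcontext

# Old style: line breaks had to ride on the call parentheses
with tempfile.TemporaryDirectory() as tmp_dir, nullcontext(
    tmp_dir
) as alias:
    print(tmp_dir, alias)

# New style used throughout this commit: one context manager per line,
# grouped in parentheses (SyntaxError on Python 3.8, fine on 3.9+)
with (
    tempfile.TemporaryDirectory() as tmp_dir,
    nullcontext(tmp_dir) as alias,
):
    print(tmp_dir, alias)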
