
Commit 580b536

Download all files (#1188)

Authored by PGijsbers and mfeurer
* Towards downloading buckets
* Download entire bucket instead of dataset file
* Don't download arff, skip files already cached
* Automatically unzip any downloaded archives
* Make downloading the bucket optional. Additionally, rename old cached files to the new filename format.
* Allow users to download the full bucket when the parquet file is already cached. Otherwise the only way would be to delete the cache.
* Add unit test stub
* Remove redundant try/catch
* Remove commented-out print statement
* Still download arff
* ADD: download all files from minio bucket
* Add note for #1184
* Fix pre-commit issues (mypy, flake)

Co-authored-by: Matthias Feurer <[email protected]>
1 parent: 1dfe398
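In short, this commit adds an opt-in, experimental ``download_all_files`` flag to ``openml.datasets.get_dataset``. A minimal usage sketch (the dataset id below is only a placeholder):

import openml

# EXPERIMENTAL: also fetch auxiliary files from the dataset's MinIO
# bucket instead of only the parquet file; emits a UserWarning by design.
dataset = openml.datasets.get_dataset(2, download_all_files=True)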

File tree: 4 files changed, +91 −10 lines

doc/progress.rst

Lines changed: 2 additions & 0 deletions
@@ -12,10 +12,12 @@ Changelog
 * FIX#1058, #1100: Avoid ``NoneType`` error when printing task without ``class_labels`` attribute.
 * FIX#1110: Make arguments to ``create_study`` and ``create_suite`` that are defined as optional by the OpenML XSD actually optional.
 * FIX#1147: ``openml.flow.flow_exists`` no longer requires an API key.
+* FIX#1184: Automatically resolve proxies when downloading from minio. Turn this off by setting environment variable ``no_proxy="*"``.
 * MAIN#1088: Do CI for Windows on Github Actions instead of Appveyor.
 * MAINT#1104: Fix outdated docstring for ``list_task``.
 * MAIN#1146: Update the pre-commit dependencies.
 * ADD#1103: Add a ``predictions`` property to OpenMLRun for easy accessibility of prediction data.
+* ADD#1188: EXPERIMENTAL. Allow downloading all files from a minio bucket with ``download_all_files=True`` for ``get_dataset``.
 
 
 0.12.2
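The FIX#1184 entry documents an escape hatch for the new automatic proxy resolution. A sketch of opting out, assuming the variable is set before any download is triggered:

import os

# Disable proxy resolution for all hosts, including minio downloads,
# as documented in the changelog entry for #1184.
os.environ["no_proxy"] = "*"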

openml/_api_calls.py

Lines changed: 40 additions & 5 deletions
@@ -12,6 +12,7 @@
 import xmltodict
 from urllib3 import ProxyManager
 from typing import Dict, Optional, Union
+import zipfile
 
 import minio
 
@@ -44,6 +45,7 @@ def resolve_env_proxies(url: str) -> Optional[str]:
     selected_proxy = requests.utils.select_proxy(url, resolved_proxies)
     return selected_proxy
 
+
 def _create_url_from_endpoint(endpoint: str) -> str:
     url = config.server
     if not url.endswith("/"):
@@ -137,18 +139,18 @@ def _download_minio_file(
 
     proxy_client = ProxyManager(proxy) if proxy else None
 
-    client = minio.Minio(
-        endpoint=parsed_url.netloc,
-        secure=False,
-        http_client=proxy_client
-    )
+    client = minio.Minio(endpoint=parsed_url.netloc, secure=False, http_client=proxy_client)
 
     try:
         client.fget_object(
             bucket_name=bucket,
             object_name=object_name,
             file_path=str(destination),
         )
+        if destination.is_file() and destination.suffix == ".zip":
+            with zipfile.ZipFile(destination, "r") as zip_ref:
+                zip_ref.extractall(destination.parent)
+
     except minio.error.S3Error as e:
         if e.message.startswith("Object does not exist"):
             raise FileNotFoundError(f"Object at '{source}' does not exist.") from e
@@ -157,6 +159,39 @@ def _download_minio_file(
             raise FileNotFoundError("Bucket does not exist or is private.") from e
 
 
+def _download_minio_bucket(
+    source: str,
+    destination: Union[str, pathlib.Path],
+    exists_ok: bool = True,
+) -> None:
+    """Download all files in the MinIO bucket ``source`` and store them at ``destination``.
+
+    Parameters
+    ----------
+    source : Union[str, pathlib.Path]
+        URL to a MinIO bucket.
+    destination : str
+        Path to a directory to store the bucket content in.
+    exists_ok : bool, optional (default=True)
+        If False, raise FileExists if a file already exists in ``destination``.
+    """
+
+    destination = pathlib.Path(destination)
+    parsed_url = urllib.parse.urlparse(source)
+
+    # expect path format: /BUCKET/path/to/file.ext
+    bucket = parsed_url.path[1:]
+
+    client = minio.Minio(endpoint=parsed_url.netloc, secure=False)
+
+    for file_object in client.list_objects(bucket, recursive=True):
+        _download_minio_file(
+            source=source + "/" + file_object.object_name,
+            destination=pathlib.Path(destination, file_object.object_name),
+            exists_ok=exists_ok,
+        )
+
+
 def _download_text_file(
     source: str,
     output_path: Optional[str] = None,
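``_download_minio_bucket`` is a private helper that lists every object in the bucket and funnels each one through ``_download_minio_file``, which in turn auto-extracts any ``.zip`` it fetched. A sketch of calling it directly; the bucket URL and destination are hypothetical, and in practice the URL is derived from a dataset description's ``oml:minio_url`` field with the file name stripped:

import pathlib
import openml._api_calls

# Hypothetical bucket URL; real values come from ``oml:minio_url``.
bucket_url = "http://minio.example.org/dataset20"

# Downloads every object in the bucket into the destination directory,
# unzipping any archives next to where they land.
openml._api_calls._download_minio_bucket(
    source=bucket_url,
    destination=pathlib.Path.home() / ".openml" / "dataset20",
)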

openml/datasets/functions.py

Lines changed: 40 additions & 5 deletions
@@ -5,6 +5,7 @@
 import os
 from pyexpat import ExpatError
 from typing import List, Dict, Union, Optional, cast
+import warnings
 
 import numpy as np
 import arff
@@ -356,6 +357,7 @@ def get_dataset(
     error_if_multiple: bool = False,
     cache_format: str = "pickle",
     download_qualities: bool = True,
+    download_all_files: bool = False,
 ) -> OpenMLDataset:
     """Download the OpenML dataset representation, optionally also download actual data file.
 
@@ -389,11 +391,20 @@
         no.of.rows is very high.
     download_qualities : bool (default=True)
         Option to download 'qualities' meta-data in addition to the minimal dataset description.
+    download_all_files: bool (default=False)
+        EXPERIMENTAL. Download all files related to the dataset that reside on the server.
+        Useful for datasets which refer to auxiliary files (e.g., meta-album).
+
     Returns
     -------
     dataset : :class:`openml.OpenMLDataset`
         The downloaded dataset.
     """
+    if download_all_files:
+        warnings.warn(
+            "``download_all_files`` is experimental and is likely to break with new releases."
+        )
+
     if cache_format not in ["feather", "pickle"]:
         raise ValueError(
             "cache_format must be one of 'feather' or 'pickle'. "
@@ -434,7 +445,12 @@
 
     arff_file = _get_dataset_arff(description) if download_data else None
     if "oml:minio_url" in description and download_data:
-        parquet_file = _get_dataset_parquet(description)
+        try:
+            parquet_file = _get_dataset_parquet(
+                description, download_all_files=download_all_files
+            )
+        except urllib3.exceptions.MaxRetryError:
+            parquet_file = None
     else:
         parquet_file = None
     remove_dataset_cache = False
@@ -967,7 +983,9 @@ def _get_dataset_description(did_cache_dir, dataset_id):
 
 
 def _get_dataset_parquet(
-    description: Union[Dict, OpenMLDataset], cache_directory: str = None
+    description: Union[Dict, OpenMLDataset],
+    cache_directory: str = None,
+    download_all_files: bool = False,
 ) -> Optional[str]:
     """Return the path to the local parquet file of the dataset. If it is not cached, it is downloaded.
 
@@ -987,23 +1005,40 @@
         Folder to store the parquet file in.
         If None, use the default cache directory for the dataset.
 
+    download_all_files: bool, optional (default=False)
+        If `True`, download all data found in the bucket to which the description's
+        ``minio_url`` points, only download the parquet file otherwise.
+
     Returns
     -------
     output_filename : string, optional
         Location of the Parquet file if successfully downloaded, None otherwise.
     """
     if isinstance(description, dict):
-        url = description.get("oml:minio_url")
+        url = cast(str, description.get("oml:minio_url"))
         did = description.get("oml:id")
     elif isinstance(description, OpenMLDataset):
-        url = description._minio_url
+        url = cast(str, description._minio_url)
         did = description.dataset_id
     else:
         raise TypeError("`description` should be either OpenMLDataset or Dict.")
 
     if cache_directory is None:
         cache_directory = _create_cache_directory_for_id(DATASETS_CACHE_DIR_NAME, did)
-    output_file_path = os.path.join(cache_directory, "dataset.pq")
+    output_file_path = os.path.join(cache_directory, f"dataset_{did}.pq")
+
+    old_file_path = os.path.join(cache_directory, "dataset.pq")
+    if os.path.isfile(old_file_path):
+        os.rename(old_file_path, output_file_path)
+
+    # For this release, we want to be able to force a new download even if the
+    # parquet file is already present when ``download_all_files`` is set.
+    # For now, it would be the only way for the user to fetch the additional
+    # files in the bucket (no function exists on an OpenMLDataset to do this).
+    if download_all_files:
+        if url.endswith(".pq"):
+            url, _ = url.rsplit("/", maxsplit=1)
+        openml._api_calls._download_minio_bucket(source=cast(str, url), destination=cache_directory)
 
     if not os.path.isfile(output_file_path):
        try:
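Because ``get_dataset`` warns unconditionally when ``download_all_files=True``, a caller who accepts the instability can silence it locally; a minimal sketch (the dataset id is a placeholder):

import warnings

import openml

with warnings.catch_warnings():
    # The flag is experimental by design; suppress only its UserWarning
    # within this block.
    warnings.simplefilter("ignore")
    dataset = openml.datasets.get_dataset(2, download_all_files=True)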

tests/test_datasets/test_dataset_functions.py

Lines changed: 9 additions & 0 deletions
@@ -322,6 +322,15 @@ def test_get_dataset_by_name(self):
         openml.config.server = self.production_server
         self.assertRaises(OpenMLPrivateDatasetError, openml.datasets.get_dataset, 45)
 
+    @pytest.mark.skip("Feature is experimental, can not test against stable server.")
+    def test_get_dataset_download_all_files(self):
+        # openml.datasets.get_dataset(id, download_all_files=True)
+        # check for expected files
+        # checking that no additional files are downloaded if
+        # the default (false) is used, seems covered by
+        # test_get_dataset_lazy
+        raise NotImplementedError
+
     def test_get_dataset_uint8_dtype(self):
         dataset = openml.datasets.get_dataset(1)
         self.assertEqual(type(dataset), OpenMLDataset)
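For reference, one possible shape for the stub once it can run against a suitable test server; the dataset id and file checks are illustrative assumptions (not part of the commit), and ``pathlib`` is assumed to be imported at module level:

@pytest.mark.skip("Feature is experimental, can not test against stable server.")
def test_get_dataset_download_all_files(self):
    # Illustrative sketch: ids and expected files are placeholders.
    dataset = openml.datasets.get_dataset(2, download_all_files=True)
    cache_dir = pathlib.Path(dataset.data_file).parent
    # At minimum the parquet file should now be present in the cache.
    assert any(f.suffix == ".pq" for f in cache_dir.iterdir())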
