Skip to content

Commit a3e57bb

Browse files
authored
Remove archive after it is extracted to save disk space (#1351)
* Remove archive after it is extracted to save disk space
* Leave a marker after removing archive to avoid redownload
* Automatic refresh if expected marker is absent
* Be consistent about syntax use for path construction
1 parent 7764ddb commit a3e57bb

File tree

1 file changed

+19
-5
lines changed

1 file changed

+19
-5
lines changed

openml/_api_calls.py

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import logging
77
import math
88
import random
9+
import shutil
910
import time
1011
import urllib.parse
1112
import xml
@@ -186,14 +187,14 @@ def _download_minio_file(
186187
def _download_minio_bucket(source: str, destination: str | Path) -> None:
187188
"""Download file ``source`` from a MinIO Bucket and store it at ``destination``.
188189
190+
Does not redownload files which already exist.
191+
189192
Parameters
190193
----------
191194
source : str
192195
URL to a MinIO bucket.
193196
destination : str | Path
194197
Path to a directory to store the bucket content in.
195-
exists_ok : bool, optional (default=True)
196-
If False, raise FileExists if a file already exists in ``destination``.
197198
"""
198199
destination = Path(destination)
199200
parsed_url = urllib.parse.urlparse(source)
@@ -206,15 +207,28 @@ def _download_minio_bucket(source: str, destination: str | Path) -> None:
206207

207208
for file_object in client.list_objects(bucket, prefix=prefix, recursive=True):
208209
if file_object.object_name is None:
209-
raise ValueError("Object name is None.")
210+
raise ValueError(f"Object name is None for object {file_object!r}")
210211

211-
with contextlib.suppress(FileExistsError): # Simply use cached version instead
212+
marker = destination / file_object.etag
213+
if marker.exists():
214+
continue
215+
216+
file_destination = destination / file_object.object_name.rsplit("/", 1)[1]
217+
if (file_destination.parent / file_destination.stem).exists():
218+
# Marker is missing but archive exists means the server archive changed, force a refresh
219+
shutil.rmtree(file_destination.parent / file_destination.stem)
220+
221+
with contextlib.suppress(FileExistsError):
212222
_download_minio_file(
213223
source=source.rsplit("/", 1)[0] + "/" + file_object.object_name.rsplit("/", 1)[1],
214-
destination=Path(destination, file_object.object_name.rsplit("/", 1)[1]),
224+
destination=file_destination,
215225
exists_ok=False,
216226
)
217227

228+
if file_destination.is_file() and file_destination.suffix == ".zip":
229+
file_destination.unlink()
230+
marker.touch()
231+
218232

219233
def _download_text_file(
220234
source: str,

0 commit comments

Comments
 (0)