Skip to content

Commit a1cb66b

Browse files
Fix: update fetching a bucket from MinIO (#1314)
* Update fetching a bucket from MinIO. Previously, each dataset had its own bucket: https://openml1.win.tue.nl/datasets61/dataset_61.pq However, we were advised to reduce the number of buckets and to favor hosting many objects in a hierarchical structure, so we now use prefixes to divide the dataset objects into separate subdirectories: https://openml1.win.tue.nl/datasets/0000/0061/dataset_61.pq This commit has bypassed pre-commit. Tests should be updated too. * ci: Trigger ci * ci: Add some files to .gitignore --------- Co-authored-by: PGijsbers <[email protected]>
1 parent decc7a8 commit a1cb66b

File tree

3 files changed

+13
-7
lines changed

3 files changed

+13
-7
lines changed

.gitignore

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,14 @@ doc/auto_examples/
1717
doc/modules/generated/
1818
doc/datasets/generated/
1919

20+
# Cache, lock and pickle files that appear to be generated while running the tests
21+
tests/files/org/openml/test/datasets/1/
22+
tests/files/org/openml/test/datasets/2/features.xml.pkl
23+
tests/files/org/openml/test/datasets/2/qualities.xml.pkl
24+
tests/files/org/openml/test/locks/
25+
tests/files/org/openml/test/tasks/1/datasplits.pkl.py3
26+
tests/files/org/openml/test/tasks/1882/datasplits.pkl.py3
27+
2028
# Distribution / packaging
2129

2230
.Python

openml/_api_calls.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -193,17 +193,18 @@ def _download_minio_bucket(source: str, destination: str | Path) -> None:
193193
parsed_url = urllib.parse.urlparse(source)
194194

195195
# expect path format: /BUCKET/path/to/file.ext
196-
bucket = parsed_url.path[1:]
196+
_, bucket, *prefixes, _file = parsed_url.path.split("/")
197+
prefix = "/".join(prefixes)
197198

198199
client = minio.Minio(endpoint=parsed_url.netloc, secure=False)
199200

200-
for file_object in client.list_objects(bucket, recursive=True):
201+
for file_object in client.list_objects(bucket, prefix=prefix, recursive=True):
201202
if file_object.object_name is None:
202203
raise ValueError("Object name is None.")
203204

204205
_download_minio_file(
205-
source=source + "/" + file_object.object_name,
206-
destination=Path(destination, file_object.object_name),
206+
source=source.rsplit("/", 1)[0] + "/" + file_object.object_name.rsplit("/", 1)[1],
207+
destination=Path(destination, file_object.object_name.rsplit("/", 1)[1]),
207208
exists_ok=True,
208209
)
209210

openml/datasets/functions.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1264,9 +1264,6 @@ def _get_dataset_parquet(
12641264
# For now, it would be the only way for the user to fetch the additional
12651265
# files in the bucket (no function exists on an OpenMLDataset to do this).
12661266
if download_all_files:
1267-
if url.endswith(".pq"):
1268-
url, _ = url.rsplit("/", maxsplit=1)
1269-
12701267
openml._api_calls._download_minio_bucket(source=url, destination=cache_directory)
12711268

12721269
if not output_file_path.is_file():

0 commit comments

Comments
 (0)