Fix: update fetching a bucket from MinIO (#1314)

eddiebergman · PGijsbers · web-flow · commit a1cb66bb3c07 · 2024-01-17T15:26:22.000+01:00
* Update fetching a bucket from MinIO. Previously, each dataset had its own bucket: https://openml1.win.tue.nl/datasets61/dataset_61.pq However, we were advised to reduce the number of buckets and to favor hosting many objects in a hierarchical structure, so we now use prefixes to divide the dataset objects into separate subdirectories: https://openml1.win.tue.nl/datasets/0000/0061/dataset_61.pq This commit has bypassed pre-commit. Tests should be updated too. * ci: Trigger ci * ci: Add some files to .gitignore --------- Co-authored-by: PGijsbers <p.gijsbers@tue.nl>
diff --git a/.gitignore b/.gitignore
@@ -17,6 +17,14 @@ doc/auto_examples/
 doc/modules/generated/
 doc/datasets/generated/
 
+# Cached pickle files and lock directories that appear to be left behind by test runs (origin unconfirmed)
+tests/files/org/openml/test/datasets/1/
+tests/files/org/openml/test/datasets/2/features.xml.pkl
+tests/files/org/openml/test/datasets/2/qualities.xml.pkl
+tests/files/org/openml/test/locks/
+tests/files/org/openml/test/tasks/1/datasplits.pkl.py3
+tests/files/org/openml/test/tasks/1882/datasplits.pkl.py3
+
 # Distribution / packaging
 
 .Python
diff --git a/openml/_api_calls.py b/openml/_api_calls.py
@@ -193,17 +193,18 @@ def _download_minio_bucket(source: str, destination: str | Path) -> None:
     parsed_url = urllib.parse.urlparse(source)
 
     # expect path format: /BUCKET/path/to/file.ext
-    bucket = parsed_url.path[1:]
+    _, bucket, *prefixes, _file = parsed_url.path.split("/")
+    prefix = "/".join(prefixes)
 
     client = minio.Minio(endpoint=parsed_url.netloc, secure=False)
 
-    for file_object in client.list_objects(bucket, recursive=True):
+    for file_object in client.list_objects(bucket, prefix=prefix, recursive=True):
         if file_object.object_name is None:
             raise ValueError("Object name is None.")
 
         _download_minio_file(
-            source=source + "/" + file_object.object_name,
-            destination=Path(destination, file_object.object_name),
+            source=source.rsplit("/", 1)[0] + "/" + file_object.object_name.rsplit("/", 1)[1],
+            destination=Path(destination, file_object.object_name.rsplit("/", 1)[1]),
             exists_ok=True,
         )
 
diff --git a/openml/datasets/functions.py b/openml/datasets/functions.py
@@ -1264,9 +1264,6 @@ def _get_dataset_parquet(
     # For now, it would be the only way for the user to fetch the additional
     # files in the bucket (no function exists on an OpenMLDataset to do this).
     if download_all_files:
-        if url.endswith(".pq"):
-            url, _ = url.rsplit("/", maxsplit=1)
-
         openml._api_calls._download_minio_bucket(source=url, destination=cache_directory)
 
     if not output_file_path.is_file():