Skip to content

Commit 07e9b9c

Browse files
authored
Add/1034 (#1352) dataset lazy loading default
* Towards lazy-by-default for dataset loading
* Isolate lazy behavior to pytest function outside of class
* Solve concurrency issue where test would use same cache
* Ensure metadata is downloaded to verify dataset is processed
* Clean up to reflect new defaults and tests
* Fix oversight from 1335
* Download data as was 0.14 behavior
* Restore test
* Formatting
* Test obsolete, replaced by test_get_dataset_lazy_behavior
1 parent 1d707e6 commit 07e9b9c

File tree

5 files changed

+282
-201
lines changed

5 files changed

+282
-201
lines changed

openml/datasets/functions.py

Lines changed: 9 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -416,8 +416,8 @@ def _name_to_id(
416416

417417
def get_datasets(
418418
dataset_ids: list[str | int],
419-
download_data: bool = True, # noqa: FBT001, FBT002
420-
download_qualities: bool = True, # noqa: FBT001, FBT002
419+
download_data: bool = False, # noqa: FBT001, FBT002
420+
download_qualities: bool = False, # noqa: FBT001, FBT002
421421
) -> list[OpenMLDataset]:
422422
"""Download datasets.
423423
@@ -450,14 +450,14 @@ def get_datasets(
450450

451451

452452
@openml.utils.thread_safe_if_oslo_installed
453-
def get_dataset( # noqa: C901, PLR0912, PLR0915
453+
def get_dataset( # noqa: C901, PLR0912
454454
dataset_id: int | str,
455-
download_data: bool | None = None, # Optional for deprecation warning; later again only bool
455+
download_data: bool = False, # noqa: FBT002, FBT001
456456
version: int | None = None,
457457
error_if_multiple: bool = False, # noqa: FBT002, FBT001
458458
cache_format: Literal["pickle", "feather"] = "pickle",
459-
download_qualities: bool | None = None, # Same as above
460-
download_features_meta_data: bool | None = None, # Same as above
459+
download_qualities: bool = False, # noqa: FBT002, FBT001
460+
download_features_meta_data: bool = False, # noqa: FBT002, FBT001
461461
download_all_files: bool = False, # noqa: FBT002, FBT001
462462
force_refresh_cache: bool = False, # noqa: FBT001, FBT002
463463
) -> OpenMLDataset:
@@ -485,7 +485,7 @@ def get_dataset( # noqa: C901, PLR0912, PLR0915
485485
----------
486486
dataset_id : int or str
487487
Dataset ID of the dataset to download
488-
download_data : bool (default=True)
488+
download_data : bool (default=False)
489489
If True, also download the data file. Beware that some datasets are large and it might
490490
make the operation noticeably slower. Metadata is also still retrieved.
491491
If False, create the OpenMLDataset and only populate it with the metadata.
@@ -499,12 +499,12 @@ def get_dataset( # noqa: C901, PLR0912, PLR0915
499499
Format for caching the dataset - may be feather or pickle
500500
Note that the default 'pickle' option may load slower than feather when
501501
no.of.rows is very high.
502-
download_qualities : bool (default=True)
502+
download_qualities : bool (default=False)
503503
Option to download 'qualities' meta-data in addition to the minimal dataset description.
504504
If True, download and cache the qualities file.
505505
If False, create the OpenMLDataset without qualities metadata. The data may later be added
506506
to the OpenMLDataset through the `OpenMLDataset.load_metadata(qualities=True)` method.
507-
download_features_meta_data : bool (default=True)
507+
download_features_meta_data : bool (default=False)
508508
Option to download 'features' meta-data in addition to the minimal dataset description.
509509
If True, download and cache the features file.
510510
If False, create the OpenMLDataset without features metadata. The data may later be added
@@ -523,28 +523,6 @@ def get_dataset( # noqa: C901, PLR0912, PLR0915
523523
dataset : :class:`openml.OpenMLDataset`
524524
The downloaded dataset.
525525
"""
526-
# TODO(0.15): Remove the deprecation warning and make the default False; adjust types above
527-
# and documentation. Also remove None-to-True-cases below
528-
if any(
529-
download_flag is None
530-
for download_flag in [download_data, download_qualities, download_features_meta_data]
531-
):
532-
warnings.warn(
533-
"Starting from Version 0.15 `download_data`, `download_qualities`, and `download_featu"
534-
"res_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy "
535-
"loading. To disable this message until version 0.15 explicitly set `download_data`, "
536-
"`download_qualities`, and `download_features_meta_data` to a bool while calling "
537-
"`get_dataset`.",
538-
FutureWarning,
539-
stacklevel=2,
540-
)
541-
542-
download_data = True if download_data is None else download_data
543-
download_qualities = True if download_qualities is None else download_qualities
544-
download_features_meta_data = (
545-
True if download_features_meta_data is None else download_features_meta_data
546-
)
547-
548526
if download_all_files:
549527
warnings.warn(
550528
"``download_all_files`` is experimental and is likely to break with new releases.",

openml/testing.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ class TestBase(unittest.TestCase):
5656
logger = logging.getLogger("unit_tests_published_entities")
5757
logger.setLevel(logging.DEBUG)
5858

59-
def setUp(self, n_levels: int = 1) -> None:
59+
def setUp(self, n_levels: int = 1, tmpdir_suffix: str = "") -> None:
6060
"""Setup variables and temporary directories.
6161
6262
In particular, this method:
@@ -92,7 +92,7 @@ def setUp(self, n_levels: int = 1) -> None:
9292
self.static_cache_dir = static_cache_dir
9393
self.cwd = Path.cwd()
9494
workdir = Path(__file__).parent.absolute()
95-
tmp_dir_name = self.id()
95+
tmp_dir_name = self.id() + tmpdir_suffix
9696
self.workdir = workdir / tmp_dir_name
9797
shutil.rmtree(self.workdir, ignore_errors=True)
9898

0 commit comments

Comments
 (0)