3 files changed: 15 additions (+15) and 6 deletions (-6)
Original file line number | Diff line number | Diff line change
2323file_handler : logging .handlers .RotatingFileHandler | None = None
2424
2525OPENML_CACHE_DIR_ENV_VAR = "OPENML_CACHE_DIR"
26+ OPENML_SKIP_PARQUET_ENV_VAR = "OPENML_SKIP_PARQUET"
2627
2728
2829class _Config (TypedDict ):
Original file line number | Diff line number | Diff line change
33
44import gzip
55import logging
6+ import os
67import pickle
78import re
89import warnings
1718import xmltodict
1819
1920from openml .base import OpenMLBase
21+ from openml .config import OPENML_SKIP_PARQUET_ENV_VAR
2022from openml .exceptions import PyOpenMLError
2123
2224from .data_feature import OpenMLDataFeature
@@ -358,8 +360,10 @@ def _download_data(self) -> None:
358360 # import required here to avoid circular import.
359361 from .functions import _get_dataset_arff , _get_dataset_parquet
360362
361- if self ._parquet_url is not None :
362- self .parquet_file = str (_get_dataset_parquet (self ))
363+ skip_parquet = os .environ .get (OPENML_SKIP_PARQUET_ENV_VAR , "false" ).casefold () == "true"
364+ if self ._parquet_url is not None and not skip_parquet :
365+ parquet_file = _get_dataset_parquet (self )
366+ self .parquet_file = None if parquet_file is None else str (parquet_file )
363367 if self .parquet_file is None :
364368 self .data_file = str (_get_dataset_arff (self ))
365369
Original file line number | Diff line number | Diff line change
33from __future__ import annotations
44
55import logging
6+ import os
67import warnings
78from collections import OrderedDict
89from pathlib import Path
2021
2122import openml ._api_calls
2223import openml .utils
24+ from openml .config import OPENML_SKIP_PARQUET_ENV_VAR
2325from openml .exceptions import (
2426 OpenMLHashException ,
2527 OpenMLPrivateDatasetError ,
@@ -560,20 +562,22 @@ def get_dataset( # noqa: C901, PLR0912
560562 if download_qualities :
561563 qualities_file = _get_dataset_qualities_file (did_cache_dir , dataset_id )
562564
563- if "oml:parquet_url" in description and download_data :
565+ parquet_file = None
566+ skip_parquet = os .environ .get (OPENML_SKIP_PARQUET_ENV_VAR , "false" ).casefold () == "true"
567+ download_parquet = "oml:parquet_url" in description and not skip_parquet
568+ if download_parquet and (download_data or download_all_files ):
564569 try :
565570 parquet_file = _get_dataset_parquet (
566571 description ,
567572 download_all_files = download_all_files ,
568573 )
569574 except urllib3 .exceptions .MaxRetryError :
570575 parquet_file = None
571- else :
572- parquet_file = None
573576
574577 arff_file = None
575578 if parquet_file is None and download_data :
576- logger .warning ("Failed to download parquet, fallback on ARFF." )
579+ if download_parquet :
580+ logger .warning ("Failed to download parquet, fallback on ARFF." )
577581 arff_file = _get_dataset_arff (description )
578582
579583 remove_dataset_cache = False
You can’t perform that action at this time.
0 commit comments