44import hashlib
55import logging
66import math
7- import pathlib
87import random
98import time
109import urllib .parse
1110import xml
1211import zipfile
12+ from pathlib import Path
1313from typing import Dict , Tuple , Union
1414
1515import minio
1616import requests
17+ import requests .utils
1718import xmltodict
1819from urllib3 import ProxyManager
1920
2728
2829DATA_TYPE = Dict [str , Union [str , int ]]
2930FILE_ELEMENTS_TYPE = Dict [str , Union [str , Tuple [str , str ]]]
31+ DATABASE_CONNECTION_ERRCODE = 107
32+
33+
def _robot_delay(n: int) -> float:
    """Return the number of seconds to wait before the next retry (robot policy).

    The base wait follows a logistic curve in ``n`` scaled to at most 60 seconds.
    Gaussian noise with a standard deviation of one tenth of the base wait is
    added, and the result is clamped so it is never below one second.

    Parameters
    ----------
    n : int
        The retry counter of the attempt that just failed.

    Returns
    -------
    float
        The delay in seconds, always at least ``1.0``.
    """
    # Logistic growth: short waits for the first retries, saturating near 60s.
    wait = (1 / (1 + math.exp(-(n * 0.5 - 4)))) * 60
    # Random variation proportional to the base wait.
    variation = random.gauss(0, wait / 10)
    return max(1.0, wait + variation)
38+
39+
def _human_delay(n: int) -> float:
    """Return the number of seconds to wait before the next retry (human policy).

    The delay grows linearly with the retry counter and is never below one second.

    Parameters
    ----------
    n : int
        The retry counter of the attempt that just failed.

    Returns
    -------
    float
        The delay in seconds, always at least ``1.0``.
    """
    # Coerce to float so the return value matches the annotation even when n > 1
    # (``max(1.0, n)`` would otherwise hand back the int unchanged).
    return max(1.0, float(n))
3042
3143
3244def resolve_env_proxies (url : str ) -> str | None :
@@ -46,7 +58,7 @@ def resolve_env_proxies(url: str) -> str | None:
4658 The proxy url if found, else None
4759 """
4860 resolved_proxies = requests .utils .get_environ_proxies (url )
49- return requests .utils .select_proxy (url , resolved_proxies )
61+ return requests .utils .select_proxy (url , resolved_proxies ) # type: ignore
5062
5163
5264def _create_url_from_endpoint (endpoint : str ) -> str :
@@ -111,17 +123,17 @@ def _perform_api_call(
111123
112124def _download_minio_file (
113125 source : str ,
114- destination : str | pathlib . Path ,
115- exists_ok : bool = True ,
126+ destination : str | Path ,
127+ exists_ok : bool = True , # noqa: FBT001, FBT002
116128 proxy : str | None = "auto" ,
117129) -> None :
118130 """Download file ``source`` from a MinIO Bucket and store it at ``destination``.
119131
120132 Parameters
121133 ----------
122- source : Union[ str, pathlib.Path]
134+ source : str
123135 URL to a file in a MinIO bucket.
124- destination : str
136+ destination : str | Path
125137 Path to store the file to, if a directory is provided the original filename is used.
126138 exists_ok : bool, optional (default=True)
127139 If False, raise FileExists if a file already exists in ``destination``.
@@ -130,13 +142,13 @@ def _download_minio_file(
130142 automatically find the proxy to use. Pass None or the environment variable
131143 ``no_proxy="*"`` to disable proxies.
132144 """
133- destination = pathlib . Path (destination )
145+ destination = Path (destination )
134146 parsed_url = urllib .parse .urlparse (source )
135147
136148 # expect path format: /BUCKET/path/to/file.ext
137149 bucket , object_name = parsed_url .path [1 :].split ("/" , maxsplit = 1 )
138150 if destination .is_dir ():
139- destination = pathlib . Path (destination , object_name )
151+ destination = Path (destination , object_name )
140152 if destination .is_file () and not exists_ok :
141153 raise FileExistsError (f"File already exists in { destination } ." )
142154
@@ -158,30 +170,26 @@ def _download_minio_file(
158170 zip_ref .extractall (destination .parent )
159171
160172 except minio .error .S3Error as e :
161- if e .message .startswith ("Object does not exist" ):
173+ if e .message is not None and e . message .startswith ("Object does not exist" ):
162174 raise FileNotFoundError (f"Object at '{ source } ' does not exist." ) from e
163175 # e.g. permission error, or a bucket does not exist (which is also interpreted as a
164176 # permission error on minio level).
165177 raise FileNotFoundError ("Bucket does not exist or is private." ) from e
166178
167179
168- def _download_minio_bucket (
169- source : str ,
170- destination : str | pathlib .Path ,
171- exists_ok : bool = True ,
172- ) -> None :
180+ def _download_minio_bucket (source : str , destination : str | Path ) -> None :
173181 """Download all files in the MinIO bucket ``source`` and store them in ``destination``.
174182
175183 Parameters
176184 ----------
177- source : Union[ str, pathlib.Path]
185+ source : str
178186 URL to a MinIO bucket.
179- destination : str
187+ destination : str | Path
180188 Path to a directory to store the bucket content in.
181189 exists_ok : bool, optional (default=True)
182190 If False, raise FileExists if a file already exists in ``destination``.
183191 """
184- destination = pathlib . Path (destination )
192+ destination = Path (destination )
185193 parsed_url = urllib .parse .urlparse (source )
186194
187195 # expect path format: /BUCKET/path/to/file.ext
@@ -190,18 +198,21 @@ def _download_minio_bucket(
190198 client = minio .Minio (endpoint = parsed_url .netloc , secure = False )
191199
192200 for file_object in client .list_objects (bucket , recursive = True ):
201+ if file_object .object_name is None :
202+ raise ValueError ("Object name is None." )
203+
193204 _download_minio_file (
194205 source = source + "/" + file_object .object_name ,
195- destination = pathlib . Path (destination , file_object .object_name ),
206+ destination = Path (destination , file_object .object_name ),
196207 exists_ok = True ,
197208 )
198209
199210
200211def _download_text_file (
201212 source : str ,
202- output_path : str | None = None ,
213+ output_path : str | Path | None = None ,
203214 md5_checksum : str | None = None ,
204- exists_ok : bool = True ,
215+ exists_ok : bool = True , # noqa: FBT001, FBT002
205216 encoding : str = "utf8" ,
206217) -> str | None :
207218 """Download the text file at `source` and store it in `output_path`.
@@ -213,7 +224,7 @@ def _download_text_file(
213224 ----------
214225 source : str
215226 url of the file to be downloaded
216- output_path : str, (optional )
227+ output_path : str | Path | None (default=None )
217228 full path, including filename, of where the file should be stored. If ``None``,
218229 this function returns the downloaded file as string.
219230 md5_checksum : str, optional (default=None)
@@ -223,15 +234,14 @@ def _download_text_file(
223234 encoding : str, optional (default='utf8')
224235 The encoding with which the file should be stored.
225236 """
226- if output_path is not None :
227- try :
228- with open (output_path , encoding = encoding ):
229- if exists_ok :
230- return None
231- else :
232- raise FileExistsError
233- except FileNotFoundError :
234- pass
237+ if isinstance (output_path , str ):
238+ output_path = Path (output_path )
239+
240+ if output_path is not None and output_path .exists ():
241+ if not exists_ok :
242+ raise FileExistsError
243+
244+ return None
235245
236246 logging .info ("Starting [%s] request for the URL %s" , "get" , source )
237247 start = time .time ()
@@ -247,28 +257,25 @@ def _download_text_file(
247257 )
248258 return downloaded_file
249259
250- else :
251- with open (output_path , "w" , encoding = encoding ) as fh :
252- fh .write (downloaded_file )
260+ with output_path .open ("w" , encoding = encoding ) as fh :
261+ fh .write (downloaded_file )
253262
254- logging .info (
255- "%.7fs taken for [%s] request for the URL %s" ,
256- time .time () - start ,
257- "get" ,
258- source ,
259- )
260-
261- del downloaded_file
262- return None
263+ logging .info (
264+ "%.7fs taken for [%s] request for the URL %s" ,
265+ time .time () - start ,
266+ "get" ,
267+ source ,
268+ )
269+ return None
263270
264271
def _file_id_to_url(file_id: int, filename: str | None = None) -> str:
    """Build the download URL for a given file id.

    Parameters
    ----------
    file_id : int
        The id of the file to download.
    filename : str, optional (default=None)
        If given, it is appended to the URL as a final path segment.

    Returns
    -------
    str
        The part of ``config.server`` before ``"/api/"``, followed by
        ``/data/download/<file_id>`` and, optionally, ``/<filename>``.
    """
    # Only the part of the configured server URL before "/api/" is used as base.
    base_url = config.server.split("/api/")[0]
    url = f"{base_url}/data/download/{file_id!s}"
    if filename is not None:
        url += "/" + filename
    return url
@@ -316,13 +323,13 @@ def __read_url(
316323def __is_checksum_equal (downloaded_file_binary : bytes , md5_checksum : str | None = None ) -> bool :
317324 if md5_checksum is None :
318325 return True
319- md5 = hashlib .md5 ()
326+ md5 = hashlib .md5 () # noqa: S324
320327 md5 .update (downloaded_file_binary )
321328 md5_checksum_download = md5 .hexdigest ()
322329 return md5_checksum == md5_checksum_download
323330
324331
325- def _send_request (
332+ def _send_request ( # noqa: C901
326333 request_method : str ,
327334 url : str ,
328335 data : DATA_TYPE ,
@@ -331,7 +338,9 @@ def _send_request(
331338) -> requests .Response :
332339 n_retries = max (1 , config .connection_n_retries )
333340
334- response : requests .Response
341+ response : requests .Response | None = None
342+ delay_method = _human_delay if config .retry_policy == "human" else _robot_delay
343+
335344 with requests .Session () as session :
336345 # Start at one to have a non-zero multiplier for the sleep
337346 for retry_counter in range (1 , n_retries + 1 ):
@@ -344,10 +353,11 @@ def _send_request(
344353 response = session .post (url , data = data , files = files )
345354 else :
346355 raise NotImplementedError ()
356+
347357 __check_response (response = response , url = url , file_elements = files )
358+
348359 if request_method == "get" and not __is_checksum_equal (
349- response .text .encode ("utf-8" ),
350- md5_checksum ,
360+ response .text .encode ("utf-8" ), md5_checksum
351361 ):
352362 # -- Check if encoding is not UTF-8 perhaps
353363 if __is_checksum_equal (response .content , md5_checksum ):
@@ -365,41 +375,44 @@ def _send_request(
365375 "Checksum of downloaded file is unequal to the expected checksum {} "
366376 "when downloading {}." .format (md5_checksum , url ),
367377 )
368- break
378+
379+ return response
380+ except OpenMLServerException as e :
381+ # Propagate all server errors to the calling functions, except
382+ # for 107 which represents a database connection error.
383+ # These are typically caused by high server load,
384+ # which means trying again might resolve the issue.
385+ if e .code != DATABASE_CONNECTION_ERRCODE :
386+ raise e
387+
388+ delay = delay_method (retry_counter )
389+ time .sleep (delay )
390+
391+ except xml .parsers .expat .ExpatError as e :
392+ if request_method != "get" or retry_counter >= n_retries :
393+ if response is not None :
394+ extra = f"Status code: { response .status_code } \n { response .text } "
395+ else :
396+ extra = "No response retrieved."
397+
398+ raise OpenMLServerError (
399+ f"Unexpected server error when calling { url } . Please contact the "
400+ f"developers!\n { extra } "
401+ ) from e
402+
403+ delay = delay_method (retry_counter )
404+ time .sleep (delay )
405+
369406 except (
370407 requests .exceptions .ChunkedEncodingError ,
371408 requests .exceptions .ConnectionError ,
372409 requests .exceptions .SSLError ,
373- OpenMLServerException ,
374- xml .parsers .expat .ExpatError ,
375410 OpenMLHashException ,
376- ) as e :
377- if isinstance (e , OpenMLServerException ) and e .code != 107 :
378- # Propagate all server errors to the calling functions, except
379- # for 107 which represents a database connection error.
380- # These are typically caused by high server load,
381- # which means trying again might resolve the issue.
382- raise
383- elif isinstance (e , xml .parsers .expat .ExpatError ):
384- if request_method != "get" or retry_counter >= n_retries :
385- raise OpenMLServerError (
386- f"Unexpected server error when calling { url } . Please contact the "
387- f"developers!\n Status code: { response .status_code } \n { response .text } " ,
388- )
389- if retry_counter >= n_retries :
390- raise
391- else :
411+ ):
412+ delay = delay_method (retry_counter )
413+ time .sleep (delay )
392414
393- def robot (n : int ) -> float :
394- wait = (1 / (1 + math .exp (- (n * 0.5 - 4 )))) * 60
395- variation = random .gauss (0 , wait / 10 )
396- return max (1.0 , wait + variation )
397-
398- def human (n : int ) -> float :
399- return max (1.0 , n )
400-
401- delay = {"human" : human , "robot" : robot }[config .retry_policy ](retry_counter )
402- time .sleep (delay )
415+ assert response is not None
403416 return response
404417
405418
@@ -410,9 +423,7 @@ def __check_response(
410423) -> None :
411424 if response .status_code != 200 :
412425 raise __parse_server_exception (response , url , file_elements = file_elements )
413- elif (
414- "Content-Encoding" not in response .headers or response .headers ["Content-Encoding" ] != "gzip"
415- ):
426+ if "Content-Encoding" not in response .headers or response .headers ["Content-Encoding" ] != "gzip" :
416427 logging .warning (f"Received uncompressed content from OpenML for { url } ." )
417428
418429
@@ -423,17 +434,18 @@ def __parse_server_exception(
423434) -> OpenMLServerError :
424435 if response .status_code == 414 :
425436 raise OpenMLServerError (f"URI too long! ({ url } )" )
437+
426438 try :
427439 server_exception = xmltodict .parse (response .text )
428- except xml .parsers .expat .ExpatError :
429- raise
430- except Exception :
440+ except xml .parsers .expat .ExpatError as e :
441+ raise e
442+ except Exception as e : # noqa: BLE001
431443 # OpenML has a sophisticated error system
432444 # where information about failures is provided. try to parse this
433445 raise OpenMLServerError (
434446 f"Unexpected server error when calling { url } . Please contact the developers!\n "
435447 f"Status code: { response .status_code } \n { response .text } " ,
436- )
448+ ) from e
437449
438450 server_error = server_exception ["oml:error" ]
439451 code = int (server_error ["oml:code" ])
0 commit comments