Skip to content

Commit 6433d5b

Browse files
LennartPurucker, eddiebergman, pre-commit-ci[bot], and Bilgecelik
authored
Linting Everything - Fix All mypy and ruff Errors (#1307)
* style: Fix linting split.py * typing: Fix mypy errors split.py * typing: data_feature * typing: trace * more linting fixes * typing: finish up trace * typing: config.py * typing: More fixes on config.py * typing: setup.py * finalize runs linting * typing: evaluation.py * typing: setup * ruff fixes across different files and mypy fixes for run files * typing: _api_calls * adjust setup files' linting and minor ruff changes * typing: utils * late night push * typing: utils.py * typing: tip tap tippity * typing: mypy 78, ruff ~200 * refactor output format name and minor linting stuff * other: midway merge * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * typing: I'm running out of good messages * typing: datasets * linting for flows and some ruff changes * no more mypy errors * ruff runs and setups * typing: Finish off mypy and ruff errors Co-authored-by: Bilgecelik <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * style: File wide ignores of PLR0913 This is because the automated pre-commit.ci bot which made automatic commits and pushes would think the `noqa` on the individually overloaded functions was not needed. After removing the `noqa`, the linter then raised the issue --------- Co-authored-by: eddiebergman <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Bilgecelik <[email protected]>
1 parent 1c660fb commit 6433d5b

30 files changed

+1968
-1373
lines changed

openml/__init__.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
"""
1717

1818
# License: BSD 3-Clause
19+
from __future__ import annotations
1920

2021
from . import (
2122
_api_calls,
@@ -49,7 +50,12 @@
4950
)
5051

5152

52-
def populate_cache(task_ids=None, dataset_ids=None, flow_ids=None, run_ids=None):
53+
def populate_cache(
54+
task_ids: list[int] | None = None,
55+
dataset_ids: list[int | str] | None = None,
56+
flow_ids: list[int] | None = None,
57+
run_ids: list[int] | None = None,
58+
) -> None:
5359
"""
5460
Populate a cache for offline and parallel usage of the OpenML connector.
5561

openml/_api_calls.py

Lines changed: 97 additions & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -4,16 +4,17 @@
44
import hashlib
55
import logging
66
import math
7-
import pathlib
87
import random
98
import time
109
import urllib.parse
1110
import xml
1211
import zipfile
12+
from pathlib import Path
1313
from typing import Dict, Tuple, Union
1414

1515
import minio
1616
import requests
17+
import requests.utils
1718
import xmltodict
1819
from urllib3 import ProxyManager
1920

@@ -27,6 +28,17 @@
2728

2829
DATA_TYPE = Dict[str, Union[str, int]]
2930
FILE_ELEMENTS_TYPE = Dict[str, Union[str, Tuple[str, str]]]
31+
DATABASE_CONNECTION_ERRCODE = 107
32+
33+
34+
def _robot_delay(n: int) -> float:
35+
wait = (1 / (1 + math.exp(-(n * 0.5 - 4)))) * 60
36+
variation = random.gauss(0, wait / 10)
37+
return max(1.0, wait + variation)
38+
39+
40+
def _human_delay(n: int) -> float:
41+
return max(1.0, n)
3042

3143

3244
def resolve_env_proxies(url: str) -> str | None:
@@ -46,7 +58,7 @@ def resolve_env_proxies(url: str) -> str | None:
4658
The proxy url if found, else None
4759
"""
4860
resolved_proxies = requests.utils.get_environ_proxies(url)
49-
return requests.utils.select_proxy(url, resolved_proxies)
61+
return requests.utils.select_proxy(url, resolved_proxies) # type: ignore
5062

5163

5264
def _create_url_from_endpoint(endpoint: str) -> str:
@@ -111,17 +123,17 @@ def _perform_api_call(
111123

112124
def _download_minio_file(
113125
source: str,
114-
destination: str | pathlib.Path,
115-
exists_ok: bool = True,
126+
destination: str | Path,
127+
exists_ok: bool = True, # noqa: FBT001, FBT002
116128
proxy: str | None = "auto",
117129
) -> None:
118130
"""Download file ``source`` from a MinIO Bucket and store it at ``destination``.
119131
120132
Parameters
121133
----------
122-
source : Union[str, pathlib.Path]
134+
source : str
123135
URL to a file in a MinIO bucket.
124-
destination : str
136+
destination : str | Path
125137
Path to store the file to, if a directory is provided the original filename is used.
126138
exists_ok : bool, optional (default=True)
127139
If False, raise FileExists if a file already exists in ``destination``.
@@ -130,13 +142,13 @@ def _download_minio_file(
130142
automatically find the proxy to use. Pass None or the environment variable
131143
``no_proxy="*"`` to disable proxies.
132144
"""
133-
destination = pathlib.Path(destination)
145+
destination = Path(destination)
134146
parsed_url = urllib.parse.urlparse(source)
135147

136148
# expect path format: /BUCKET/path/to/file.ext
137149
bucket, object_name = parsed_url.path[1:].split("/", maxsplit=1)
138150
if destination.is_dir():
139-
destination = pathlib.Path(destination, object_name)
151+
destination = Path(destination, object_name)
140152
if destination.is_file() and not exists_ok:
141153
raise FileExistsError(f"File already exists in {destination}.")
142154

@@ -158,30 +170,26 @@ def _download_minio_file(
158170
zip_ref.extractall(destination.parent)
159171

160172
except minio.error.S3Error as e:
161-
if e.message.startswith("Object does not exist"):
173+
if e.message is not None and e.message.startswith("Object does not exist"):
162174
raise FileNotFoundError(f"Object at '{source}' does not exist.") from e
163175
# e.g. permission error, or a bucket does not exist (which is also interpreted as a
164176
# permission error on minio level).
165177
raise FileNotFoundError("Bucket does not exist or is private.") from e
166178

167179

168-
def _download_minio_bucket(
169-
source: str,
170-
destination: str | pathlib.Path,
171-
exists_ok: bool = True,
172-
) -> None:
180+
def _download_minio_bucket(source: str, destination: str | Path) -> None:
173181
"""Download file ``source`` from a MinIO Bucket and store it at ``destination``.
174182
175183
Parameters
176184
----------
177-
source : Union[str, pathlib.Path]
185+
source : str
178186
URL to a MinIO bucket.
179-
destination : str
187+
destination : str | Path
180188
Path to a directory to store the bucket content in.
181189
exists_ok : bool, optional (default=True)
182190
If False, raise FileExists if a file already exists in ``destination``.
183191
"""
184-
destination = pathlib.Path(destination)
192+
destination = Path(destination)
185193
parsed_url = urllib.parse.urlparse(source)
186194

187195
# expect path format: /BUCKET/path/to/file.ext
@@ -190,18 +198,21 @@ def _download_minio_bucket(
190198
client = minio.Minio(endpoint=parsed_url.netloc, secure=False)
191199

192200
for file_object in client.list_objects(bucket, recursive=True):
201+
if file_object.object_name is None:
202+
raise ValueError("Object name is None.")
203+
193204
_download_minio_file(
194205
source=source + "/" + file_object.object_name,
195-
destination=pathlib.Path(destination, file_object.object_name),
206+
destination=Path(destination, file_object.object_name),
196207
exists_ok=True,
197208
)
198209

199210

200211
def _download_text_file(
201212
source: str,
202-
output_path: str | None = None,
213+
output_path: str | Path | None = None,
203214
md5_checksum: str | None = None,
204-
exists_ok: bool = True,
215+
exists_ok: bool = True, # noqa: FBT001, FBT002
205216
encoding: str = "utf8",
206217
) -> str | None:
207218
"""Download the text file at `source` and store it in `output_path`.
@@ -213,7 +224,7 @@ def _download_text_file(
213224
----------
214225
source : str
215226
url of the file to be downloaded
216-
output_path : str, (optional)
227+
output_path : str | Path | None (default=None)
217228
full path, including filename, of where the file should be stored. If ``None``,
218229
this function returns the downloaded file as string.
219230
md5_checksum : str, optional (default=None)
@@ -223,15 +234,14 @@ def _download_text_file(
223234
encoding : str, optional (default='utf8')
224235
The encoding with which the file should be stored.
225236
"""
226-
if output_path is not None:
227-
try:
228-
with open(output_path, encoding=encoding):
229-
if exists_ok:
230-
return None
231-
else:
232-
raise FileExistsError
233-
except FileNotFoundError:
234-
pass
237+
if isinstance(output_path, str):
238+
output_path = Path(output_path)
239+
240+
if output_path is not None and output_path.exists():
241+
if not exists_ok:
242+
raise FileExistsError
243+
244+
return None
235245

236246
logging.info("Starting [%s] request for the URL %s", "get", source)
237247
start = time.time()
@@ -247,28 +257,25 @@ def _download_text_file(
247257
)
248258
return downloaded_file
249259

250-
else:
251-
with open(output_path, "w", encoding=encoding) as fh:
252-
fh.write(downloaded_file)
260+
with output_path.open("w", encoding=encoding) as fh:
261+
fh.write(downloaded_file)
253262

254-
logging.info(
255-
"%.7fs taken for [%s] request for the URL %s",
256-
time.time() - start,
257-
"get",
258-
source,
259-
)
260-
261-
del downloaded_file
262-
return None
263+
logging.info(
264+
"%.7fs taken for [%s] request for the URL %s",
265+
time.time() - start,
266+
"get",
267+
source,
268+
)
269+
return None
263270

264271

265-
def _file_id_to_url(file_id: str, filename: str | None = None) -> str:
272+
def _file_id_to_url(file_id: int, filename: str | None = None) -> str:
266273
"""
267274
Presents the URL how to download a given file id
268275
filename is optional
269276
"""
270277
openml_url = config.server.split("/api/")
271-
url = openml_url[0] + "/data/download/%s" % file_id
278+
url = openml_url[0] + f"/data/download/{file_id!s}"
272279
if filename is not None:
273280
url += "/" + filename
274281
return url
@@ -316,13 +323,13 @@ def __read_url(
316323
def __is_checksum_equal(downloaded_file_binary: bytes, md5_checksum: str | None = None) -> bool:
317324
if md5_checksum is None:
318325
return True
319-
md5 = hashlib.md5()
326+
md5 = hashlib.md5() # noqa: S324
320327
md5.update(downloaded_file_binary)
321328
md5_checksum_download = md5.hexdigest()
322329
return md5_checksum == md5_checksum_download
323330

324331

325-
def _send_request(
332+
def _send_request( # noqa: C901
326333
request_method: str,
327334
url: str,
328335
data: DATA_TYPE,
@@ -331,7 +338,9 @@ def _send_request(
331338
) -> requests.Response:
332339
n_retries = max(1, config.connection_n_retries)
333340

334-
response: requests.Response
341+
response: requests.Response | None = None
342+
delay_method = _human_delay if config.retry_policy == "human" else _robot_delay
343+
335344
with requests.Session() as session:
336345
# Start at one to have a non-zero multiplier for the sleep
337346
for retry_counter in range(1, n_retries + 1):
@@ -344,10 +353,11 @@ def _send_request(
344353
response = session.post(url, data=data, files=files)
345354
else:
346355
raise NotImplementedError()
356+
347357
__check_response(response=response, url=url, file_elements=files)
358+
348359
if request_method == "get" and not __is_checksum_equal(
349-
response.text.encode("utf-8"),
350-
md5_checksum,
360+
response.text.encode("utf-8"), md5_checksum
351361
):
352362
# -- Check if encoding is not UTF-8 perhaps
353363
if __is_checksum_equal(response.content, md5_checksum):
@@ -365,41 +375,44 @@ def _send_request(
365375
"Checksum of downloaded file is unequal to the expected checksum {} "
366376
"when downloading {}.".format(md5_checksum, url),
367377
)
368-
break
378+
379+
return response
380+
except OpenMLServerException as e:
381+
# Propagate all server errors to the calling functions, except
382+
# for 107 which represents a database connection error.
383+
# These are typically caused by high server load,
384+
# which means trying again might resolve the issue.
385+
if e.code != DATABASE_CONNECTION_ERRCODE:
386+
raise e
387+
388+
delay = delay_method(retry_counter)
389+
time.sleep(delay)
390+
391+
except xml.parsers.expat.ExpatError as e:
392+
if request_method != "get" or retry_counter >= n_retries:
393+
if response is not None:
394+
extra = f"Status code: {response.status_code}\n{response.text}"
395+
else:
396+
extra = "No response retrieved."
397+
398+
raise OpenMLServerError(
399+
f"Unexpected server error when calling {url}. Please contact the "
400+
f"developers!\n{extra}"
401+
) from e
402+
403+
delay = delay_method(retry_counter)
404+
time.sleep(delay)
405+
369406
except (
370407
requests.exceptions.ChunkedEncodingError,
371408
requests.exceptions.ConnectionError,
372409
requests.exceptions.SSLError,
373-
OpenMLServerException,
374-
xml.parsers.expat.ExpatError,
375410
OpenMLHashException,
376-
) as e:
377-
if isinstance(e, OpenMLServerException) and e.code != 107:
378-
# Propagate all server errors to the calling functions, except
379-
# for 107 which represents a database connection error.
380-
# These are typically caused by high server load,
381-
# which means trying again might resolve the issue.
382-
raise
383-
elif isinstance(e, xml.parsers.expat.ExpatError):
384-
if request_method != "get" or retry_counter >= n_retries:
385-
raise OpenMLServerError(
386-
f"Unexpected server error when calling {url}. Please contact the "
387-
f"developers!\nStatus code: {response.status_code}\n{response.text}",
388-
)
389-
if retry_counter >= n_retries:
390-
raise
391-
else:
411+
):
412+
delay = delay_method(retry_counter)
413+
time.sleep(delay)
392414

393-
def robot(n: int) -> float:
394-
wait = (1 / (1 + math.exp(-(n * 0.5 - 4)))) * 60
395-
variation = random.gauss(0, wait / 10)
396-
return max(1.0, wait + variation)
397-
398-
def human(n: int) -> float:
399-
return max(1.0, n)
400-
401-
delay = {"human": human, "robot": robot}[config.retry_policy](retry_counter)
402-
time.sleep(delay)
415+
assert response is not None
403416
return response
404417

405418

@@ -410,9 +423,7 @@ def __check_response(
410423
) -> None:
411424
if response.status_code != 200:
412425
raise __parse_server_exception(response, url, file_elements=file_elements)
413-
elif (
414-
"Content-Encoding" not in response.headers or response.headers["Content-Encoding"] != "gzip"
415-
):
426+
if "Content-Encoding" not in response.headers or response.headers["Content-Encoding"] != "gzip":
416427
logging.warning(f"Received uncompressed content from OpenML for {url}.")
417428

418429

@@ -423,17 +434,18 @@ def __parse_server_exception(
423434
) -> OpenMLServerError:
424435
if response.status_code == 414:
425436
raise OpenMLServerError(f"URI too long! ({url})")
437+
426438
try:
427439
server_exception = xmltodict.parse(response.text)
428-
except xml.parsers.expat.ExpatError:
429-
raise
430-
except Exception:
440+
except xml.parsers.expat.ExpatError as e:
441+
raise e
442+
except Exception as e: # noqa: BLE001
431443
# OpenML has a sophisticated error system
432444
# where information about failures is provided. try to parse this
433445
raise OpenMLServerError(
434446
f"Unexpected server error when calling {url}. Please contact the developers!\n"
435447
f"Status code: {response.status_code}\n{response.text}",
436-
)
448+
) from e
437449

438450
server_error = server_exception["oml:error"]
439451
code = int(server_error["oml:code"])

0 commit comments

Comments
 (0)