Skip to content

Commit 935cd16

Browse files
bug: Add 403 error fallback handler for dataset downloads (#702)
* feat: Add 403 error fallback handler for dataset downloads
  - Implement a fallback mechanism with browser-like headers for 403 errors
  - Add HTTPError to the imports from requests.exceptions
  - Add urlparse for URL parsing in the fallback logic
  - Include comprehensive test cases for the 403 fallback success and failure scenarios
* fix
* Refactored the method for handling dataset downloads with improved fallback handling for HTTP 403 errors. Added `FALLBACK_HEADERS` to `constants.py` and updated `helpers.py` to use these headers for retries.
* test refactoring
* test fix
1 parent 0274fa0 commit 935cd16

File tree

3 files changed

+120
-48
lines changed

3 files changed

+120
-48
lines changed

tools/constants.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,3 +99,20 @@
9999
LOAD_FUNC = "load_func"
100100
ZIP = "zip"
101101
JSON = "json"
102+
103+
# Browser-like request headers. download_dataset() sends these on its second
# attempt after the first request was rejected with HTTP 403.
FALLBACK_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/132.0.0.0 Safari/537.36"
    ),
    # A dict literal keeps only the LAST value given for a repeated key, so
    # "Accept" is listed exactly once. "application/zip" is the value that was
    # actually being sent before the duplicate entry was removed.
    "Accept": "application/zip",
    "Accept-Language": "en-US,en;q=0.9",
    # NOTE(review): advertising "br" lets a server answer with Brotli; the HTTP
    # client presumably only decodes that when a brotli package is installed --
    # confirm, or drop "br" if downloads ever arrive still compressed.
    "Accept-Encoding": "gzip, deflate, br",
    "Connection": "keep-alive",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "same-origin",
}

tools/helpers.py

Lines changed: 49 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import datetime
44
import gtfs_kit
55
import requests
6-
from requests.exceptions import RequestException
6+
from requests.exceptions import RequestException, HTTPError
77
import pandas as pd
88
from pandas.errors import ParserError
99
from unidecode import unidecode
@@ -14,7 +14,10 @@
1414
MDB_ARCHIVES_LATEST_URL_TEMPLATE,
1515
MDB_SOURCE_FILENAME,
1616
ZIP,
17+
FALLBACK_HEADERS,
18+
1719
)
20+
from urllib.parse import urlparse
1821

1922
#########################
2023
# I/O FUNCTIONS
@@ -55,7 +58,7 @@ def to_csv(path, catalog, columns):
5558
"""
5659
Save a catalog to a CSV file.
5760
58-
This function normalizes a catalog, optionally filters it by specified columns,
61+
This function normalizes a catalog, optionally filters it by specified columns,
5962
and saves it to a CSV file at the given path.
6063
6164
Args:
@@ -72,52 +75,56 @@ def to_csv(path, catalog, columns):
7275
catalog.to_csv(path, sep=",", index=False)
7376

7477

75-
def download_dataset(
76-
url, authentication_type, api_key_parameter_name, api_key_parameter_value
77-
):
78+
def download_dataset(url, authentication_type, api_key_parameter_name=None, api_key_parameter_value=None):
79+
"""
80+
Downloads a dataset from the given URL using specified authentication mechanisms.
81+
The method performs a request to the URL with API key passed as either a query
82+
parameter or a header, based on the chosen authentication type. If the download
83+
fails with certain 403 errors, a fallback request with alternative headers is attempted.
84+
It writes the dataset contents to a temporary file and returns the file path.
85+
86+
:param url: The dataset's source URL.
87+
:type url: str
88+
:param authentication_type: The type of authentication mechanism to use (e.g.,
89+
1 for parameter-based, 2 for header-based).
90+
:type authentication_type: int
91+
:param api_key_parameter_name: The name of the API key parameter/header. It is
92+
optional if the dataset is publicly accessible or no authentication is
93+
required.
94+
:type api_key_parameter_name: str, optional
95+
:param api_key_parameter_value: The value of the API key to authenticate the
96+
request. It is optional if no authentication is required.
97+
:type api_key_parameter_value: str, optional
98+
:return: The file path where the downloaded dataset is temporarily stored.
99+
:type return: str
100+
:raises RequestException: If all attempts to download the dataset fail.
78101
"""
79-
Download a dataset from a given URL with optional authentication.
80-
81-
This function downloads a dataset from the specified URL using optional
82-
API key authentication and saves it to a file in the current working directory.
83-
84-
Args:
85-
url (str): The URL of the dataset to download.
86-
authentication_type (int): The type of authentication to use.
87-
0: No authentication.
88-
1: API key as a query parameter.
89-
2: API key as a header.
90-
api_key_parameter_name (str, optional): The name of the API key parameter.
91-
api_key_parameter_value (str, optional): The value of the API key.
92102

93-
Returns:
94-
str: The path to the downloaded file.
103+
def make_request(url, params=None, headers=None):
    """
    Perform a single GET request and return the response body.

    :param url: The URL to fetch.
    :type url: str
    :param params: Query-string parameters forwarded to ``requests.get``.
    :type params: dict, optional
    :param headers: HTTP headers forwarded to ``requests.get``.
    :type headers: dict, optional
    :return: The raw response body, or ``None`` when the server answered
        with HTTP 403 so that the caller can retry with other headers.
    :raises RequestException: For any HTTP error other than 403 and for
        any other request failure.
    """
    try:
        response = requests.get(url, params=params, headers=headers, allow_redirects=True)
        response.raise_for_status()
    except HTTPError as e:
        # Only a 403 is reported as "no result"; the caller uses that to
        # decide whether a retry with alternative headers is worthwhile.
        if e.response is not None and e.response.status_code == 403:
            return None
        # Every other HTTP error must be raised. Returning the exception
        # object would hand the caller a truthy non-bytes value that it
        # would then try to write to the output file.
        raise RequestException(f"HTTP error {e} when accessing {url}.") from e
    except RequestException as e:
        raise RequestException(f"Request failed: {e}") from e
    return response.content
95113

96-
Raises:
97-
RequestException: If an error occurs during the download process.
98-
"""
114+
file_path = os.path.join(os.getcwd(), str(uuid.uuid4()))
99115

100-
file_name = str(uuid.uuid4())
101-
file_path = os.path.join(os.getcwd(), file_name)
116+
params = {api_key_parameter_name: api_key_parameter_value} if authentication_type == 1 else None
117+
headers = {api_key_parameter_name: api_key_parameter_value} if authentication_type == 2 else None
102118

103-
params = {}
104-
headers = {}
105-
if authentication_type == 1:
106-
params[api_key_parameter_name] = api_key_parameter_value
107-
elif authentication_type == 2:
108-
headers[api_key_parameter_name] = api_key_parameter_value
119+
zip_file = make_request(url, params, headers) or (
120+
make_request(url, params, {**FALLBACK_HEADERS, **(headers or {}),
121+
"Referer": f"{urlparse(url).scheme}://{urlparse(url).netloc}/",
122+
"Host": urlparse(url).netloc})
123+
)
109124

110-
try:
111-
zip_file_req = requests.get(
112-
url, params=params, headers=headers, allow_redirects=True
113-
)
114-
zip_file_req.raise_for_status()
115-
except RequestException as e:
116-
raise RequestException(
117-
f"FAILURE! Exception {e} occurred when downloading URL {url}.\n"
118-
)
125+
if zip_file is None:
126+
raise RequestException(f"FAILURE! Retry attempts failed for {url}.")
119127

120-
zip_file = zip_file_req.content
121128
with open(file_path, "wb") as f:
122129
f.write(zip_file)
123130

@@ -186,7 +193,7 @@ def are_overlapping_edges(
186193
filter_maximum (float): The maximum coordinate of the filter edge.
187194
188195
Returns:
189-
bool: True if the two edges are overlapping, False otherwise.
196+
bool: True if the two edges are overlapping, False otherwise.
190197
Returns False if one or more coordinates are None.
191198
"""
192199
return (

tools/tests/test_helpers.py

Lines changed: 54 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
)
2121
import pandas as pd
2222
from freezegun import freeze_time
23+
from requests.exceptions import HTTPError
2324

2425

2526
class TestVerificationFunctions(TestCase):
@@ -408,8 +409,8 @@ def test_download_dataset_auth_type_empty(
408409
api_key_parameter_value=test_api_key_parameter_value,
409410
)
410411
self.assertEqual(under_test, self.test_path)
411-
self.assertEqual(mock_requests.call_args.kwargs["params"], {})
412-
self.assertEqual(mock_requests.call_args.kwargs["headers"], {})
412+
self.assertEqual(mock_requests.call_args.kwargs["params"], None)
413+
self.assertEqual(mock_requests.call_args.kwargs["headers"], None)
413414
mock_requests.assert_called_once()
414415
mock_os.path.join.assert_called_once()
415416
mock_os.getcwd.assert_called_once()
@@ -434,8 +435,8 @@ def test_download_dataset_auth_type_0(
434435
api_key_parameter_value=test_api_key_parameter_value,
435436
)
436437
self.assertEqual(under_test, self.test_path)
437-
self.assertEqual(mock_requests.call_args.kwargs["params"], {})
438-
self.assertEqual(mock_requests.call_args.kwargs["headers"], {})
438+
self.assertEqual(mock_requests.call_args.kwargs["params"], None)
439+
self.assertEqual(mock_requests.call_args.kwargs["headers"], None)
439440
mock_requests.assert_called_once()
440441
mock_os.path.join.assert_called_once()
441442
mock_os.getcwd.assert_called_once()
@@ -464,7 +465,7 @@ def test_download_dataset_auth_type_1(
464465
mock_requests.call_args.kwargs["params"],
465466
{test_api_key_parameter_name: test_api_key_parameter_value},
466467
)
467-
self.assertEqual(mock_requests.call_args.kwargs["headers"], {})
468+
self.assertEqual(mock_requests.call_args.kwargs["headers"], None)
468469
mock_requests.assert_called_once()
469470
mock_os.path.join.assert_called_once()
470471
mock_os.getcwd.assert_called_once()
@@ -489,7 +490,7 @@ def test_download_dataset_auth_type_2(
489490
api_key_parameter_value=test_api_key_parameter_value,
490491
)
491492
self.assertEqual(under_test, self.test_path)
492-
self.assertEqual(mock_requests.call_args.kwargs["params"], {})
493+
self.assertEqual(mock_requests.call_args.kwargs["params"], None)
493494
self.assertEqual(
494495
mock_requests.call_args.kwargs["headers"],
495496
{test_api_key_parameter_name: test_api_key_parameter_value},
@@ -520,3 +521,50 @@ def test_download_dataset_exception(
520521
api_key_parameter_name=test_api_key_parameter_name,
521522
api_key_parameter_value=test_api_key_parameter_value,
522523
)
524+
525+
@patch("tools.helpers.open")
@patch("tools.helpers.uuid.uuid4")
@patch("tools.helpers.os")
@patch("tools.helpers.requests.get")
def test_download_dataset_403_fallback_success(self, mock_requests, mock_os, mock_uuid4, mock_open):
    """A 403 on the first request is retried once with fallback headers and succeeds."""
    # First response: raise_for_status() raises an HTTPError carrying a 403.
    response_403 = Mock(status_code=403)
    response_403.raise_for_status.side_effect = HTTPError(response=response_403)

    # Second response: a plain 200 with a body.
    response_200 = Mock(status_code=200, content=b"file_content")

    mock_requests.side_effect = [response_403, response_200]
    mock_os.path.join.return_value = self.test_path

    under_test = download_dataset(
        url=self.test_url,
        authentication_type=0,
        api_key_parameter_name=None,
        api_key_parameter_value=None,
    )

    self.assertEqual(under_test, self.test_path)
    self.assertEqual(mock_requests.call_count, 2)

    # Counting calls alone would also pass if the retry reused the same
    # headers, so check that the second request really is the fallback one.
    first_call, fallback_call = mock_requests.call_args_list
    self.assertIsNone(first_call.kwargs["headers"])
    fallback_headers = fallback_call.kwargs["headers"]
    self.assertIn("User-Agent", fallback_headers)
    self.assertIn("Referer", fallback_headers)
544+
545+
@patch("tools.helpers.open")
@patch("tools.helpers.uuid.uuid4")
@patch("tools.helpers.os")
@patch("tools.helpers.requests.get")
def test_download_dataset_403_fallback_failure(self, mock_requests, mock_os, mock_uuid4, mock_open):
    """Two consecutive 403 responses make download_dataset raise and write nothing."""

    def forbidden_response():
        # A response whose raise_for_status() raises an HTTPError carrying a 403.
        response = Mock(status_code=403)
        response.raise_for_status.side_effect = HTTPError(response=response)
        return response

    # Both the initial request and the fallback request are rejected.
    mock_requests.side_effect = [forbidden_response(), forbidden_response()]
    mock_os.path.join.return_value = self.test_path

    with self.assertRaises(RequestException):
        download_dataset(
            url=self.test_url,
            authentication_type=0,
            api_key_parameter_name=None,
            api_key_parameter_value=None,
        )

    # Exactly one retry happened, the target path was computed once, and
    # no file was opened for writing.
    self.assertEqual(mock_requests.call_count, 2)
    mock_os.path.join.assert_called_once()
    mock_os.getcwd.assert_called_once()
    mock_uuid4.assert_called_once()
    mock_open.assert_not_called()

0 commit comments

Comments
 (0)