33import datetime
44import gtfs_kit
55import requests
6- from requests .exceptions import RequestException
6+ from requests .exceptions import RequestException , HTTPError
77import pandas as pd
88from pandas .errors import ParserError
99from unidecode import unidecode
1414 MDB_ARCHIVES_LATEST_URL_TEMPLATE ,
1515 MDB_SOURCE_FILENAME ,
1616 ZIP ,
17+ FALLBACK_HEADERS ,
18+
1719)
20+ from urllib .parse import urlparse
1821
1922#########################
2023# I/O FUNCTIONS
@@ -55,7 +58,7 @@ def to_csv(path, catalog, columns):
5558 """
5659 Save a catalog to a CSV file.
5760
58- This function normalizes a catalog, optionally filters it by specified columns,
61+ This function normalizes a catalog, optionally filters it by specified columns,
5962 and saves it to a CSV file at the given path.
6063
6164 Args:
@@ -72,52 +75,56 @@ def to_csv(path, catalog, columns):
7275 catalog .to_csv (path , sep = "," , index = False )
7376
7477
75- def download_dataset (
76- url , authentication_type , api_key_parameter_name , api_key_parameter_value
77- ):
78+ def download_dataset (url , authentication_type , api_key_parameter_name = None , api_key_parameter_value = None ):
79+ """
80+ Downloads a dataset from the given URL using specified authentication mechanisms.
81+ The method performs a request to the URL with API key passed as either a query
82+ parameter or a header, based on the chosen authentication type. If the download
83+ fails with certain 403 errors, a fallback request with alternative headers is attempted.
84+ It writes the dataset contents to a temporary file and returns the file path.
85+
86+ :param url: The dataset's source URL.
87+ :type url: str
88+ :param authentication_type: The type of authentication mechanism to use (e.g.,
89+ 1 for parameter-based, 2 for header-based).
90+ :type authentication_type: int
91+ :param api_key_parameter_name: The name of the API key parameter/header. It is
92+ optional if the dataset is publicly accessible or no authentication is
93+ required.
94+ :type api_key_parameter_name: str, optional
95+ :param api_key_parameter_value: The value of the API key to authenticate the
96+ request. It is optional if no authentication is required.
97+ :type api_key_parameter_value: str, optional
98+ :return: The file path where the downloaded dataset is temporarily stored.
99+ :rtype: str
100+ :raises RequestException: If all attempts to download the dataset fail.
78101 """
79- Download a dataset from a given URL with optional authentication.
80-
81- This function downloads a dataset from the specified URL using optional
82- API key authentication and saves it to a file in the current working directory.
83-
84- Args:
85- url (str): The URL of the dataset to download.
86- authentication_type (int): The type of authentication to use.
87- 0: No authentication.
88- 1: API key as a query parameter.
89- 2: API key as a header.
90- api_key_parameter_name (str, optional): The name of the API key parameter.
91- api_key_parameter_value (str, optional): The value of the API key.
92102
93- Returns:
94- str: The path to the downloaded file.
def make_request(url, params=None, headers=None):
    """
    Fetch ``url`` with a GET request and return the raw response body.

    An HTTP 403 response is signalled by returning ``None`` so the caller
    can retry with alternative headers. Every other failure is raised,
    never returned, so the caller cannot mistake an error for a payload.

    :param url: The URL to request.
    :param params: Optional query parameters to send with the request.
    :param headers: Optional HTTP headers to send with the request.
    :return: The response body as bytes, or ``None`` if the server
        answered with HTTP 403.
    :raises RequestException: On any HTTP error other than 403, or on any
        other request failure (connection error, invalid URL, ...).
    """
    # NOTE(review): no timeout is passed, so a stalled server can block
    # this call indefinitely; consider adding one.
    try:
        response = requests.get(url, params=params, headers=headers, allow_redirects=True)
        response.raise_for_status()
    except HTTPError as e:
        # Only a 403 is worth retrying with the fallback headers.
        if e.response is not None and e.response.status_code == 403:
            return None
        # Raise (the previous code returned the exception object, which is
        # truthy and was then handed to the caller as if it were content).
        raise RequestException(f"HTTP error {e} when accessing {url}.") from e
    except RequestException as e:
        raise RequestException(f"Request failed: {e}") from e
    else:
        return response.content
95113
96- Raises:
97- RequestException: If an error occurs during the download process.
98- """
114+ file_path = os .path .join (os .getcwd (), str (uuid .uuid4 ()))
99115
100- file_name = str ( uuid . uuid4 ())
101- file_path = os . path . join ( os . getcwd (), file_name )
116+ params = { api_key_parameter_name : api_key_parameter_value } if authentication_type == 1 else None
117+ headers = { api_key_parameter_name : api_key_parameter_value } if authentication_type == 2 else None
102118
103- params = {}
104- headers = {}
105- if authentication_type == 1 :
106- params [api_key_parameter_name ] = api_key_parameter_value
107- elif authentication_type == 2 :
108- headers [api_key_parameter_name ] = api_key_parameter_value
119+ zip_file = make_request (url , params , headers ) or (
120+ make_request (url , params , {** FALLBACK_HEADERS , ** (headers or {}),
121+ "Referer" : f"{ urlparse (url ).scheme } ://{ urlparse (url ).netloc } /" ,
122+ "Host" : urlparse (url ).netloc })
123+ )
109124
110- try :
111- zip_file_req = requests .get (
112- url , params = params , headers = headers , allow_redirects = True
113- )
114- zip_file_req .raise_for_status ()
115- except RequestException as e :
116- raise RequestException (
117- f"FAILURE! Exception { e } occurred when downloading URL { url } .\n "
118- )
125+ if zip_file is None :
126+ raise RequestException (f"FAILURE! Retry attempts failed for { url } ." )
119127
120- zip_file = zip_file_req .content
121128 with open (file_path , "wb" ) as f :
122129 f .write (zip_file )
123130
@@ -186,7 +193,7 @@ def are_overlapping_edges(
186193 filter_maximum (float): The maximum coordinate of the filter edge.
187194
188195 Returns:
189- bool: True if the two edges are overlapping, False otherwise.
196+ bool: True if the two edges are overlapping, False otherwise.
190197 Returns False if one or more coordinates are None.
191198 """
192199 return (
0 commit comments