11import json
22import os
33from typing import List
4+ from urllib .parse import urlparse
45
56import requests
67from SPARQLWrapper import JSON , SPARQLWrapper
1213)
1314
1415
# Hosts that require Vault token-based authentication.
# This set is the central source of truth: _download_file checks the hostname
# of the (possibly redirected) download URL against it before attempting a
# Vault token exchange, so tokens are never sent to hosts not listed here.
# Entries must be lower-case, because urlparse(...).hostname is lower-cased.
VAULT_REQUIRED_HOSTS = {
    "data.dbpedia.io",
    "data.dev.dbpedia.link",
}
21+
22+
class DownloadAuthError(Exception):
    """Raised when an authorization problem occurs during download.

    _download_file raises this when a required credential (Databus API key
    or Vault token) was not provided, when the server answers 401 or 403,
    or when a host requests Bearer authentication but is not listed in
    VAULT_REQUIRED_HOSTS. The exception message is a user-facing
    explanation of the problem.
    """
25+
26+
27+
1528def _download_file (
1629 url ,
1730 localDir ,
@@ -52,16 +65,9 @@ def _download_file(
5265 os .makedirs (dirpath , exist_ok = True ) # Create the necessary directories
5366 # --- 1. Get redirect URL by requesting HEAD ---
5467 headers = {}
55- # --- 1a. public databus ---
56- response = requests .head (url , timeout = 30 )
57- # --- 1b. Databus API key required ---
58- if response .status_code == 401 :
59- # print(f"API key required for {url}")
60- if not databus_key :
61- raise ValueError ("Databus API key not given for protected download" )
6268
63- headers = { "X-API-KEY" : databus_key }
64- response = requests .head (url , headers = headers , timeout = 30 )
69+ # --- 1a. public databus ---
70+ response = requests .head (url , timeout = 30 , allow_redirects = False )
6571
6672 # Check for redirect and update URL if necessary
6773 if response .headers .get ("Location" ) and response .status_code in [
@@ -73,6 +79,30 @@ def _download_file(
7379 ]:
7480 url = response .headers .get ("Location" )
7581 print ("Redirects url: " , url )
82+ # Re-do HEAD request on redirect URL
83+ response = requests .head (url , timeout = 30 )
84+
85+ # Extract hostname from final URL (after redirect) to check if vault token needed.
86+ # This is the actual download location that may require authentication.
87+ parsed = urlparse (url )
88+ host = parsed .hostname
89+
90+ # --- 1b. Handle 401 on HEAD request ---
91+ if response .status_code == 401 :
92+ # Check if this is a vault-required host
93+ if host in VAULT_REQUIRED_HOSTS :
94+ # Vault-required host: need vault token
95+ if not vault_token_file :
96+ raise DownloadAuthError (
97+ f"Vault token required for host '{ host } ', but no token was provided. Please use --vault-token."
98+ )
99+ # Token provided; will handle in GET request below
100+ else :
101+ # Not a vault host; might need databus API key
102+ if not databus_key :
103+ raise DownloadAuthError ("Databus API key not given for protected download" )
104+ headers = {"X-API-KEY" : databus_key }
105+ response = requests .head (url , headers = headers , timeout = 30 )
76106
77107 # --- 2. Try direct GET to redirected URL ---
78108 headers ["Accept-Encoding" ] = (
@@ -81,25 +111,54 @@ def _download_file(
81111 response = requests .get (
82112 url , headers = headers , stream = True , allow_redirects = True , timeout = 30
83113 )
84- www = response .headers .get (
85- "WWW-Authenticate" , ""
86- ) # Check if authentication is required
114+ www = response .headers .get ("WWW-Authenticate" , "" ) # Check if authentication is required
87115
88- # --- 3. If redirected to authentication 401 Unauthorized, get Vault token and retry ---
116+ # --- 3. Handle authentication responses ---
117+ # 3a. Server requests Bearer auth. Only attempt token exchange for hosts
118+ # we explicitly consider Vault-protected (VAULT_REQUIRED_HOSTS). This avoids
119+ # sending tokens to unrelated hosts and makes auth behavior predictable.
89120 if response .status_code == 401 and "bearer" in www .lower ():
90- print (f"Authentication required for { url } " )
91- if not (vault_token_file ):
92- raise ValueError ("Vault token file not given for protected download" )
121+ # If host is not configured for Vault, do not attempt token exchange.
122+ if host not in VAULT_REQUIRED_HOSTS :
123+ raise DownloadAuthError (
124+ "Server requests Bearer authentication but this host is not configured for Vault token exchange."
125+ " Try providing a databus API key with --databus-key or contact your administrator."
126+ )
127+
128+ # Host requires Vault; ensure token file provided.
129+ if not vault_token_file :
130+ raise DownloadAuthError (
131+ f"Vault token required for host '{ host } ', but no token was provided. Please use --vault-token."
132+ )
93133
94- # --- 3a. Fetch Vault token ---
95- # TODO: cache token
134+ # --- 3b. Fetch Vault token and retry ---
135+ # Token exchange is potentially sensitive and should only be performed
136+ # for known hosts. __get_vault_access__ handles reading the refresh
137+ # token and exchanging it; errors are translated to DownloadAuthError
138+ # for user-friendly CLI output.
96139 vault_token = __get_vault_access__ (url , vault_token_file , auth_url , client_id )
97140 headers ["Authorization" ] = f"Bearer { vault_token } "
98- headers .pop ("Accept-Encoding" )
141+ headers .pop ("Accept-Encoding" , None )
99142
100- # --- 3b. Retry with token ---
143+ # Retry with token
101144 response = requests .get (url , headers = headers , stream = True , timeout = 30 )
102145
146+ # Map common auth failures to friendly messages
147+ if response .status_code == 401 :
148+ raise DownloadAuthError ("Vault token is invalid or expired. Please generate a new token." )
149+ if response .status_code == 403 :
150+ raise DownloadAuthError ("Vault token is valid but has insufficient permissions to access this file." )
151+
152+ # 3c. Generic forbidden without Bearer challenge
153+ if response .status_code == 403 :
154+ raise DownloadAuthError ("Access forbidden: your token or API key does not have permission to download this file." )
155+
156+ # 3d. Generic unauthorized without Bearer
157+ if response .status_code == 401 :
158+ raise DownloadAuthError (
159+ "Unauthorized: access denied. Check your --databus-key or --vault-token settings."
160+ )
161+
103162 try :
104163 response .raise_for_status () # Raise if still failing
105164 except requests .exceptions .HTTPError as e :
0 commit comments