@@ -403,26 +403,48 @@ def __download_file__(url, filename, vault_token_file=None, auth_url=None, clien
403403 - vault_token_file: Path to Vault refresh token file
404404 - auth_url: Keycloak token endpoint URL
405405 - client_id: Client ID for token exchange
406+
407+ Steps:
408+ 1. Try direct GET without Authorization header.
409+ 2. If the server responds with 401 Unauthorized or a WWW-Authenticate header, or the URL starts with "https://data.dbpedia.io/databus.dbpedia.org",
410+ then fetch a Vault access token and retry with an Authorization header.
406411 """
407412
408- print ("download " + url )
413+ print ("Download file: " + url )
409414 os .makedirs (os .path .dirname (filename ), exist_ok = True ) # Create the necessary directories
410415
411- headers = {}
412- if vault_token_file and auth_url and client_id :
413- headers ["Authorization" ] = f"Bearer { __get_vault_access__ (url , vault_token_file , auth_url , client_id )} "
416+ # --- 1. Try without token ---
417+ response = requests .get (url , stream = True , allow_redirects = False )
418+ # print("Response code:", response.status_code)
419+ # print(f"Status code: {response.status_code}")
420+ # print(f"Headers: {response.headers}")
421+ url = response .headers .get ("Location" ) # update URL to the final one after redirects
422+ print ("URL after redirects:" , url )
423+ # print(f"Full response: {response}")
424+ # exit(0)
425+
426+ if (response .status_code == 401 or "WWW-Authenticate" in response .headers or url .startswith ("https://data.dbpedia.io/databus.dbpedia.org" )):
427+ print (f"Authentication required for { url } " )
428+ if not (vault_token_file ):
429+ raise RuntimeError ("Authentication required but no vault_token provided" )
430+
431+ # --- 2. Fetch Vault token ---
432+ vault_token = __get_vault_access__ (url , vault_token_file , auth_url , client_id )
433+ headers = {"Authorization" : f"Bearer { vault_token } " }
434+
435+ # --- 3. Retry with token ---
436+ response = requests .get (url , headers = headers , stream = True )
437+
438+ response .raise_for_status () # Raise if still failing
414439
415- response = requests .get (url , headers = headers , stream = True )
416- response .raise_for_status () # Raise an error for bad responses
417440 total_size_in_bytes = int (response .headers .get ('content-length' , 0 ))
418- block_size = 1024 # 1 Kibibyte
441+ block_size = 1024 # 1 KiB
419442
420443 progress_bar = tqdm (total = total_size_in_bytes , unit = 'iB' , unit_scale = True )
421444 with open (filename , 'wb' ) as file :
422445 for data in response .iter_content (block_size ):
423446 progress_bar .update (len (data ))
424447 file .write (data )
425-
426448 progress_bar .close ()
427449
428450 if total_size_in_bytes != 0 and progress_bar .n != total_size_in_bytes :
@@ -510,15 +532,19 @@ def __handle_databus_file_query__(endpoint_url, query) -> List[str]:
510532
511533
def __handle_databus_file_json__(json_str: str) -> List[str]:
    """
    Parse the JSON-LD of a databus artifact version and collect its file links.

    The "file" link of every node typed "Part" is returned rather than its
    "downloadURL": following the "file" links lets access to the databus be
    counted accurately.

    json_str: the JSON-LD document as a string; nodes are read from its "@graph" list
    returns: the "file" values of all "Part" nodes, in document order
    """
    json_dict = json.loads(json_str)
    graph = json_dict.get("@graph", [])

    file_urls = []
    for node in graph:
        if node.get("@type") != "Part":
            continue
        file_url = node.get("file")
        # Skip parts without a "file" link so callers never receive None entries.
        if file_url:
            file_urls.append(file_url)
    return file_urls
522548
523549
524550def wsha256 (raw : str ):
@@ -555,7 +581,7 @@ def download(
555581 client_id = None
556582) -> None :
557583 """
558- Download datasets to local storage from databus registry. If vault options are provided , vault access will be used for downloading protected files.
584+ Download datasets to local storage from databus registry. If download is on vault , vault token will be used for downloading protected files.
559585 ------
560586 localDir: the local directory
561587 endpoint: the databus endpoint URL
@@ -565,6 +591,12 @@ def download(
565591 client_id: Client ID for token exchange
566592 """
567593
594+ # Auto-detect sparql endpoint from first databusURI if not given -> no need to specify endpoint (--databus)
595+ if endpoint is None :
596+ host = databusURIs [0 ].split ("/" )[2 ]
597+ endpoint = f"https://{ host } /sparql"
598+ print (f"SPARQL endpoint { endpoint } " )
599+
568600 databusVersionPattern = re .compile (r"^https://(databus\.dbpedia\.org|databus\.dev\.dbpedia\.link)/[^/]+/[^/]+/[^/]+/[^/]+/?$" )
569601
570602 for databusURI in databusURIs :
0 commit comments