Skip to content

Commit f28943e

Browse files
committed
feat: fewer CLI args needed; download via the databus redirect for accurate statistics; check whether auth is needed before sending the Authorization header
1 parent e124b51 commit f28943e

File tree

2 files changed

+50
-18
lines changed

2 files changed

+50
-18
lines changed

databusclient/cli.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -36,12 +36,12 @@ def deploy(
3636

3737
@app.command()
3838
def download(
39-
localDir: str = typer.Option(..., help="local databus folder"),
40-
databus: str = typer.Option(..., help="databus URL"),
4139
databusuris: List[str] = typer.Argument(..., help="any kind of these: databus identifier, databus collection identifier, query file"),
40+
localDir: str = typer.Option("./tmp", help="local databus folder"),
41+
databus: str = typer.Option(None, help="databus URL"),
4242
vault_token_file: str = typer.Option(None, help="Path to Vault refresh token file"),
43-
auth_url: str = typer.Option(None, help="Keycloak token endpoint URL"),
44-
client_id: str = typer.Option(None, help="Client ID for token exchange")
43+
auth_url: str = typer.Option("https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token", help="Keycloak token endpoint URL"),
44+
client_id: str = typer.Option("vault-token-exchange", help="Client ID for token exchange")
4545
):
4646
"""
4747
Download datasets from databus, optionally using vault access if vault options are provided.

databusclient/client.py

Lines changed: 46 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -403,26 +403,48 @@ def __download_file__(url, filename, vault_token_file=None, auth_url=None, clien
403403
- vault_token_file: Path to Vault refresh token file
404404
- auth_url: Keycloak token endpoint URL
405405
- client_id: Client ID for token exchange
406+
407+
Steps:
408+
1. Try direct GET without Authorization header.
409+
2. If the server responds with 401 Unauthorized or a WWW-Authenticate: Bearer header, or the url starts with "https://data.dbpedia.io/databus.dbpedia.org",
410+
then fetch Vault access token and retry with Authorization header.
406411
"""
407412

408-
print("download "+url)
413+
print("Download file: "+url)
409414
os.makedirs(os.path.dirname(filename), exist_ok=True) # Create the necessary directories
410415

411-
headers = {}
412-
if vault_token_file and auth_url and client_id:
413-
headers["Authorization"] = f"Bearer {__get_vault_access__(url, vault_token_file, auth_url, client_id)}"
416+
# --- 1. Try without token ---
417+
response = requests.get(url, stream=True, allow_redirects=False)
418+
# print("Response code:", response.status_code)
419+
# print(f"Status code: {response.status_code}")
420+
# print(f"Headers: {response.headers}")
421+
url = response.headers.get("Location") # update URL to the final one after redirects
422+
print("URL after redirects:", url)
423+
# print(f"Full response: {response}")
424+
# exit(0)
425+
426+
if (response.status_code == 401 or "WWW-Authenticate" in response.headers or url.startswith("https://data.dbpedia.io/databus.dbpedia.org")):
427+
print(f"Authentication required for {url}")
428+
if not (vault_token_file):
429+
raise RuntimeError("Authentication required but no vault_token provided")
430+
431+
# --- 2. Fetch Vault token ---
432+
vault_token = __get_vault_access__(url, vault_token_file, auth_url, client_id)
433+
headers = {"Authorization": f"Bearer {vault_token}"}
434+
435+
# --- 3. Retry with token ---
436+
response = requests.get(url, headers=headers, stream=True)
437+
438+
response.raise_for_status() # Raise if still failing
414439

415-
response = requests.get(url, headers=headers, stream=True)
416-
response.raise_for_status() # Raise an error for bad responses
417440
total_size_in_bytes = int(response.headers.get('content-length', 0))
418-
block_size = 1024 # 1 Kibibyte
441+
block_size = 1024 # 1 KiB
419442

420443
progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
421444
with open(filename, 'wb') as file:
422445
for data in response.iter_content(block_size):
423446
progress_bar.update(len(data))
424447
file.write(data)
425-
426448
progress_bar.close()
427449

428450
if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
@@ -510,15 +532,19 @@ def __handle_databus_file_query__(endpoint_url, query) -> List[str]:
510532

511533

512534
def __handle_databus_file_json__(json_str: str) -> List[str]:
513-
downloadURLs = []
535+
"""
536+
Parse the JSON-LD of a databus artifact version to extract download URLs.
537+
Do not take the downloadURLs directly from the JSON-LD; instead return the "file" links so that downloads go through the databus and access is counted accurately.
538+
"""
539+
540+
databusIdUrl = []
514541
json_dict = json.loads(json_str)
515542
graph = json_dict.get("@graph", [])
516543
for node in graph:
517544
if node.get("@type") == "Part":
518-
downloadURL = node.get("downloadURL")
519-
if downloadURL:
520-
downloadURLs.append(downloadURL)
521-
return downloadURLs
545+
id = node.get("file")
546+
databusIdUrl.append(id)
547+
return databusIdUrl
522548

523549

524550
def wsha256(raw: str):
@@ -555,7 +581,7 @@ def download(
555581
client_id=None
556582
) -> None:
557583
"""
558-
Download datasets to local storage from databus registry. If vault options are provided, vault access will be used for downloading protected files.
584+
Download datasets to local storage from databus registry. If download is on vault, vault token will be used for downloading protected files.
559585
------
560586
localDir: the local directory
561587
endpoint: the databus endpoint URL
@@ -565,6 +591,12 @@ def download(
565591
client_id: Client ID for token exchange
566592
"""
567593

594+
# Auto-detect sparql endpoint from first databusURI if not given -> no need to specify endpoint (--databus)
595+
if endpoint is None:
596+
host = databusURIs[0].split("/")[2]
597+
endpoint = f"https://{host}/sparql"
598+
print(f"SPARQL endpoint {endpoint}")
599+
568600
databusVersionPattern = re.compile(r"^https://(databus\.dbpedia\.org|databus\.dev\.dbpedia\.link)/[^/]+/[^/]+/[^/]+/[^/]+/?$")
569601

570602
for databusURI in databusURIs:

0 commit comments

Comments
 (0)