Skip to content

Commit f706563

Browse files
committed
feat: Dockerfile
1 parent 62026e5 commit f706563

File tree

4 files changed

+41
-25
lines changed

4 files changed

+41
-25
lines changed

Dockerfile

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
FROM python:3.11-slim
2+
3+
WORKDIR /app
4+
5+
# Copy everything first (pyproject.toml, README.md, and source code)
6+
COPY . .
7+
8+
# Install the package + dependencies
9+
RUN pip install .
10+
11+
# Default command
12+
ENTRYPOINT ["python", "-m", "databusclient"]

databusclient/cli.py

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -38,19 +38,12 @@ def deploy(
3838
def download(
3939
databusuris: List[str] = typer.Argument(..., help="any kind of these: databus identifier, databus collection identifier, query file"),
4040
localDir: str = typer.Option(None , help="local databus folder"), # if not given, databus folder structure is created in current working directory
41-
databus: str = typer.Option(None, help="databus URL"), # if not given, inferred on first databusuri (e.g. https://databus.dbpedia.org/sparql)
42-
vault_token_file: str = typer.Option(None, help="Path to Vault refresh token file"),
43-
auth_url: str = typer.Option("https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token", help="Keycloak token endpoint URL"),
44-
client_id: str = typer.Option("vault-token-exchange", help="Client ID for token exchange")
41+
databus: str = typer.Option(None, help="databus URL"), # if not given, inferred from the first databus URI (e.g. https://databus.dbpedia.org/sparql)
42+
token: str = typer.Option(None, help="Path to Vault refresh token file"),
43+
authUrl: str = typer.Option("https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token", help="Keycloak token endpoint URL"),
44+
clientId: str = typer.Option("vault-token-exchange", help="Client ID for token exchange")
4545
):
4646
"""
4747
Download datasets from databus, optionally using vault access if vault options are provided.
4848
"""
49-
# Validate vault options: either all three are provided or none
50-
vault_opts = [vault_token_file, auth_url, client_id]
51-
if any(vault_opts) and not all(vault_opts):
52-
raise typer.BadParameter(
53-
"If one of --vault-token-file, --auth-url, or --client-id is specified, all three must be specified."
54-
)
55-
56-
client.download(localDir=localDir, endpoint=databus, databusURIs=databusuris, vault_token_file=vault_token_file, auth_url=auth_url, client_id=client_id)
49+
client.download(localDir=localDir, endpoint=databus, databusURIs=databusuris, vault_token_file=vaultTokenFile, auth_url=authUrl, client_id=clientId)

databusclient/client.py

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -415,19 +415,20 @@ def __download_file__(url, filename, vault_token_file=None, auth_url=None, clien
415415

416416
# --- 1. Get redirect URL by requesting HEAD ---
417417
response = requests.head(url, stream=True)
418-
url = response.headers.get("Location") # update URL to the final one after redirects
419-
# print(f"Status code: {response.status_code} \nResponse code: {response.status_code}\nHeaders: {response.headers}")
420-
print("URL after redirects:", url)
418+
419+
# Check for redirect and update URL if necessary
420+
if response.headers.get("Location") and response.status_code in [301, 302, 303, 307, 308]:
421+
url = response.headers.get("Location")
422+
print("Redirects url: ", url)
421423

422424
# --- 2. Try direct GET ---
423425
response = requests.get(url, stream=True, allow_redirects=False) # no redirects here, we want to see if auth is required
424426
www = response.headers.get('WWW-Authenticate', '') # get WWW-Authenticate header if present to check for Bearer auth
425-
# print(f"Status code: {response.status_code} \nResponse code: {response.status_code}\nHeaders: {response.headers}")
426427

427428
if (response.status_code == 401 or "bearer" in www.lower()):
428429
print(f"Authentication required for {url}")
429430
if not (vault_token_file):
430-
raise RuntimeError("Authentication required but no vault_token provided")
431+
raise ValueError("Vault token file not given for protected download")
431432

432433
# --- 3. Fetch Vault token ---
433434
vault_token = __get_vault_access__(url, vault_token_file, auth_url, client_id)
@@ -449,7 +450,7 @@ def __download_file__(url, filename, vault_token_file=None, auth_url=None, clien
449450
progress_bar.close()
450451

451452
if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
452-
print("ERROR, something went wrong")
453+
raise
453454

454455

455456
def __get_vault_access__(download_url: str,
@@ -575,7 +576,9 @@ def __download_list__(urls: List[str],
575576

576577
file = url.split("/")[-1]
577578
filename = os.path.join(localDir, file)
579+
print("\n")
578580
__download_file__(url=url, filename=filename, vault_token_file=vault_token_file, auth_url=auth_url, client_id=client_id)
581+
print("\n")
579582

580583

581584
def __get_databus_id_parts__(uri: str) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str], Optional[str], Optional[str]]:
@@ -607,29 +610,33 @@ def download(
607610
for databusURI in databusURIs:
608611
host, account, group, artifact, version, file = __get_databus_id_parts__(databusURI)
609612

610-
# Auto-detect sparql endpoint from databusURI if not given -> no need to specify endpoint (--databus)
611-
if endpoint is None:
612-
endpoint = f"https://{host}/sparql"
613-
print(f"SPARQL endpoint {endpoint}")
614-
615613
# dataID or databus collection
616614
if databusURI.startswith("http://") or databusURI.startswith("https://"):
615+
# Auto-detect sparql endpoint from databusURI if not given -> no need to specify endpoint (--databus)
616+
if endpoint is None:
617+
endpoint = f"https://{host}/sparql"
618+
print(f"SPARQL endpoint {endpoint}")
619+
617620
# databus collection
618621
if "/collections/" in databusURI: # TODO: this substring test is unsafe -- an artifact could itself be named "collections"; check the segment's position in the URI path instead
619622
query = __handle_databus_collection__(databusURI)
620623
res = __handle_databus_file_query__(endpoint, query)
621624
__download_list__(res, localDir)
622-
# databus artifact version // https://(databus.dbpedia.org|databus.dev.dbpedia.link)/$ACCOUNT/$GROUP/$ARTIFACT/$VERSION
625+
# databus file
623626
elif file is not None:
624-
print("fileId not supported yet") # TODO
627+
__download_list__([databusURI], localDir, vault_token_file=vault_token_file, auth_url=auth_url, client_id=client_id)
628+
# databus artifact version
625629
elif version is not None:
626630
json_str = __handle_databus_artifact_version__(databusURI)
627631
res = __handle_databus_file_json__(json_str)
628632
__download_list__(res, localDir, vault_token_file=vault_token_file, auth_url=auth_url, client_id=client_id)
633+
# databus artifact
629634
elif artifact is not None:
630635
print("artifactId not supported yet") # TODO
636+
# databus group
631637
elif group is not None:
632638
print("groupId not supported yet") # TODO
639+
# databus account
633640
elif account is not None:
634641
print("accountId not supported yet") # TODO
635642
else:
@@ -640,5 +647,7 @@ def download(
640647
# query as argument
641648
else:
642649
print("QUERY {}", databusURI.replace("\n", " "))
650+
if endpoint is None: # endpoint is required for queries (--databus)
651+
raise ValueError("No endpoint given for query")
643652
res = __handle_databus_file_query__(endpoint, databusURI)
644653
__download_list__(res, localDir)

pyproject.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ typer = "^0.6.1"
1212
requests = "^2.28.1"
1313
tqdm = "^4.42.1"
1414
SPARQLWrapper = "^2.0.0"
15+
click = "<8.0"
16+
rdflib = "^7.2.1"
1517

1618

1719
[tool.poetry.dev-dependencies]

0 commit comments

Comments
 (0)