diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..b44f7b8 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,11 @@ +FROM python:3.10-slim + +WORKDIR /data + +COPY . . + +# Install dependencies +RUN pip install . + +# Use ENTRYPOINT for the CLI +ENTRYPOINT ["databusclient"] diff --git a/README.md b/README.md index cac4401..3782a1c 100644 --- a/README.md +++ b/README.md @@ -23,15 +23,18 @@ Options: Commands: deploy - downoad + download ``` + +## Docker Image Usage + +A docker image is available at [dbpedia/databus-python-client](https://hub.docker.com/r/dbpedia/databus-python-client). See [usage of docker image](#usage-of-docker-image) in the download section for details. + ### Deploy command ``` databusclient deploy --help ``` ``` - - Usage: databusclient deploy [OPTIONS] DISTRIBUTIONS... Arguments: @@ -40,23 +43,23 @@ Arguments: content variants of a distribution, fileExt and Compression can be set, if not they are inferred from the path [required] Options: - --versionid TEXT target databus version/dataset identifier of the form [required] - --title TEXT dataset title [required] - --abstract TEXT dataset abstract max 200 chars [required] - --description TEXT dataset description [required] - --license TEXT license (see dalicc.net) [required] - --apikey TEXT apikey [required] + --version-id TEXT Target databus version/dataset identifier [required] + --title TEXT Dataset title [required] + --abstract TEXT Dataset abstract max 200 chars [required] + --description TEXT Dataset description [required] + --license TEXT License (see dalicc.net) [required] + --apikey TEXT API key [required] --help Show this message and exit. ``` Examples of using deploy command ``` -databusclient deploy --versionid https://databus.dbpedia.org/user1/group1/artifact1/2022-05-18 --title title1 --abstract abstract1 --description description1 --license http://dalicc.net/licenselibrary/AdaptivePublicLicense10 --apikey MYSTERIOUS 'https://raw.githubusercontent.com/dbpedia/databus/master/server/app/api/swagger.yml|type=swagger' +databusclient deploy --version-id https://databus.dbpedia.org/user1/group1/artifact1/2022-05-18 --title title1 --abstract abstract1 --description description1 --license http://dalicc.net/licenselibrary/AdaptivePublicLicense10 --apikey MYSTERIOUS 'https://raw.githubusercontent.com/dbpedia/databus/master/server/app/api/swagger.yml|type=swagger' ``` ``` -databusclient deploy --versionid https://dev.databus.dbpedia.org/denis/group1/artifact1/2022-05-18 --title "Client Testing" --abstract "Testing the client...." --description "Testing the client...." --license http://dalicc.net/licenselibrary/AdaptivePublicLicense10 --apikey MYSTERIOUS 'https://raw.githubusercontent.com/dbpedia/databus/master/server/app/api/swagger.yml|type=swagger' +databusclient deploy --version-id https://dev.databus.dbpedia.org/denis/group1/artifact1/2022-05-18 --title "Client Testing" --abstract "Testing the client...." --description "Testing the client...."
--license http://dalicc.net/licenselibrary/AdaptivePublicLicense10 --apikey MYSTERIOUS 'https://raw.githubusercontent.com/dbpedia/databus/master/server/app/api/swagger.yml|type=swagger' ``` A few more notes for CLI usage: @@ -65,6 +68,93 @@ A few more notes for CLI usage: * For complete inferred: Just use the URL with `https://raw.githubusercontent.com/dbpedia/databus/master/server/app/api/swagger.yml` * If other parameters are used, you need to leave them empty like `https://raw.githubusercontent.com/dbpedia/databus/master/server/app/api/swagger.yml||yml|7a751b6dd5eb8d73d97793c3c564c71ab7b565fa4ba619e4a8fd05a6f80ff653:367116` +### Download command +``` +databusclient download --help +``` + +``` +Usage: databusclient download [OPTIONS] DATABUSURIS... + +Arguments: + DATABUSURIS... databus uris to download from https://databus.dbpedia.org, + or a query statement that returns databus uris from https://databus.dbpedia.org/sparql + to be downloaded [required] + + Download datasets from databus, optionally using vault access if vault + options are provided. + +Options: + --localdir TEXT Local databus folder (if not given, databus folder + structure is created in current working directory) + --databus TEXT Databus URL (if not given, inferred from databusuri, e.g. + https://databus.dbpedia.org/sparql) + --token TEXT Path to Vault refresh token file + --authurl TEXT Keycloak token endpoint URL [default: + https://auth.dbpedia.org/realms/dbpedia/protocol/openid- + connect/token] + --clientid TEXT Client ID for token exchange [default: vault-token- + exchange] + --help Show this message and exit. +``` + +Examples of using download command + +**File**: download of a single file +``` +databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01/mappingbased-literals_lang=az.ttl.bz2 +``` + +**Version**: download of all files of a specific version +``` +databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01 +``` + +**Artifact**: download of all files of the latest version of an artifact +``` +databusclient download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals +``` + +**Group**: download of all files of the latest version of all artifacts of a group +``` +databusclient download https://databus.dbpedia.org/dbpedia/mappings +``` + +If no `--localdir` is provided, the current working directory is used as base directory. The downloaded files are stored in a folder structure that mirrors the databus structure, i.e. `./$ACCOUNT/$GROUP/$ARTIFACT/$VERSION/`. + +**Collection**: download of all files within a collection +``` +databusclient download https://databus.dbpedia.org/dbpedia/collections/dbpedia-snapshot-2022-12 +``` + +**Query**: download of all files returned by a query (the SPARQL endpoint must be provided with `--databus`) +``` +databusclient download 'PREFIX dcat: <http://www.w3.org/ns/dcat#> SELECT ?x WHERE { ?sub dcat:downloadURL ?x . } LIMIT 10' --databus https://databus.dbpedia.org/sparql +``` + +#### Authentication with vault + +For downloading files from the vault, you need to provide a vault token. See [getting-the-access-refresh-token](https://github.com/dbpedia/databus-vault-access?tab=readme-ov-file#step-1-getting-the-access-refresh-token) for details. You can come back here once you have a `vault-token.dat` file. To use it, just provide the path to the file with `--token /path/to/vault-token.dat`.
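+
+For reference, fetching the vault token boils down to two requests against the Keycloak token endpoint. The client reads the refresh token from the `REFRESH_TOKEN` environment variable or from the file given via `--token`. The sketch below mirrors `__get_vault_access__` in `databusclient/client.py`; the endpoint and client ID are the CLI defaults, and the audience host is only an example:
+
+```
+import requests
+
+AUTH_URL = "https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token"
+CLIENT_ID = "vault-token-exchange"
+
+# Refresh token obtained in step 1 (vault-token.dat)
+with open("vault-token.dat") as f:
+    refresh_token = f.read().strip()
+
+# 1. Exchange the refresh token for an access token
+resp = requests.post(AUTH_URL, data={
+    "client_id": CLIENT_ID,
+    "grant_type": "refresh_token",
+    "refresh_token": refresh_token,
+})
+resp.raise_for_status()
+access_token = resp.json()["access_token"]
+
+# 2. Exchange the access token for a vault token scoped to the download host
+resp = requests.post(AUTH_URL, data={
+    "client_id": CLIENT_ID,
+    "grant_type": "urn:ietf:params:oauth:grant-type:token-exchange",
+    "subject_token": access_token,
+    "audience": "data.dbpedia.io",  # example: host of the protected download URL
+})
+resp.raise_for_status()
+vault_token = resp.json()["access_token"]  # sent as "Authorization: Bearer <token>"
+```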
+ +Example: +``` +databusclient download https://databus.dbpedia.org/dbpedia-enterprise/live-fusion-snapshots/fusion/2025-08-23 --token vault-token.dat +``` + +If vault authentication is required for downloading a file, the client will use the token. If no vault authentication is required, the token will not be used. + +#### Usage of docker image + +A docker image is available at [dbpedia/databus-python-client](https://hub.docker.com/r/dbpedia/databus-python-client). You can use it like this: + +``` +docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dbpedia.org/dbpedia/mappings/mappingbased-literals/2022.12.01 +``` +If using vault authentication, make sure the token file is available in the container, e.g. by placing it in the current working directory. +``` +docker run --rm -v $(pwd):/data dbpedia/databus-python-client download https://databus.dbpedia.org/dbpedia-enterprise/live-fusion-snapshots/fusion/2025-08-23/fusion_props=all_subjectns=commons-wikimedia-org_vocab=all.ttl.gz --token vault-token.dat +``` + ## Module Usage ### Step 1: Create lists of distributions for the dataset diff --git a/databusclient/cli.py b/databusclient/cli.py index 3384323..8fc3e02 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -1,43 +1,61 @@ #!/usr/bin/env python3 -import typer +import click from typing import List from databusclient import client -app = typer.Typer() + +@click.group() +def app(): + """Databus Client CLI""" + pass @app.command() -def deploy( - version_id: str = typer.Option( - ..., - help="target databus version/dataset identifier of the form " - "", - ), - title: str = typer.Option(..., help="dataset title"), - abstract: str = typer.Option(..., help="dataset abstract max 200 chars"), - description: str = typer.Option(..., help="dataset description"), - license_uri: str = typer.Option(..., help="license (see dalicc.net)"), - apikey: str = typer.Option(..., help="apikey"), - distributions: List[str] = typer.Argument( - ..., - help="distributions in the form of List[URL|CV|fileext|compression|sha256sum:contentlength] where URL is the " - "download URL and CV the " - "key=value pairs (_ separated) content variants of a distribution. filext and compression are optional " - "and if left out inferred from the path. If the sha256sum:contentlength part is left out it will be " - "calcuted by downloading the file.", - ), -): - typer.echo(version_id) - dataid = client.create_dataset( - version_id, title, abstract, description, license_uri, distributions - ) +@click.option( + "--version-id", "version_id", + required=True, + help="Target databus version/dataset identifier of the form " + "", +) +@click.option("--title", required=True, help="Dataset title") +@click.option("--abstract", required=True, help="Dataset abstract max 200 chars") +@click.option("--description", required=True, help="Dataset description") +@click.option("--license", "license_url", required=True, help="License (see dalicc.net)") +@click.option("--apikey", required=True, help="API key") +@click.argument( + "distributions", + nargs=-1, + required=True, +) +def deploy(version_id, title, abstract, description, license_url, apikey, distributions: List[str]): + """ + Deploy a dataset version with the provided metadata and distributions. 
+ """ + click.echo(f"Deploying dataset version: {version_id}") + dataid = client.create_dataset(version_id, title, abstract, description, license_url, distributions) client.deploy(dataid=dataid, api_key=apikey) @app.command() -def download( - localDir: str = typer.Option(..., help="local databus folder"), - databus: str = typer.Option(..., help="databus URL"), - databusuris: List[str] = typer.Argument(...,help="any kind of these: databus identifier, databus collection identifier, query file") -): - client.download(localDir=localDir,endpoint=databus,databusURIs=databusuris) +@click.argument("databusuris", nargs=-1, required=True) +@click.option("--localdir", help="Local databus folder (if not given, databus folder structure is created in current working directory)") +@click.option("--databus", help="Databus URL (if not given, inferred from databusuri, e.g. https://databus.dbpedia.org/sparql)") +@click.option("--token", help="Path to Vault refresh token file") +@click.option("--authurl", default="https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token", show_default=True, help="Keycloak token endpoint URL") +@click.option("--clientid", default="vault-token-exchange", show_default=True, help="Client ID for token exchange") +def download(databusuris: List[str], localdir, databus, token, authurl, clientid): + """ + Download datasets from databus, optionally using vault access if vault options are provided. + """ + client.download( + localDir=localdir, + endpoint=databus, + databusURIs=databusuris, + token=token, + auth_url=authurl, + client_id=clientid, + ) + + +if __name__ == "__main__": + app() diff --git a/databusclient/client.py b/databusclient/client.py index 5cb5061..764bf6b 100644 --- a/databusclient/client.py +++ b/databusclient/client.py @@ -7,6 +7,7 @@ from SPARQLWrapper import SPARQLWrapper, JSON from hashlib import sha256 import os +import re __debug = False @@ -392,32 +393,124 @@ def deploy( print(resp.text) -def __download_file__(url, filename): +def __download_file__(url, filename, vault_token_file=None, auth_url=None, client_id=None) -> None: """ Download a file from the internet with a progress bar using tqdm. Parameters: - url: the URL of the file to download - filename: the local file path where the file should be saved + - vault_token_file: Path to Vault refresh token file + - auth_url: Keycloak token endpoint URL + - client_id: Client ID for token exchange + + Steps: + 1. Try direct GET without Authorization header. + 2. If server responds with WWW-Authenticate: Bearer, 401 Unauthorized) or url starts with "https://data.dbpedia.io/databus.dbpedia.org", + then fetch Vault access token and retry with Authorization header. """ - print("download "+url) - os.makedirs(os.path.dirname(filename), exist_ok=True) # Create the necessary directories - response = requests.get(url, stream=True) - total_size_in_bytes= int(response.headers.get('content-length', 0)) - block_size = 1024 # 1 Kibibyte + print(f"Download file: {url}") + dirpath = os.path.dirname(filename) + if dirpath: + os.makedirs(dirpath, exist_ok=True) # Create the necessary directories + # --- 1. Get redirect URL by requesting HEAD --- + response = requests.head(url, stream=True) + # Check for redirect and update URL if necessary + if response.headers.get("Location") and response.status_code in [301, 302, 303, 307, 308]: + url = response.headers.get("Location") + print("Redirects url: ", url) + + # --- 2. 
Try direct GET --- + response = requests.get(url, stream=True, allow_redirects=False) # no redirects here, we want to see if auth is required + www = response.headers.get('WWW-Authenticate', '') # get WWW-Authenticate header if present to check for Bearer auth + + if (response.status_code == 401 or "bearer" in www.lower()): + print(f"Authentication required for {url}") + if not (vault_token_file): + raise ValueError("Vault token file not given for protected download") + + # --- 3. Fetch Vault token --- + vault_token = __get_vault_access__(url, vault_token_file, auth_url, client_id) + headers = {"Authorization": f"Bearer {vault_token}"} + + # --- 4. Retry with token --- + response = requests.get(url, headers=headers, stream=True) + + try: + response.raise_for_status() # Raise if still failing + except requests.exceptions.HTTPError as e: + if response.status_code == 404: + print(f"WARNING: Skipping file {url} because it was not found (404).") + return + else: + raise e + + total_size_in_bytes = int(response.headers.get('content-length', 0)) + block_size = 1024 # 1 KiB progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True) - with open(filename, 'wb') as file: + with open(filename, 'wb') as file: for data in response.iter_content(block_size): progress_bar.update(len(data)) file.write(data) progress_bar.close() + if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes: - print("ERROR, something went wrong") + raise IOError("Downloaded size does not match Content-Length header") + + +def __get_vault_access__(download_url: str, + token_file: str, + auth_url: str, + client_id: str) -> str: + """ + Get Vault access token for a protected databus download. + """ + # 1. Load refresh token + refresh_token = os.environ.get("REFRESH_TOKEN") + if not refresh_token: + if not os.path.exists(token_file): + raise FileNotFoundError(f"Vault token file not found: {token_file}") + with open(token_file, "r") as f: + refresh_token = f.read().strip() + if len(refresh_token) < 80: + print(f"Warning: token from {token_file} is short (<80 chars)") + + # 2. Refresh token -> access token + resp = requests.post(auth_url, data={ + "client_id": client_id, + "grant_type": "refresh_token", + "refresh_token": refresh_token + }) + resp.raise_for_status() + access_token = resp.json()["access_token"] + + # 3. Extract host as audience + # Remove protocol prefix + if download_url.startswith("https://"): + host_part = download_url[len("https://"):] + elif download_url.startswith("http://"): + host_part = download_url[len("http://"):] + else: + host_part = download_url + audience = host_part.split("/")[0] # host is before first "/" + + # 4. Access token -> Vault token + resp = requests.post(auth_url, data={ + "client_id": client_id, + "grant_type": "urn:ietf:params:oauth:grant-type:token-exchange", + "subject_token": access_token, + "audience": audience + }) + resp.raise_for_status() + vault_token = resp.json()["access_token"] + + print(f"Using Vault access token for {download_url}") + return vault_token -def __query_sparql__(endpoint_url, query)-> dict: +def __query_sparql__(endpoint_url, query) -> dict: """ Query a SPARQL endpoint and return results in JSON format. 
@@ -436,8 +529,8 @@ def __query_sparql__(endpoint_url, query)-> dict: return results -def __handle__databus_file_query__(endpoint_url, query) -> List[str]: - result_dict = __query_sparql__(endpoint_url,query) +def __handle_databus_file_query__(endpoint_url, query) -> List[str]: + result_dict = __query_sparql__(endpoint_url, query) for binding in result_dict['results']['bindings']: if len(binding.keys()) > 1: print("Error multiple bindings in query response") @@ -447,45 +540,182 @@ yield value +def __handle_databus_artifact_version__(json_str: str) -> List[str]: + """ + Parse the JSON-LD of a databus artifact version to extract download URLs. + The downloadURLs are not taken directly from the JSON-LD; instead, the "file" links are followed so that databus access is counted accurately. + + Returns a list of download URLs. + """ + + databusIdUrl = [] + json_dict = json.loads(json_str) + graph = json_dict.get("@graph", []) + for node in graph: + if node.get("@type") == "Part": + file_id = node.get("file") + databusIdUrl.append(file_id) + return databusIdUrl + + +def __get_databus_latest_version_of_artifact__(json_str: str) -> str: + """ + Parse the JSON-LD of a databus artifact to determine its latest version. + + Returns the URL of the latest version of the artifact. + """ + json_dict = json.loads(json_str) + versions = json_dict.get("databus:hasVersion") or [] + + # Single version case {} + if isinstance(versions, dict): + versions = [versions] + # Multiple versions case [{}, {}] + + version_urls = [v["@id"] for v in versions if "@id" in v] + if not version_urls: + raise ValueError("No versions found in artifact JSON-LD") + + version_urls.sort(reverse=True) # Sort versions in descending order + return version_urls[0] # Return the latest version URL + + +def __get_databus_artifacts_of_group__(json_str: str) -> List[str]: + """ + Parse the JSON-LD of a databus group to extract URLs of all artifacts. + + Returns a list of artifact URLs.
+ """ + json_dict = json.loads(json_str) + artifacts = json_dict.get("databus:hasArtifact", []) + + result = [] + for item in artifacts: + uri = item.get("@id") + if not uri: + continue + _, _, _, _, version, _ = __get_databus_id_parts__(uri) + if version is None: + result.append(uri) + return result + + def wsha256(raw: str): return sha256(raw.encode('utf-8')).hexdigest() -def __handle_databus_collection__(endpoint, uri: str)-> str: +def __handle_databus_collection__(uri: str) -> str: headers = {"Accept": "text/sparql"} return requests.get(uri, headers=headers).text -def __download_list__(urls: List[str], localDir: str): +def __get_json_ld_from_databus__(uri: str) -> str: + headers = {"Accept": "application/ld+json"} + return requests.get(uri, headers=headers).text + + +def __download_list__(urls: List[str], + localDir: str, + vault_token_file: str = None, + auth_url: str = None, + client_id: str = None) -> None: for url in urls: - __download_file__(url=url,filename=localDir+"/"+wsha256(url)) + if localDir is None: + host, account, group, artifact, version, file = __get_databus_id_parts__(url) + localDir = os.path.join(os.getcwd(), account, group, artifact, version if version is not None else "latest") + print(f"Local directory not given, using {localDir}") + + file = url.split("/")[-1] + filename = os.path.join(localDir, file) + print("\n") + __download_file__(url=url, filename=filename, vault_token_file=vault_token_file, auth_url=auth_url, client_id=client_id) + print("\n") + + +def __get_databus_id_parts__(uri: str) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str], Optional[str], Optional[str]]: + uri = uri.removeprefix("https://").removeprefix("http://") + parts = uri.strip("/").split("/") + parts += [None] * (6 - len(parts)) # pad with None if less than 6 parts + return tuple(parts[:6]) # return only the first 6 parts def download( localDir: str, endpoint: str, - databusURIs: List[str] + databusURIs: List[str], + token=None, + auth_url=None, + client_id=None ) -> None: """ - Download datasets to local storage from databus registry + Download datasets to local storage from databus registry. If download is on vault, vault token will be used for downloading protected files. ------ localDir: the local directory + endpoint: the databus endpoint URL databusURIs: identifiers to access databus registered datasets + token: Path to Vault refresh token file + auth_url: Keycloak token endpoint URL + client_id: Client ID for token exchange """ + + # TODO: make pretty for databusURI in databusURIs: + host, account, group, artifact, version, file = __get_databus_id_parts__(databusURI) + # dataID or databus collection if databusURI.startswith("http://") or databusURI.startswith("https://"): + # Auto-detect sparql endpoint from databusURI if not given -> no need to specify endpoint (--databus) + if endpoint is None: + endpoint = f"https://{host}/sparql" + print(f"SPARQL endpoint {endpoint}") + # databus collection - if "/collections/" in databusURI: #TODO "in" is not safe! there could be an artifact named collections, need to check for the correct part position in the URI - query = __handle_databus_collection__(endpoint,databusURI) - res = __handle__databus_file_query__(endpoint, query) + if "/collections/" in databusURI: # TODO "in" is not safe! 
there could be an artifact named collections, need to check for the correct part position in the URI + query = __handle_databus_collection__(databusURI) + res = __handle_databus_file_query__(endpoint, query) + __download_list__(res, localDir, vault_token_file=token, auth_url=auth_url, client_id=client_id) + # databus file + elif file is not None: + __download_list__([databusURI], localDir, vault_token_file=token, auth_url=auth_url, client_id=client_id) + # databus artifact version + elif version is not None: + json_str = __get_json_ld_from_databus__(databusURI) + res = __handle_databus_artifact_version__(json_str) + __download_list__(res, localDir, vault_token_file=token, auth_url=auth_url, client_id=client_id) + # databus artifact + elif artifact is not None: + json_str = __get_json_ld_from_databus__(databusURI) + latest = __get_databus_latest_version_of_artifact__(json_str) + print(f"No version given, using latest version: {latest}") + json_str = __get_json_ld_from_databus__(latest) + res = __handle_databus_artifact_version__(json_str) + __download_list__(res, localDir, vault_token_file=token, auth_url=auth_url, client_id=client_id) + + # databus group + elif group is not None: + json_str = __get_json_ld_from_databus__(databusURI) + artifacts = __get_databus_artifacts_of_group__(json_str) + for artifact_uri in artifacts: + print(f"Processing artifact {artifact_uri}") + json_str = __get_json_ld_from_databus__(artifact_uri) + latest = __get_databus_latest_version_of_artifact__(json_str) + print(f"No version given, using latest version: {latest}") + json_str = __get_json_ld_from_databus__(latest) + res = __handle_databus_artifact_version__(json_str) + __download_list__(res, localDir, vault_token_file=token, auth_url=auth_url, client_id=client_id) + + # databus account + elif account is not None: + print("accountId not supported yet") # TODO else: - print("dataId not supported yet") #TODO add support for other DatabusIds here (artifact, group, etc.) + print("dataId not supported yet") # TODO add support for other DatabusIds # query in local file elif databusURI.startswith("file://"): print("query in file not supported yet") # query as argument else: - print("QUERY {}", databusURI.replace("\n"," ")) - res = __handle__databus_file_query__(endpoint,databusURI) - __download_list__(res,localDir) \ No newline at end of file + print("QUERY:", databusURI.replace("\n", " ")) + if endpoint is None: # endpoint is required for queries (--databus) + raise ValueError("No endpoint given for query") + res = __handle_databus_file_query__(endpoint, databusURI) + __download_list__(res, localDir, vault_token_file=token, auth_url=auth_url, client_id=client_id) diff --git a/poetry.lock b/poetry.lock index 6add7d4..c5b6e69 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.2.1 and should not be changed by hand. [[package]] name = "black" version = "22.12.0" description = "The uncompromising code formatter."
optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "black-22.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9eedd20838bd5d75b80c9f5487dbcb06836a43833a37846cf1d8c1cc01cef59d"}, {file = "black-22.12.0-cp310-cp310-win_amd64.whl", hash = "sha256:159a46a4947f73387b4d83e87ea006dbb2337eab6c879620a3ba52699b1f4351"}, @@ -41,6 +42,7 @@ version = "2024.2.2" description = "Python package for providing Mozilla's CA Bundle." optional = false python-versions = ">=3.6" +groups = ["main"] files = [ {file = "certifi-2024.2.2-py3-none-any.whl", hash = "sha256:dc383c07b76109f368f6106eee2b593b04a011ea4d55f652c6ca24a754d1cdd1"}, {file = "certifi-2024.2.2.tar.gz", hash = "sha256:0569859f95fc761b18b45ef421b1290a0f65f147e92a1e5eb3e635f9a5e4e66f"}, @@ -52,6 +54,7 @@ version = "3.3.2" description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet." optional = false python-versions = ">=3.7.0" +groups = ["main"] files = [ {file = "charset-normalizer-3.3.2.tar.gz", hash = "sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5"}, {file = "charset_normalizer-3.3.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3"}, @@ -151,6 +154,7 @@ version = "8.1.7" description = "Composable command line interface toolkit" optional = false python-versions = ">=3.7" +groups = ["main", "dev"] files = [ {file = "click-8.1.7-py3-none-any.whl", hash = "sha256:ae74fb96c20a0277a1d615f1e4d73c8414f5a98db8b799a7931d1582f3390c28"}, {file = "click-8.1.7.tar.gz", hash = "sha256:ca9853ad459e787e2192211578cc907e7594e294c7ccc834310722b41b9ca6de"}, @@ -165,10 +169,12 @@ version = "0.4.6" description = "Cross-platform colored terminal text." 
optional = false python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,!=3.4.*,!=3.5.*,!=3.6.*,>=2.7" +groups = ["main", "dev"] files = [ {file = "colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6"}, {file = "colorama-0.4.6.tar.gz", hash = "sha256:08695f5cb7ed6e0531a20572697297273c47b8cae5a63ffc6d6ed5c201be6e44"}, ] +markers = {main = "platform_system == \"Windows\"", dev = "platform_system == \"Windows\" or sys_platform == \"win32\""} [[package]] name = "exceptiongroup" @@ -176,6 +182,8 @@ version = "1.2.0" description = "Backport of PEP 654 (exception groups)" optional = false python-versions = ">=3.7" +groups = ["dev"] +markers = "python_version < \"3.11\"" files = [ {file = "exceptiongroup-1.2.0-py3-none-any.whl", hash = "sha256:4bfd3996ac73b41e9b9628b04e079f193850720ea5945fc96a08633c66912f14"}, {file = "exceptiongroup-1.2.0.tar.gz", hash = "sha256:91f5c769735f051a4290d52edd0858999b57e5876e9f85937691bd4c9fa3ed68"}, @@ -190,6 +198,7 @@ version = "3.6" description = "Internationalized Domain Names in Applications (IDNA)" optional = false python-versions = ">=3.5" +groups = ["main"] files = [ {file = "idna-3.6-py3-none-any.whl", hash = "sha256:c05567e9c24a6b9faaa835c4821bad0590fbb9d5779e7caa6e1cc4978e7eb24f"}, {file = "idna-3.6.tar.gz", hash = "sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca"}, @@ -201,6 +210,7 @@ version = "2.0.0" description = "brain-dead simple config-ini parsing" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, @@ -208,24 +218,24 @@ files = [ [[package]] name = "isodate" -version = "0.6.1" +version = "0.7.2" description = "An ISO 8601 date/time/duration parser and formatter" optional = false -python-versions = "*" +python-versions = ">=3.7" +groups = ["main"] +markers = "python_version < \"3.11\"" files = [ - {file = "isodate-0.6.1-py2.py3-none-any.whl", hash = "sha256:0751eece944162659049d35f4f549ed815792b38793f07cf73381c1c87cbed96"}, - {file = "isodate-0.6.1.tar.gz", hash = "sha256:48c5881de7e8b0a0d648cb024c8062dc84e7b840ed81e864c7614fd3c127bde9"}, + {file = "isodate-0.7.2-py3-none-any.whl", hash = "sha256:28009937d8031054830160fce6d409ed342816b543597cece116d966c6d99e15"}, + {file = "isodate-0.7.2.tar.gz", hash = "sha256:4cd1aa0f43ca76f4a6c6c0292a85f40b35ec2e43e315b59f06e6d32171a953e6"}, ] -[package.dependencies] -six = "*" - [[package]] name = "mypy-extensions" version = "1.0.0" description = "Type system extensions for programs checked with the mypy type checker." 
optional = false python-versions = ">=3.5" +groups = ["dev"] files = [ {file = "mypy_extensions-1.0.0-py3-none-any.whl", hash = "sha256:4392f6c0eb8a5668a69e23d168ffa70f0be9ccfd32b5cc2d26a34ae5b844552d"}, {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, @@ -237,6 +247,7 @@ version = "23.2" description = "Core utilities for Python packages" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "packaging-23.2-py3-none-any.whl", hash = "sha256:8c491190033a9af7e1d931d0b5dacc2ef47509b34dd0de67ed209b5203fc88c7"}, {file = "packaging-23.2.tar.gz", hash = "sha256:048fb0e9405036518eaaf48a55953c750c11e1a1b68e0dd1a9d62ed0c092cfc5"}, @@ -248,6 +259,7 @@ version = "0.12.1" description = "Utility library for gitignore style pattern matching of file paths." optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "pathspec-0.12.1-py3-none-any.whl", hash = "sha256:a0d503e138a4c123b27490a4f7beda6a01c6f288df0e4a8b79c7eb0dc7b4cc08"}, {file = "pathspec-0.12.1.tar.gz", hash = "sha256:a482d51503a1ab33b1c67a6c3813a26953dbdc71c31dacaef9a838c4e29f5712"}, @@ -259,6 +271,7 @@ version = "4.2.0" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a \"user data dir\"." optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "platformdirs-4.2.0-py3-none-any.whl", hash = "sha256:0614df2a2f37e1a662acbd8e2b25b92ccf8632929bc6d43467e17fe89c75e068"}, {file = "platformdirs-4.2.0.tar.gz", hash = "sha256:ef0cc731df711022c174543cb70a9b5bd22e5a9337c8624ef2c2ceb8ddad8768"}, @@ -274,6 +287,7 @@ version = "1.4.0" description = "plugin and hook calling mechanisms for python" optional = false python-versions = ">=3.8" +groups = ["dev"] files = [ {file = "pluggy-1.4.0-py3-none-any.whl", hash = "sha256:7db9f7b503d67d1c5b95f59773ebb58a8c1c288129a88665838012cfb07b8981"}, {file = "pluggy-1.4.0.tar.gz", hash = "sha256:8c85c2876142a764e5b7548e7d9a0e0ddb46f5185161049a79b7e974454223be"}, @@ -289,6 +303,7 @@ version = "3.1.1" description = "pyparsing module - Classes and methods to define and execute parsing grammars" optional = false python-versions = ">=3.6.8" +groups = ["main"] files = [ {file = "pyparsing-3.1.1-py3-none-any.whl", hash = "sha256:32c7c0b711493c72ff18a981d24f28aaf9c1fb7ed5e9667c9e84e3db623bdbfb"}, {file = "pyparsing-3.1.1.tar.gz", hash = "sha256:ede28a1a32462f5a9705e07aea48001a08f7cf81a021585011deba701581a0db"}, @@ -303,6 +318,7 @@ version = "7.4.4" description = "pytest: simple powerful testing with Python" optional = false python-versions = ">=3.7" +groups = ["dev"] files = [ {file = "pytest-7.4.4-py3-none-any.whl", hash = "sha256:b090cdf5ed60bf4c45261be03239c2c1c22df034fbffe691abe93cd80cea01d8"}, {file = "pytest-7.4.4.tar.gz", hash = "sha256:2cf0005922c6ace4a3e2ec8b4080eb0d9753fdc93107415332f50ce9e7994280"}, @@ -321,24 +337,26 @@ testing = ["argcomplete", "attrs (>=19.2.0)", "hypothesis (>=3.56)", "mock", "no [[package]] name = "rdflib" -version = "7.0.0" +version = "7.2.1" description = "RDFLib is a Python library for working with RDF, a simple yet powerful language for representing information." 
optional = false -python-versions = ">=3.8.1,<4.0.0" +python-versions = ">=3.8.1" +groups = ["main"] files = [ - {file = "rdflib-7.0.0-py3-none-any.whl", hash = "sha256:0438920912a642c866a513de6fe8a0001bd86ef975057d6962c79ce4771687cd"}, - {file = "rdflib-7.0.0.tar.gz", hash = "sha256:9995eb8569428059b8c1affd26b25eac510d64f5043d9ce8c84e0d0036e995ae"}, + {file = "rdflib-7.2.1-py3-none-any.whl", hash = "sha256:1a175bc1386a167a42fbfaba003bfa05c164a2a3ca3cb9c0c97f9c9638ca6ac2"}, + {file = "rdflib-7.2.1.tar.gz", hash = "sha256:cf9b7fa25234e8925da8b1fb09700f8349b5f0f100e785fb4260e737308292ac"}, ] [package.dependencies] -isodate = ">=0.6.0,<0.7.0" +isodate = {version = ">=0.7.2,<1.0.0", markers = "python_version < \"3.11\""} pyparsing = ">=2.1.0,<4" [package.extras] berkeleydb = ["berkeleydb (>=18.1.0,<19.0.0)"] -html = ["html5lib (>=1.0,<2.0)"] -lxml = ["lxml (>=4.3.0,<5.0.0)"] -networkx = ["networkx (>=2.0.0,<3.0.0)"] +html = ["html5rdf (>=1.2,<2)"] +lxml = ["lxml (>=4.3,<6.0)"] +networkx = ["networkx (>=2,<4)"] +orjson = ["orjson (>=3.9.14,<4)"] [[package]] name = "requests" @@ -346,6 +364,7 @@ version = "2.31.0" description = "Python HTTP for Humans." optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "requests-2.31.0-py3-none-any.whl", hash = "sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f"}, {file = "requests-2.31.0.tar.gz", hash = "sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1"}, @@ -361,23 +380,13 @@ urllib3 = ">=1.21.1,<3" socks = ["PySocks (>=1.5.6,!=1.5.7)"] use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] -[[package]] -name = "six" -version = "1.16.0" -description = "Python 2 and 3 compatibility utilities" -optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*" -files = [ - {file = "six-1.16.0-py2.py3-none-any.whl", hash = "sha256:8abb2f1d86890a2dfb989f9a77cfcfd3e47c2a354b01111771326f8aa26e0254"}, - {file = "six-1.16.0.tar.gz", hash = "sha256:1e61c37477a1626458e36f7b1d82aa5c9b094fa4802892072e49de9c60c4c926"}, -] - [[package]] name = "sparqlwrapper" version = "2.0.0" description = "SPARQL Endpoint interface to Python" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "SPARQLWrapper-2.0.0-py3-none-any.whl", hash = "sha256:c99a7204fff676ee28e6acef327dc1ff8451c6f7217dcd8d49e8872f324a8a20"}, {file = "SPARQLWrapper-2.0.0.tar.gz", hash = "sha256:3fed3ebcc77617a4a74d2644b86fd88e0f32e7f7003ac7b2b334c026201731f1"}, @@ -398,6 +407,8 @@ version = "2.0.1" description = "A lil' TOML parser" optional = false python-versions = ">=3.7" +groups = ["dev"] +markers = "python_full_version < \"3.11.0a7\"" files = [ {file = "tomli-2.0.1-py3-none-any.whl", hash = "sha256:939de3e7a6161af0c887ef91b7d41a53e7c5a1ca976325f429cb46ea9bc30ecc"}, {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, @@ -409,6 +420,7 @@ version = "4.66.2" description = "Fast, Extensible Progress Meter" optional = false python-versions = ">=3.7" +groups = ["main"] files = [ {file = "tqdm-4.66.2-py3-none-any.whl", hash = "sha256:1ee4f8a893eb9bef51c6e35730cebf234d5d0b6bd112b0271e10ed7c24a02bd9"}, {file = "tqdm-4.66.2.tar.gz", hash = "sha256:6cd52cdf0fef0e0f543299cfc96fec90d7b8a7e88745f411ec33eb44d5ed3531"}, @@ -423,32 +435,14 @@ notebook = ["ipywidgets (>=6)"] slack = ["slack-sdk"] telegram = ["requests"] -[[package]] -name = "typer" -version = "0.6.1" -description = "Typer, build great CLIs. Easy to code. Based on Python type hints." 
-optional = false -python-versions = ">=3.6" -files = [ - {file = "typer-0.6.1-py3-none-any.whl", hash = "sha256:54b19e5df18654070a82f8c2aa1da456a4ac16a2a83e6dcd9f170e291c56338e"}, - {file = "typer-0.6.1.tar.gz", hash = "sha256:2d5720a5e63f73eaf31edaa15f6ab87f35f0690f8ca233017d7d23d743a91d73"}, -] - -[package.dependencies] -click = ">=7.1.1,<9.0.0" - -[package.extras] -all = ["colorama (>=0.4.3,<0.5.0)", "rich (>=10.11.0,<13.0.0)", "shellingham (>=1.3.0,<2.0.0)"] -dev = ["autoflake (>=1.3.1,<2.0.0)", "flake8 (>=3.8.3,<4.0.0)", "pre-commit (>=2.17.0,<3.0.0)"] -doc = ["mdx-include (>=1.4.1,<2.0.0)", "mkdocs (>=1.1.2,<2.0.0)", "mkdocs-material (>=8.1.4,<9.0.0)"] -test = ["black (>=22.3.0,<23.0.0)", "coverage (>=5.2,<6.0)", "isort (>=5.0.6,<6.0.0)", "mypy (==0.910)", "pytest (>=4.4.0,<5.4.0)", "pytest-cov (>=2.10.0,<3.0.0)", "pytest-sugar (>=0.9.4,<0.10.0)", "pytest-xdist (>=1.32.0,<2.0.0)", "rich (>=10.11.0,<13.0.0)", "shellingham (>=1.3.0,<2.0.0)"] - [[package]] name = "typing-extensions" version = "4.9.0" description = "Backported and Experimental Type Hints for Python 3.8+" optional = false python-versions = ">=3.8" +groups = ["dev"] +markers = "python_version == \"3.9\"" files = [ {file = "typing_extensions-4.9.0-py3-none-any.whl", hash = "sha256:af72aea155e91adfc61c3ae9e0e342dbc0cba726d6cba4b6c72c1f34e47291cd"}, {file = "typing_extensions-4.9.0.tar.gz", hash = "sha256:23478f88c37f27d76ac8aee6c905017a143b0b1b886c3c9f66bc2fd94f9f5783"}, @@ -460,18 +454,19 @@ version = "2.2.0" description = "HTTP library with thread-safe connection pooling, file post, and more." optional = false python-versions = ">=3.8" +groups = ["main"] files = [ {file = "urllib3-2.2.0-py3-none-any.whl", hash = "sha256:ce3711610ddce217e6d113a2732fafad960a03fd0318c91faa79481e35c11224"}, {file = "urllib3-2.2.0.tar.gz", hash = "sha256:051d961ad0c62a94e50ecf1af379c3aba230c66c710493493560c0c223c49f20"}, ] [package.extras] -brotli = ["brotli (>=1.0.9)", "brotlicffi (>=0.8.0)"] +brotli = ["brotli (>=1.0.9) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\""] h2 = ["h2 (>=4,<5)"] socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"] zstd = ["zstandard (>=0.18.0)"] [metadata] -lock-version = "2.0" +lock-version = "2.1" python-versions = "^3.9" -content-hash = "6380be6ddc03d3f38ddd1f923f18a24ca9fb385753e0f47ef4549ed8dc933f3e" +content-hash = "6f798ca5bc7629dc0668179934c9889c0d971743c1b162ae1387bd0c5a349d94" diff --git a/pyproject.toml b/pyproject.toml index 016518e..0d32ee1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,20 +8,18 @@ readme = "README.md" [tool.poetry.dependencies] python = "^3.9" -typer = "^0.6.1" +click = "^8.0.4" requests = "^2.28.1" tqdm = "^4.42.1" SPARQLWrapper = "^2.0.0" - - -[tool.poetry.dev-dependencies] -black = "^22.6.0" +rdflib = "^7.2.1" [tool.poetry.group.dev.dependencies] +black = "^22.6.0" pytest = "^7.1.3" [tool.poetry.scripts] -databusclient = "databusclient:run" +databusclient = "databusclient.cli:app" [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/test.sh b/test.sh index 0a4c096..f590198 100755 --- a/test.sh +++ b/test.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash databusclient deploy \ - --versionid "https://d8lr.tools.dbpedia.org/hopver/testGroup/testArtifact/1.0-alpha/" \ + --version-id "https://d8lr.tools.dbpedia.org/hopver/testGroup/testArtifact/1.0-alpha/" \ --title "Test Title" \ --abstract "Test Abstract" \ --description "Test Description" \ diff --git a/tests/test_download.py b/tests/test_download.py index 
41909b1..6a1a72e 100644 --- a/tests/test_download.py +++ b/tests/test_download.py @@ -5,16 +5,19 @@ DEFAULT_ENDPOINT="https://databus.dbpedia.org/sparql" TEST_QUERY=""" PREFIX dcat: <http://www.w3.org/ns/dcat#> +PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> -SELECT ?x WHERE { - ?sub dcat:downloadURL ?x . -} LIMIT 10 +SELECT ?file +WHERE { + ?file dcat:downloadURL ?url ; + dcat:byteSize ?size . + FILTER(STRSTARTS(STR(?file), "https://databus.dbpedia.org/dbpedia/")) + FILTER(xsd:integer(?size) < 104857600) +} +LIMIT 10 """ TEST_COLLECTION="https://databus.dbpedia.org/dbpedia/collections/dbpedia-snapshot-2022-12" def test_with_query(): - cl.download("tmp",DEFAULT_ENDPOINT,[TEST_QUERY] - -) + cl.download("tmp",DEFAULT_ENDPOINT,[TEST_QUERY]) def test_with_collection(): cl.download("tmp",DEFAULT_ENDPOINT,[TEST_COLLECTION]) \ No newline at end of file