diff --git a/CHANGELOG.md b/CHANGELOG.md
index c86fc78..cd8d40d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -22,3 +22,11 @@ All notable changes to this project will be documented in this file.
 ### Notes
 - Version 0.15 skips 0.13 and 0.14 as requested in issue #35
 - This release updates the PyPI package to align with current repository features
+
+
+## [Unreleased]
+
+### Added
+- Add `-v/--verbose` global CLI option to enable redacted HTTP request/response logging for debugging (`databusclient -v ...`)
+- Ensure `Authorization` and `X-API-KEY` headers are redacted in verbose output
+- Add unit tests and README documentation for verbose mode
diff --git a/PR_BODY.md b/PR_BODY.md
new file mode 100644
index 0000000..02b5221
--- /dev/null
+++ b/PR_BODY.md
@@ -0,0 +1,19 @@
+Title: Add verbose CLI flag (-v) with redacted HTTP logging
+
+Short description:
+- Add a global `-v/--verbose` CLI flag that enables redacted HTTP request/response logging to help debug interactions with the Databus and Vault.
+
+What changed:
+- Add a global `-v/--verbose` option to the `databusclient` CLI and propagate it to API calls.
+- Implement a redacted HTTP logging helper (redacts `Authorization` and `X-API-KEY` headers).
+- Instrument `download` and the Vault token exchange flows to print HTTP request/response details when `-v` is enabled.
+- Add unit tests ensuring verbose logs are printed and sensitive tokens are redacted.
+- Update `README.md` and add a `CHANGELOG.md` entry.
+
+Why:
+- Provides safe, actionable debugging output for issues involving HTTP communication and auth problems without exposing secrets.
+
+Security note:
+- `Authorization` and API-key headers are redacted in verbose output. Avoid enabling verbose output in public CI logs.
+
+Closes #27
diff --git a/README.md b/README.md
index 171590c..05d5c5b 100644
--- a/README.md
+++ b/README.md
@@ -50,10 +50,23 @@ python3 -m pip install --upgrade databusclient==0.15
 You can then use the client in the command line:
 
 ```bash
+# Python
 databusclient --help
-databusclient deploy --help
-databusclient delete --help
-databusclient download --help
+
+# Example output:
+# Usage: databusclient [OPTIONS] COMMAND [ARGS]...
+#
+# Options:
+#   --install-completion [bash|zsh|fish|powershell|pwsh]  Install completion for the specified shell.
+#   --show-completion [bash|zsh|fish|powershell|pwsh]     Show completion for the specified shell.
+#   --help                                                Show this message and exit.
+#
+# Commands:
+#   deploy
+#   download
+#   delete
+#   mkdist
+#   completion
 ```
 
 ### Docker
@@ -172,6 +188,16 @@ docker run --rm -v $(pwd):/data dbpedia/databus-python-client download $DOWNLOAD
 - If the dataset/files to be downloaded require vault authentication, you need to provide a vault token with `--vault-token /path/to/vault-token.dat`. See [Registration (Access Token)](#registration-access-token) for details on how to get a vault token. Note: Vault tokens are only required for certain protected Databus hosts (for example: `data.dbpedia.io`, `data.dev.dbpedia.link`). The client now detects those hosts and will fail early with a clear message if a token is required but not provided. Do not pass `--vault-token` for public downloads.
+- `-v, --verbose`
+  - Enable verbose HTTP request/response output for debugging. Headers that may contain secrets (for example `Authorization` and `X-API-KEY`) are redacted in the output. Use with caution and avoid enabling in public CI logs.
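+  - Example (hypothetical URI; the output shape follows the `log_http` helper, and `X-API-KEY` only appears when `--databus-key` is passed):
+
+    ```bash
+    databusclient -v download https://databus.dbpedia.org/user1/group1/artifact1/1.0 --databus-key YOUR_API_KEY
+    # [HTTP] GET https://databus.dbpedia.org/user1/group1/artifact1/1.0
+    #   Req headers: {'Accept': 'application/ld+json', 'X-API-KEY': 'REDACTED'}
+    #   Status: 200
+    ```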
 - `--databus-key`
   - If the databus is protected and needs API key authentication, you can provide the API key with `--databus-key YOUR_API_KEY`.
@@ -289,7 +307,7 @@ Usage: databusclient deploy [OPTIONS] [DISTRIBUTIONS]...
   - Upload & deploy via Nextcloud (--webdav-url, --remote, --path)
 
 Options:
-  --version-id TEXT    Target databus version/dataset identifier of the form  [required]
   --title TEXT         Dataset title  [required]
@@ -303,6 +321,6 @@ Options:
   --remote TEXT        rclone remote name (e.g., 'nextcloud')
   --path TEXT          Remote path on Nextcloud (e.g., 'datasets/mydataset')
   --help               Show this message and exit.
 ```
 
 ### Mode 1: Classic Deploy (Distributions)
@@ -326,6 +356,6 @@ docker run --rm -v $(pwd):/data dbpedia/databus-python-client deploy \
   --license http://dalicc.net/licenselibrary/AdaptivePublicLicense10 \
   --apikey MYSTERIOUS \
   'https://raw.githubusercontent.com/dbpedia/databus/master/server/app/api/swagger.yml|type=swagger'
 ```
 
 A few more notes for CLI usage:
@@ -342,6 +373,6 @@ All files referenced there will be registered on the Databus.
 
 ```bash
 # Python
 databusclient deploy \
   --metadata ./metadata.json \
   --version-id https://databus.dbpedia.org/user1/group1/artifact1/1.0 \
   --title "Metadata Deploy Example" \
@@ -353,6 +388,6 @@ databusclient deploy \
 docker run --rm -v $(pwd):/data dbpedia/databus-python-client deploy \
   --metadata ./metadata.json \
   --version-id https://databus.dbpedia.org/user1/group1/artifact1/1.0 \
   --title "Metadata Deploy Example" \
   --abstract "This is a short abstract of the dataset." \
   --description "This dataset was uploaded using metadata.json." \
@@ -388,6 +424,6 @@ databusclient deploy \
   --webdav-url https://cloud.example.com/remote.php/webdav \
   --remote nextcloud \
   --path datasets/mydataset \
   --version-id https://databus.dbpedia.org/user1/group1/artifact1/1.0 \
   --title "Test Dataset" \
   --abstract "Short abstract of dataset" \
@@ -402,6 +441,6 @@ docker run --rm -v $(pwd):/data dbpedia/databus-python-client deploy \
   --remote nextcloud \
   --path datasets/mydataset \
   --version-id https://databus.dbpedia.org/user1/group1/artifact1/1.0 \
   --title "Test Dataset" \
   --abstract "Short abstract of dataset" \
   --description "This dataset was uploaded for testing the Nextcloud → Databus pipeline."
@@ -487,6 +527,27 @@ databusclient delete https://databus.dbpedia.org/dbpedia/collections/dbpedia-sna
 docker run --rm -v $(pwd):/data dbpedia/databus-python-client delete https://databus.dbpedia.org/dbpedia/collections/dbpedia-snapshot-2022-12 --databus-key YOUR_API_KEY
 ```
 
+### mkdist command
+
+Create a distribution string from components.
+
+Usage:
+```
+databusclient mkdist URL --cv key=value --cv key2=value2 --format ttl --compression gz --sha-length SHA256HEX:LENGTH
+```
+
+Example:
+```
+python -m databusclient mkdist "https://example.org/file.ttl" --cv lang=en --cv part=sorted --format ttl --compression gz --sha-length aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa:12345
+```
+
+## Completion
+
+Enable shell completion (bash example):
+```
+eval "$(_DATABUSCLIENT_COMPLETE=source_bash python -m databusclient)"
+```
+
 ## Module Usage
diff --git a/databusclient/api/download.py b/databusclient/api/download.py
index f045ce2..373e5f9 100644
--- a/databusclient/api/download.py
+++ b/databusclient/api/download.py
@@ -6,6 +6,9 @@ import requests
 from SPARQLWrapper import JSON, SPARQLWrapper
 from tqdm import tqdm
 
+import logging
+
+logger = logging.getLogger("databusclient")
 
 from databusclient.api.utils import (
     fetch_databus_jsonld,
@@ -32,16 +35,19 @@ def _download_file(
     databus_key=None,
     auth_url=None,
     client_id=None,
+    verbose=False,
 ) -> None:
-    """Download a file from the internet with a progress bar using tqdm.
-
-    Args:
-        url: The URL of the file to download.
-        localDir: Local directory to download file to. If None, the databus folder structure is created in the current working directory.
-        vault_token_file: Path to Vault refresh token file.
-        databus_key: Databus API key for protected downloads.
-        auth_url: Keycloak token endpoint URL.
-        client_id: Client ID for token exchange.
+    """
+    Download a file from the internet with a progress bar using tqdm.
+
+    Parameters:
+    - url: the URL of the file to download
+    - localDir: Local directory to download file to. If None, the databus folder structure is created in the current working directory.
+    - vault_token_file: Path to Vault refresh token file
+    - databus_key: Databus API key for protected downloads
+    - auth_url: Keycloak token endpoint URL
+    - client_id: Client ID for token exchange
+    - verbose: when True, print redacted HTTP request/response details
     """
     if localDir is None:
         _host, account, group, artifact, version, file = (
@@ -66,7 +72,15 @@ def _download_file(
     headers = {}
 
     # --- 1a.
public databus --- + if verbose or logger.isEnabledFor(logging.DEBUG): + from databusclient.api.utils import log_http + + log_http("HEAD", url, req_headers=headers) response = requests.head(url, timeout=30, allow_redirects=False) + if verbose or logger.isEnabledFor(logging.DEBUG): + from databusclient.api.utils import log_http + + log_http("HEAD", url, req_headers=headers, status=response.status_code, resp_headers=response.headers) # Check for redirect and update URL if necessary if response.headers.get("Location") and response.status_code in [ @@ -107,9 +121,17 @@ def _download_file( headers["Accept-Encoding"] = ( "identity" # disable gzip to get correct content-length ) + if verbose or logger.isEnabledFor(logging.DEBUG): + from databusclient.api.utils import log_http + + log_http("GET", url, req_headers=headers) response = requests.get( url, headers=headers, stream=True, allow_redirects=True, timeout=30 ) + if verbose or logger.isEnabledFor(logging.DEBUG): + from databusclient.api.utils import log_http + + log_http("GET", url, req_headers=headers, status=response.status_code, resp_headers=response.headers) www = response.headers.get("WWW-Authenticate", "") # Check if authentication is required # --- 3. Handle authentication responses --- @@ -135,12 +157,20 @@ def _download_file( # for known hosts. __get_vault_access__ handles reading the refresh # token and exchanging it; errors are translated to DownloadAuthError # for user-friendly CLI output. - vault_token = __get_vault_access__(url, vault_token_file, auth_url, client_id) + vault_token = __get_vault_access__(url, vault_token_file, auth_url, client_id, verbose=verbose) headers["Authorization"] = f"Bearer {vault_token}" headers.pop("Accept-Encoding", None) # Retry with token + if verbose or logger.isEnabledFor(logging.DEBUG): + from databusclient.api.utils import log_http + + log_http("GET", url, req_headers=headers) response = requests.get(url, headers=headers, stream=True, timeout=30) + if verbose or logger.isEnabledFor(logging.DEBUG): + from databusclient.api.utils import log_http + + log_http("GET", url, req_headers=headers, status=response.status_code, resp_headers=response.headers) # Map common auth failures to friendly messages if response.status_code == 401: @@ -190,16 +220,19 @@ def _download_files( databus_key: str = None, auth_url: str = None, client_id: str = None, + verbose: bool = False, ) -> None: - """Download multiple files from the databus. - - Args: - urls: List of file download URLs. - localDir: Local directory to download files to. If None, the databus folder structure is created in the current working directory. - vault_token_file: Path to Vault refresh token file. - databus_key: Databus API key for protected downloads. - auth_url: Keycloak token endpoint URL. - client_id: Client ID for token exchange. + """ + Download multiple files from the databus. + + Parameters: + - urls: List of file download URLs + - localDir: Local directory to download files to. If None, the databus folder structure is created in the current working directory. 
+ - vault_token_file: Path to Vault refresh token file + - databus_key: Databus API key for protected downloads + - auth_url: Keycloak token endpoint URL + - client_id: Client ID for token exchange + - verbose: when True, print redacted HTTP request/response details """ for url in urls: _download_file( @@ -209,39 +242,59 @@ def _download_files( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + verbose=verbose, ) -def _get_sparql_query_of_collection(uri: str, databus_key: str | None = None) -> str: - """Get SPARQL query of collection members from databus collection URI. +def _get_sparql_query_of_collection(uri: str, databus_key: str | None = None, verbose: bool = False) -> str: + """ + Get SPARQL query of collection members from databus collection URI. - Args: - uri: The full databus collection URI. - databus_key: Optional Databus API key for authentication on protected resources. + Parameters: + - uri: The full databus collection URI + - databus_key: Optional Databus API key for authentication on protected resources + - verbose: when True, print redacted HTTP request/response details Returns: - SPARQL query string to get download URLs of all files in the collection. + SPARQL query string to get download URLs of all files in the collection. """ headers = {"Accept": "text/sparql"} if databus_key is not None: headers["X-API-KEY"] = databus_key + if verbose: + from databusclient.api.utils import log_http + + log_http("GET", uri, req_headers=headers) response = requests.get(uri, headers=headers, timeout=30) + if verbose: + from databusclient.api.utils import log_http + + log_http("GET", uri, req_headers=headers, status=response.status_code, resp_headers=response.headers) + response.raise_for_status() return response.text -def _query_sparql_endpoint(endpoint_url, query, databus_key=None) -> dict: - """Query a SPARQL endpoint and return results in JSON format. +def _query_sparql_endpoint(endpoint_url, query, databus_key=None, verbose: bool = False) -> dict: + """ + Query a SPARQL endpoint and return results in JSON format. - Args: - endpoint_url: The URL of the SPARQL endpoint. - query: The SPARQL query string. - databus_key: Optional API key for authentication. + Parameters: + - endpoint_url: the URL of the SPARQL endpoint + - query: the SPARQL query string + - databus_key: Optional API key for authentication + - verbose: when True, print redacted HTTP request/response details Returns: - Dictionary containing the query results. + - Dictionary containing the query results """ + if verbose: + from databusclient.api.utils import log_http + + headers = {"X-API-KEY": databus_key} if databus_key is not None else None + log_http("POST", endpoint_url, req_headers=headers) + sparql = SPARQLWrapper(endpoint_url) sparql.method = "POST" sparql.setQuery(query) @@ -249,23 +302,31 @@ def _query_sparql_endpoint(endpoint_url, query, databus_key=None) -> dict: if databus_key is not None: sparql.setCustomHttpHeaders({"X-API-KEY": databus_key}) results = sparql.query().convert() + + if verbose: + from databusclient.api.utils import log_http + + log_http("POST", endpoint_url, req_headers={"X-API-KEY": databus_key} if databus_key is not None else None, status=200) + return results def _get_file_download_urls_from_sparql_query( - endpoint_url, query, databus_key=None + endpoint_url, query, databus_key=None, verbose: bool = False ) -> List[str]: - """Execute a SPARQL query to get databus file download URLs. + """ + Execute a SPARQL query to get databus file download URLs. 
- Args: - endpoint_url: The URL of the SPARQL endpoint. - query: The SPARQL query string. - databus_key: Optional API key for authentication. + Parameters: + - endpoint_url: the URL of the SPARQL endpoint + - query: the SPARQL query string + - databus_key: Optional API key for authentication + - verbose: when True, print redacted HTTP request/response details Returns: - List of file download URLs. + - List of file download URLs """ - result_dict = _query_sparql_endpoint(endpoint_url, query, databus_key=databus_key) + result_dict = _query_sparql_endpoint(endpoint_url, query, databus_key=databus_key, verbose=verbose) bindings = result_dict.get("results", {}).get("bindings") if not isinstance(bindings, list): @@ -289,7 +350,7 @@ def _get_file_download_urls_from_sparql_query( def __get_vault_access__( - download_url: str, token_file: str, auth_url: str, client_id: str + download_url: str, token_file: str, auth_url: str, client_id: str, verbose: bool = False ) -> str: """ Get Vault access token for a protected databus download. @@ -302,7 +363,8 @@ def __get_vault_access__( with open(token_file, "r") as f: refresh_token = f.read().strip() if len(refresh_token) < 80: - print(f"Warning: token from {token_file} is short (<80 chars)") + logger.warning("Token from %s is short (<80 chars)", token_file) + # 2. Refresh token -> access token resp = requests.post( @@ -315,6 +377,10 @@ def __get_vault_access__( timeout=30, ) resp.raise_for_status() + if verbose or logger.isEnabledFor(logging.DEBUG): + from databusclient.api.utils import log_http + + log_http("POST", auth_url, req_headers={"client_id": client_id}, status=resp.status_code, resp_headers=resp.headers) access_token = resp.json()["access_token"] # 3. Extract host as audience @@ -339,9 +405,13 @@ def __get_vault_access__( timeout=30, ) resp.raise_for_status() + if verbose or logger.isEnabledFor(logging.DEBUG): + from databusclient.api.utils import log_http + + log_http("POST", auth_url, req_headers={"client_id": client_id, "audience": audience}, status=resp.status_code, resp_headers=resp.headers) vault_token = resp.json()["access_token"] - print(f"Using Vault access token for {download_url}") + logger.debug("Using Vault access token for %s", download_url) return vault_token @@ -353,21 +423,24 @@ def _download_collection( databus_key: str = None, auth_url: str = None, client_id: str = None, + verbose: bool = False, ) -> None: - """Download all files in a databus collection. - - Args: - uri: The full databus collection URI. - endpoint: The databus SPARQL endpoint URL. - localDir: Local directory to download files to. If None, the databus folder structure is created in the current working directory. - vault_token: Path to Vault refresh token file for protected downloads. - databus_key: Databus API key for protected downloads. - auth_url: Keycloak token endpoint URL. - client_id: Client ID for token exchange. """ - query = _get_sparql_query_of_collection(uri, databus_key=databus_key) + Download all files in a databus collection. + + Parameters: + - uri: The full databus collection URI + - endpoint: the databus SPARQL endpoint URL + - localDir: Local directory to download files to. If None, the databus folder structure is created in the current working directory. 
+ - vault_token: Path to Vault refresh token file for protected downloads + - databus_key: Databus API key for protected downloads + - auth_url: Keycloak token endpoint URL + - client_id: Client ID for token exchange + - verbose: when True, print redacted HTTP request/response details + """ + query = _get_sparql_query_of_collection(uri, databus_key=databus_key, verbose=verbose) file_urls = _get_file_download_urls_from_sparql_query( - endpoint, query, databus_key=databus_key + endpoint, query, databus_key=databus_key, verbose=verbose ) _download_files( list(file_urls), @@ -376,6 +449,7 @@ def _download_collection( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + verbose=verbose, ) @@ -386,18 +460,21 @@ def _download_version( databus_key: str = None, auth_url: str = None, client_id: str = None, + verbose: bool = False, ) -> None: - """Download all files in a databus artifact version. - - Args: - uri: The full databus artifact version URI. - localDir: Local directory to download files to. If None, the databus folder structure is created in the current working directory. - vault_token_file: Path to Vault refresh token file for protected downloads. - databus_key: Databus API key for protected downloads. - auth_url: Keycloak token endpoint URL. - client_id: Client ID for token exchange. """ - json_str = fetch_databus_jsonld(uri, databus_key=databus_key) + Download all files in a databus artifact version. + + Parameters: + - uri: The full databus artifact version URI + - localDir: Local directory to download files to. If None, the databus folder structure is created in the current working directory. + - vault_token_file: Path to Vault refresh token file for protected downloads + - databus_key: Databus API key for protected downloads + - auth_url: Keycloak token endpoint URL + - client_id: Client ID for token exchange + - verbose: when True, print redacted HTTP request/response details + """ + json_str = fetch_databus_jsonld(uri, databus_key=databus_key, verbose=verbose) file_urls = _get_file_download_urls_from_artifact_jsonld(json_str) _download_files( file_urls, @@ -406,6 +483,7 @@ def _download_version( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + verbose=verbose, ) @@ -417,25 +495,28 @@ def _download_artifact( databus_key: str = None, auth_url: str = None, client_id: str = None, + verbose: bool = False, ) -> None: - """Download files in a databus artifact. - - Args: - uri: The full databus artifact URI. - localDir: Local directory to download files to. If None, the databus folder structure is created in the current working directory. - all_versions: If True, download all versions of the artifact; otherwise, only download the latest version. - vault_token_file: Path to Vault refresh token file for protected downloads. - databus_key: Databus API key for protected downloads. - auth_url: Keycloak token endpoint URL. - client_id: Client ID for token exchange. """ - json_str = fetch_databus_jsonld(uri, databus_key=databus_key) + Download files in a databus artifact. + + Parameters: + - uri: The full databus artifact URI + - localDir: Local directory to download files to. If None, the databus folder structure is created in the current working directory. 
+ - all_versions: If True, download all versions of the artifact; otherwise, only download the latest version + - vault_token_file: Path to Vault refresh token file for protected downloads + - databus_key: Databus API key for protected downloads + - auth_url: Keycloak token endpoint URL + - client_id: Client ID for token exchange + - verbose: when True, print redacted HTTP request/response details + """ + json_str = fetch_databus_jsonld(uri, databus_key=databus_key, verbose=verbose) versions = _get_databus_versions_of_artifact(json_str, all_versions=all_versions) if isinstance(versions, str): versions = [versions] for version_uri in versions: print(f"Downloading version: {version_uri}") - json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key) + json_str = fetch_databus_jsonld(version_uri, databus_key=databus_key, verbose=verbose) file_urls = _get_file_download_urls_from_artifact_jsonld(json_str) _download_files( file_urls, @@ -444,21 +525,23 @@ def _download_artifact( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + verbose=verbose, ) def _get_databus_versions_of_artifact( json_str: str, all_versions: bool ) -> str | List[str]: - """Parse the JSON-LD of a databus artifact to extract URLs of its versions. + """ + Parse the JSON-LD of a databus artifact to extract URLs of its versions. - Args: - json_str: JSON-LD string of the databus artifact. - all_versions: If True, return all version URLs; otherwise, return only the latest version URL. + Parameters: + - json_str: JSON-LD string of the databus artifact + - all_versions: If True, return all version URLs; otherwise, return only the latest version URL Returns: - If all_versions is True: List of all version URLs. - If all_versions is False: URL of the latest version. + - If all_versions is True: List of all version URLs + - If all_versions is False: URL of the latest version """ json_dict = json.loads(json_str) versions = json_dict.get("databus:hasVersion") @@ -486,15 +569,15 @@ def _get_databus_versions_of_artifact( def _get_file_download_urls_from_artifact_jsonld(json_str: str) -> List[str]: - """Parse the JSON-LD of a databus artifact version to extract download URLs. - + """ + Parse the JSON-LD of a databus artifact version to extract download URLs. Don't get downloadURLs directly from the JSON-LD, but follow the "file" links to count access to databus accurately. - Args: - json_str: JSON-LD string of the databus artifact version. + Parameters: + - json_str: JSON-LD string of the databus artifact version Returns: - List of all file download URLs in the artifact version. + List of all file download URLs in the artifact version. """ databusIdUrl: List[str] = [] @@ -518,19 +601,22 @@ def _download_group( databus_key: str = None, auth_url: str = None, client_id: str = None, + verbose: bool = False, ) -> None: - """Download files in a databus group. - - Args: - uri: The full databus group URI. - localDir: Local directory to download files to. If None, the databus folder structure is created in the current working directory. - all_versions: If True, download all versions of each artifact in the group; otherwise, only download the latest version. - vault_token_file: Path to Vault refresh token file for protected downloads. - databus_key: Databus API key for protected downloads. - auth_url: Keycloak token endpoint URL. - client_id: Client ID for token exchange. """ - json_str = fetch_databus_jsonld(uri, databus_key=databus_key) + Download files in a databus group. 
+ + Parameters: + - uri: The full databus group URI + - localDir: Local directory to download files to. If None, the databus folder structure is created in the current working directory. + - all_versions: If True, download all versions of each artifact in the group; otherwise, only download the latest version + - vault_token_file: Path to Vault refresh token file for protected downloads + - databus_key: Databus API key for protected downloads + - auth_url: Keycloak token endpoint URL + - client_id: Client ID for token exchange + - verbose: when True, print redacted HTTP request/response details + """ + json_str = fetch_databus_jsonld(uri, databus_key=databus_key, verbose=verbose) artifacts = _get_databus_artifacts_of_group(json_str) for artifact_uri in artifacts: print(f"Download artifact: {artifact_uri}") @@ -542,6 +628,7 @@ def _download_group( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + verbose=verbose, ) @@ -588,19 +675,22 @@ def download( all_versions=None, auth_url="https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token", client_id="vault-token-exchange", + verbose: bool = False, ) -> None: - """Download datasets from databus. + """ + Download datasets from databus. Download of files, versions, artifacts, groups or databus collections via their databus URIs or user-defined SPARQL queries that return file download URLs. - Args: - localDir: Local directory to download datasets to. If None, the databus folder structure is created in the current working directory. - endpoint: The databus endpoint URL. If None, inferred from databusURI. Required for user-defined SPARQL queries. - databusURIs: Databus identifiers to specify datasets to download. - token: Path to Vault refresh token file for protected downloads. - databus_key: Databus API key for protected downloads. - auth_url: Keycloak token endpoint URL. Default is "https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token". - client_id: Client ID for token exchange. Default is "vault-token-exchange". + Parameters: + - localDir: Local directory to download datasets to. If None, the databus folder structure is created in the current working directory. + - endpoint: the databus endpoint URL. If None, inferred from databusURI. Required for user-defined SPARQL queries. + - databusURIs: databus identifiers to specify datasets to download. + - token: Path to Vault refresh token file for protected downloads + - databus_key: Databus API key for protected downloads + - auth_url: Keycloak token endpoint URL. Default is "https://auth.dbpedia.org/realms/dbpedia/protocol/openid-connect/token". + - client_id: Client ID for token exchange. Default is "vault-token-exchange". 
+ - verbose: when True, print redacted HTTP request/response details """ for databusURI in databusURIs: host, account, group, artifact, version, file = ( @@ -627,6 +717,7 @@ def download( databus_key, auth_url, client_id, + verbose=verbose, ) elif file is not None: print(f"Downloading file: {databusURI}") @@ -637,6 +728,7 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + verbose=verbose, ) elif version is not None: print(f"Downloading version: {databusURI}") @@ -647,6 +739,7 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + verbose=verbose, ) elif artifact is not None: print( @@ -660,6 +753,7 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + verbose=verbose, ) elif group is not None and group != "collections": print( @@ -673,6 +767,7 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + verbose=verbose, ) elif account is not None: print("accountId not supported yet") # TODO @@ -689,7 +784,7 @@ def download( if uri_endpoint is None: # endpoint is required for queries (--databus) raise ValueError("No endpoint given for query") res = _get_file_download_urls_from_sparql_query( - uri_endpoint, databusURI, databus_key=databus_key + uri_endpoint, databusURI, databus_key=databus_key, verbose=verbose ) _download_files( res, @@ -698,4 +793,5 @@ def download( databus_key=databus_key, auth_url=auth_url, client_id=client_id, + verbose=verbose, ) diff --git a/databusclient/api/utils.py b/databusclient/api/utils.py index 948268c..0f3f8ec 100644 --- a/databusclient/api/utils.py +++ b/databusclient/api/utils.py @@ -5,6 +5,7 @@ """ from typing import Optional, Tuple +import logging import requests @@ -19,42 +20,112 @@ def get_databus_id_parts_from_file_url( Optional[str], Optional[str], ]: - """Extract databus ID parts from a given databus URI. + """Split a Databus URI into its six parts. + + The returned tuple is (host, accountId, groupId, artifactId, versionId, fileId). + Missing parts are returned as ``None``. Args: - uri: The full databus URI of the form "http(s)://host/accountId/groupId/artifactId/versionId/fileId". + uri: The full databus URI of the form + "http(s)://host/accountId/groupId/artifactId/versionId/fileId". Returns: A tuple containing (host, accountId, groupId, artifactId, versionId, fileId). - Each element is a string or None if not present. - """ - """Split a Databus URI into its six parts. - - The returned tuple is (host, accountId, groupId, artifactId, versionId, fileId). - Missing parts are returned as ``None``. """ - uri = uri.removeprefix("https://").removeprefix("http://") parts = uri.strip("/").split("/") parts += [None] * (6 - len(parts)) # pad with None if less than 6 parts return tuple(parts[:6]) # return only the first 6 parts -def fetch_databus_jsonld(uri: str, databus_key: str | None = None) -> str: +def fetch_databus_jsonld( + uri: str, + databus_key: Optional[str] = None, + verbose: bool = False, +) -> str: """Fetch the JSON-LD representation of a Databus resource. Args: uri: Full Databus resource URI. databus_key: Optional API key for protected resources. + verbose: When True, log redacted HTTP request/response details. Returns: The response body as a string containing JSON-LD. 
""" - headers = {"Accept": "application/ld+json"} if databus_key is not None: headers["X-API-KEY"] = databus_key + + if verbose: + log_http("GET", uri, req_headers=headers) + response = requests.get(uri, headers=headers, timeout=30) - response.raise_for_status() + if verbose: + log_http( + "GET", + uri, + req_headers=headers, + status=response.status_code, + resp_headers=response.headers, + ) + + response.raise_for_status() return response.text + + +def _redact_headers(headers): + if not headers: + return headers + redacted = {} + for k, v in headers.items(): + key = k.lower() + if key == "authorization" or key.startswith("x-api-key"): + redacted[k] = "REDACTED" + else: + redacted[k] = v + return redacted + + +def log_http( + method, + url, + req_headers=None, + status=None, + resp_headers=None, + body_snippet=None, +): + """Log HTTP request/response details at DEBUG level with sanitized headers.""" + logger = logging.getLogger("databusclient") + msg_lines = [f"[HTTP] {method} {url}"] + + if req_headers: + msg_lines.append(f" Req headers: {_redact_headers(req_headers)}") + + if status is not None: + msg_lines.append(f" Status: {status}") + + if resp_headers: + try: + resp_dict = dict(resp_headers) + except Exception: + if hasattr(resp_headers, "items"): + try: + resp_dict = dict(resp_headers.items()) + except Exception: + resp_dict = {"headers": str(resp_headers)} + elif hasattr(resp_headers, "headers"): + try: + resp_dict = dict(getattr(resp_headers, "headers") or {}) + except Exception: + resp_dict = {"headers": str(resp_headers)} + else: + resp_dict = {"headers": str(resp_headers)} + + msg_lines.append(f" Resp headers: {_redact_headers(resp_dict)}") + + if body_snippet: + msg_lines.append(" Body preview: " + body_snippet[:500]) + + logger.debug("\n".join(msg_lines)) diff --git a/databusclient/cli.py b/databusclient/cli.py index 1a345f3..868eded 100644 --- a/databusclient/cli.py +++ b/databusclient/cli.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import json import os +import re from typing import List import click @@ -12,13 +13,27 @@ @click.group() -def app(): +@click.option("-v", "--verbose", is_flag=True, help="Enable verbose HTTP request/response output") +@click.pass_context +def app(ctx, verbose): """Databus Client CLI. Provides `deploy`, `download`, and `delete` commands for interacting with the DBpedia Databus. """ - pass + import logging + + ctx.ensure_object(dict) + ctx.obj["verbose"] = verbose + + # Configure databusclient logger when verbose flag is used + logger = logging.getLogger("databusclient") + if verbose: + handler = logging.StreamHandler() + handler.setFormatter(logging.Formatter("%(message)s")) + if not logger.hasHandlers(): + logger.addHandler(handler) + logger.setLevel(logging.DEBUG) @app.command() @@ -64,20 +79,23 @@ def deploy( distributions: List[str], ): """ - Flexible deploy to Databus command supporting three modes:\n - - Classic deploy (distributions as arguments)\n - - Metadata-based deploy (--metadata )\n + Flexible deploy to Databus command supporting three modes: + + - Classic deploy (distributions as arguments) + - Metadata-based deploy (--metadata ) - Upload & deploy via Nextcloud (--webdav-url, --remote, --path) """ # Sanity checks for conflicting options if metadata_file and any([distributions, webdav_url, remote, path]): raise click.UsageError( - "Invalid combination: when using --metadata, do not provide --webdav-url, --remote, --path, or distributions." 
+ "Invalid combination: when using --metadata, do not provide " + "--webdav-url, --remote, --path, or distributions." ) if any([webdav_url, remote, path]) and not all([webdav_url, remote, path]): raise click.UsageError( - "Invalid combination: when using WebDAV/Nextcloud mode, please provide --webdav-url, --remote, and --path together." + "Invalid combination: when using WebDAV/Nextcloud mode, please " + "provide --webdav-url, --remote, and --path together." ) # === Mode 1: Classic Deploy === @@ -108,7 +126,6 @@ def deploy( "Please provide files to upload when using WebDAV/Nextcloud mode." ) - # Check that all given paths exist and are files or directories. invalid = [f for f in distributions if not os.path.exists(f)] if invalid: raise click.UsageError( @@ -162,7 +179,9 @@ def deploy( show_default=True, help="Client ID for token exchange", ) +@click.pass_context def download( + ctx, databusuris: List[str], localdir, databus, @@ -172,9 +191,7 @@ def download( authurl, clientid, ): - """ - Download datasets from databus, optionally using vault access if vault options are provided. - """ + """Download datasets from databus.""" try: api_download( localDir=localdir, @@ -185,6 +202,7 @@ def download( all_versions=all_versions, auth_url=authurl, client_id=clientid, + verbose=ctx.obj.get("verbose", False), ) except DownloadAuthError as e: raise click.ClickException(str(e)) @@ -202,13 +220,7 @@ def download( "--force", is_flag=True, help="Force deletion without confirmation prompt" ) def delete(databusuris: List[str], databus_key: str, dry_run: bool, force: bool): - """ - Delete a dataset from the databus. - - Delete a group, artifact, or version identified by the given databus URI. - Will recursively delete all data associated with the dataset. - """ - + """Delete a dataset from the databus.""" api_delete( databusURIs=databusuris, databus_key=databus_key, @@ -217,5 +229,53 @@ def delete(databusuris: List[str], databus_key: str, dry_run: bool, force: bool) ) +@app.command() +@click.argument("url") +@click.option("--cv", "cvs", multiple=True, help="Content variant like key=value (repeatable). Keys must not contain '|' or '_'") +@click.option("--format", "file_format", help="Format extension (e.g. ttl)") +@click.option("--compression", help="Compression (e.g. 
gzip)") +@click.option("--sha-length", help="sha256:length (64 hex chars followed by ':' and integer length)") +@click.option("--json-output", is_flag=True, help="Output JSON distribution object instead of plain string") +def mkdist(url, cvs, file_format, compression, sha_length, json_output): + """Create a distribution string from components.""" + cvs_dict = {} + for cv in cvs: + if "=" not in cv: + raise click.BadParameter(f"Invalid content variant '{cv}': expected key=value") + key, val = cv.split("=", 1) + if any(ch in key for ch in ("|", "_")): + raise click.BadParameter("Invalid characters in content-variant key (forbidden: '|' and '_')") + if key in cvs_dict: + raise click.BadParameter(f"Duplicate content-variant key '{key}'") + cvs_dict[key] = val + + sha_tuple = None + if sha_length: + if not re.match(r"^[A-Fa-f0-9]{64}:\d+$", sha_length): + raise click.BadParameter("Invalid --sha-length; expected SHA256HEX:length") + sha, length = sha_length.split(":", 1) + sha_tuple = (sha, int(length)) + + sorted_cvs = {k: cvs_dict[k] for k in sorted(cvs_dict)} + + dist = api_deploy.create_distribution( + url=url, + cvs=sorted_cvs, + file_format=file_format, + compression=compression, + sha256_length_tuple=sha_tuple, + ) + if json_output: + click.echo(json.dumps({"distribution": dist})) + else: + click.echo(dist) + + +@app.command() +@click.argument("shell", type=click.Choice(["bash", "zsh", "fish", "powershell"]), required=False) +def completion(shell="bash"): + click.echo(f"Run: eval \"$(_DATABUSCLIENT_COMPLETE=source_{shell} python -m databusclient)\"") + + if __name__ == "__main__": app() diff --git a/file.txt b/file.txt new file mode 100644 index 0000000..e69de29 diff --git a/test.sh b/test.sh index f590198..0a4c096 100755 --- a/test.sh +++ b/test.sh @@ -1,7 +1,7 @@ #!/usr/bin/env bash databusclient deploy \ - --version-id "https://d8lr.tools.dbpedia.org/hopver/testGroup/testArtifact/1.0-alpha/" \ + --versionid "https://d8lr.tools.dbpedia.org/hopver/testGroup/testArtifact/1.0-alpha/" \ --title "Test Title" \ --abstract "Test Abstract" \ --description "Test Description" \ diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..3dfd3eb --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,42 @@ +from click.testing import CliRunner +from databusclient import cli + + +def test_mkdist_multiple_cv(): + runner = CliRunner() + sha = 'a' * 64 + res = runner.invoke(cli.app, [ + 'mkdist', + 'https://example.org/file', + '--cv', 'b=2', + '--cv', 'a=1', + '--format', 'ttl', + '--compression', 'gz', + '--sha-length', f'{sha}:42' + ]) + assert res.exit_code == 0, res.output + # keys should be sorted alphabetically: a then b + assert res.output.strip() == f'https://example.org/file|a=1_b=2|ttl|gz|{sha}:42' + + +def test_mkdist_invalid_cv(): + runner = CliRunner() + res = runner.invoke(cli.app, ['mkdist', 'https://example.org/file', '--cv', 'badcv']) + assert res.exit_code != 0 + assert 'Invalid content variant' in res.output + + +def test_mkdist_invalid_sha(): + runner = CliRunner() + res = runner.invoke(cli.app, [ + 'mkdist', 'https://example.org/file', '--cv', 'k=v', '--sha-length', 'abc:123' + ]) + assert res.exit_code != 0 + assert 'Invalid --sha-length' in res.output + + +def test_completion_output(): + runner = CliRunner() + res = runner.invoke(cli.app, ['completion', 'bash']) + assert res.exit_code == 0 + assert '_DATABUSCLIENT_COMPLETE' in res.output diff --git a/tests/test_cli_verbose.py b/tests/test_cli_verbose.py new file mode 100644 index 0000000..c5bba14 --- 
+++ b/tests/test_cli_verbose.py
@@ -0,0 +1,38 @@
+from click.testing import CliRunner
+from unittest.mock import Mock, patch
+
+import databusclient.cli as cli
+
+
+# CLI-level integration test for the -v flag
+def test_cli_download_verbose_logs_redacted(caplog):
+    caplog.set_level("DEBUG", logger="databusclient")
+    runner = CliRunner()
+
+    # Prepare mocked HTTP responses
+    resp_head_401 = Mock()
+    resp_head_401.status_code = 401
+    resp_head_401.headers = {}
+
+    resp_head_200 = Mock()
+    resp_head_200.status_code = 200
+    resp_head_200.headers = {}
+
+    resp_get = Mock()
+    resp_get.status_code = 200
+    resp_get.headers = {"content-length": "0"}
+    resp_get.iter_content = lambda chunk_size=None: iter([])
+
+    # Initial HEAD returns 401 so the client retries with the --databus-key header
+    with patch("requests.head", side_effect=[resp_head_401, resp_head_200]), patch(
+        "requests.get", return_value=resp_get
+    ):
+        # Run the CLI with the verbose flag and a databus key (so X-API-KEY is redacted in logs)
+        target = "https://example.com/account/group/artifact/1/file.txt"
+        res = runner.invoke(cli.app, ["-v", "download", target, "--localdir", ".", "--databus-key", "SECRET"])
+
+    assert res.exit_code == 0, res.output
+    # Should log HTTP activity and redact the secret (captured by caplog)
+    assert "[HTTP]" in caplog.text
+    assert "REDACTED" in caplog.text
+    assert "SECRET" not in caplog.text
diff --git a/tests/test_download_auth.py b/tests/test_download_auth.py
index 7225e08..94cdf3b 100644
--- a/tests/test_download_auth.py
+++ b/tests/test_download_auth.py
@@ -3,12 +3,12 @@
 import pytest
 import requests
+import logging
 
 import databusclient.api.download as dl
 from databusclient.api.download import VAULT_REQUIRED_HOSTS, DownloadAuthError
 
-
 def make_response(status=200, headers=None, content=b""):
     headers = headers or {}
     mock = Mock()
@@ -102,3 +102,36 @@ def test_403_reports_insufficient_permissions():
         dl._download_file(url, localDir='.', vault_token_file="/some/token/file")
 
     assert "permission" in str(exc.value) or "forbidden" in str(exc.value)
+
+
+def test_verbose_redacts_authorization(tmp_path, caplog):
+    caplog.set_level(logging.DEBUG, logger='databusclient')
+    vault_host = next(iter(VAULT_REQUIRED_HOSTS))
+    url = f"https://{vault_host}/protected/file.ttl"
+
+    # A real (temporary) token file so __get_vault_access__ can read it
+    token_file = tmp_path / "vault-token.dat"
+    token_file.write_text("x" * 90)
+
+    resp_head = make_response(status=200, headers={})
+    resp_401 = make_response(status=401, headers={"WWW-Authenticate": "Bearer realm=\"auth\""})
+    resp_200 = make_response(status=200, headers={"content-length": "0"}, content=b"")
+
+    get_side_effects = [resp_401, resp_200]
+
+    post_resp_1 = Mock()
+    post_resp_1.json.return_value = {"access_token": "ACCESS"}
+    post_resp_2 = Mock()
+    post_resp_2.json.return_value = {"access_token": "VAULT"}
+
+    with patch("requests.head", return_value=resp_head), patch(
+        "requests.get", side_effect=get_side_effects
+    ), patch("requests.post", side_effect=[post_resp_1, post_resp_2]):
+        # run download with verbose enabled
+        dl._download_file(url, localDir='.', vault_token_file=str(token_file), verbose=True)
+
+    assert "[HTTP] HEAD" in caplog.text or "[HTTP] GET" in caplog.text
+    assert "REDACTED" in caplog.text
+    # Ensure token values are not printed verbatim
+    assert "ACCESS" not in caplog.text
+    assert "VAULT" not in caplog.text
diff --git a/tests/test_utils_verbose.py b/tests/test_utils_verbose.py
new file mode 100644
index 0000000..aa1b344
--- /dev/null
+++ b/tests/test_utils_verbose.py
@@ -0,0 +1,98 @@
+from unittest.mock import Mock, patch
+
+import databusclient.api.utils as utils
+import databusclient.api.download as dl
+
+import requests
+import logging
+
+
+def make_response(status=200, headers=None, text=''):
+    headers = headers or {}
+    mock = Mock()
+    mock.status_code = status
+    mock.headers = headers
+    mock.text = text
+
+    def raise_for_status():
+        if mock.status_code >= 400:
+            raise requests.exceptions.HTTPError()
+
+    mock.raise_for_status = raise_for_status
+    return mock
+
+
+def test_fetch_databus_jsonld_verbose_redacts_api_key(caplog):
+    caplog.set_level(logging.DEBUG, logger='databusclient')
+    url = "https://databus.example/resource"
+    resp = make_response(status=200, headers={"content-type": "application/ld+json"}, text='{}')
+    with patch("databusclient.api.utils.requests.get", return_value=resp):
+        txt = utils.fetch_databus_jsonld(url, databus_key="SECRET", verbose=True)
+    assert "[HTTP] GET" in caplog.text
+    assert "REDACTED" in caplog.text
+    assert "SECRET" not in caplog.text
+    assert txt == '{}'
+
+
+def test_get_sparql_query_of_collection_verbose(caplog):
+    caplog.set_level(logging.DEBUG, logger='databusclient')
+    url = "https://databus.example/collections/col"
+    resp = make_response(status=200, headers={"content-type": "text/sparql"}, text='SELECT *')
+    with patch("databusclient.api.download.requests.get", return_value=resp):
+        txt = dl._get_sparql_query_of_collection(url, databus_key="SECRET", verbose=True)
+    assert "[HTTP] GET" in caplog.text
+    assert "REDACTED" in caplog.text
+    assert "SECRET" not in caplog.text
+    assert txt == 'SELECT *'
+
+
+def test_query_sparql_endpoint_verbose(caplog):
+    caplog.set_level(logging.DEBUG, logger='databusclient')
+    endpoint = "https://dbpedia.org/sparql"
+    sample = {"results": {"bindings": []}}
+
+    class MockSPARQL:
+        def __init__(self, url):
+            self.url = url
+            self.method = None
+            self._query = None
+            self._headers = None
+
+        def setQuery(self, q):
+            self._query = q
+
+        def setReturnFormat(self, fmt):
+            pass
+
+        def setCustomHttpHeaders(self, headers):
+            self._headers = headers
+
+        def query(self):
+            mock = Mock()
+            mock.convert.return_value = sample
+            return mock
+
+    with patch("databusclient.api.download.SPARQLWrapper", new=MockSPARQL):
+        res = dl._query_sparql_endpoint(endpoint, "SELECT ?s WHERE { ?s ?p ?o }", databus_key="SECRET", verbose=True)
+    assert "[HTTP] POST" in caplog.text
+    assert "REDACTED" in caplog.text
+    assert "SECRET" not in caplog.text
+    assert res == sample
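+
+
+# A small direct unit test of the redaction helper, added as a sketch; it
+# assumes only `Authorization` and `X-API-KEY`-style headers are sensitive,
+# which is exactly what `_redact_headers` implements.
+def test_redact_headers_preserves_non_sensitive_headers():
+    headers = {
+        "Authorization": "Bearer super-secret",
+        "X-API-KEY": "also-secret",
+        "Accept": "application/ld+json",
+    }
+    redacted = utils._redact_headers(headers)
+    # Sensitive values are replaced, everything else passes through unchanged
+    assert redacted["Authorization"] == "REDACTED"
+    assert redacted["X-API-KEY"] == "REDACTED"
+    assert redacted["Accept"] == "application/ld+json"
+    # The original mapping is not mutated
+    assert headers["Authorization"] == "Bearer super-secret"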