diff --git a/.github/workflows/format-typing-check.yml b/.github/workflows/format-typing-check.yml index a4128a15..c5a97d4b 100644 --- a/.github/workflows/format-typing-check.yml +++ b/.github/workflows/format-typing-check.yml @@ -37,8 +37,8 @@ jobs: - name: Install ruff and mypy run: | pip install ruff mypy typing_extensions \ - types-Deprecated types-beautifulsoup4 types-jsonschema \ - types-networkx types-tabulate types-PyYAML pandas-stubs + types-Deprecated types-beautifulsoup4 types-jsonschema types-requests \ + types-networkx types-tabulate types-PyYAML pandas-stubs - name: Get all changed python files id: changed-python-files uses: tj-actions/changed-files@v44 diff --git a/pyproject.toml b/pyproject.toml index 879895f0..8fd371b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -58,6 +58,7 @@ dev = [ "mypy", "typing_extensions", # stub packages. Update the `format-typing-check.yml` too if you add more. + "types-requests", "types-beautifulsoup4", "types-jsonschema", "types-networkx", diff --git a/src/nplinker/genomics/antismash/__init__.py b/src/nplinker/genomics/antismash/__init__.py index e126f548..03e50b07 100644 --- a/src/nplinker/genomics/antismash/__init__.py +++ b/src/nplinker/genomics/antismash/__init__.py @@ -1,16 +1,28 @@ -from .antismash_downloader import download_and_extract_antismash_data +from .antismash_api_client import antismash_job_is_done +from .antismash_api_client import submit_antismash_job +from .antismash_downloader import download_and_extract_from_antismash_api +from .antismash_downloader import download_and_extract_from_antismash_db +from .antismash_downloader import extract_antismash_data from .antismash_loader import AntismashBGCLoader from .antismash_loader import parse_bgc_genbank +from .genome_accession_resolver import resolve_genome_accession +from .ncbi_downloader import download_and_extract_ncbi_genome from .podp_antismash_downloader import GenomeStatus from .podp_antismash_downloader import get_best_available_genome_id 
from .podp_antismash_downloader import podp_download_and_extract_antismash_data __all__ = [ - "download_and_extract_antismash_data", + "extract_antismash_data", + "resolve_genome_accession", + "download_and_extract_from_antismash_api", + "download_and_extract_from_antismash_db", "AntismashBGCLoader", "parse_bgc_genbank", "GenomeStatus", "get_best_available_genome_id", "podp_download_and_extract_antismash_data", + "download_and_extract_ncbi_genome", + "submit_antismash_job", + "antismash_job_is_done", ] diff --git a/src/nplinker/genomics/antismash/antismash_api_client.py b/src/nplinker/genomics/antismash/antismash_api_client.py new file mode 100644 index 00000000..124aedcb --- /dev/null +++ b/src/nplinker/genomics/antismash/antismash_api_client.py @@ -0,0 +1,81 @@ +from __future__ import annotations +import logging +from os import PathLike +from pathlib import Path +import requests + + +logger = logging.getLogger(__name__) + + +def submit_antismash_job(genbank_filepath: str | PathLike) -> str: + """Submits an antiSMASH job using the provided GenBank file. + + This function sends a GenBank file to the antiSMASH API + and retrieves the job ID if the submission is successful. + + Args: + genbank_filepath (str | PathLike): The path to the GenBank file to be submitted. + + Returns: + str: The job ID of the submitted antiSMASH job. + + Raises: + requests.exceptions.RequestException: If there is an issue with the HTTP request. + RuntimeError: If the API response does not contain a job ID. 
+ """ + url = "https://antismash.secondarymetabolites.org/api/v1.0/submit" + genbank_filepath = Path(genbank_filepath) + + with open(genbank_filepath, "rb") as file: + files = {"seq": file} + response = requests.post(url, files=files) + response.raise_for_status() # Raise an exception for HTTP errors + + data = response.json() + if "id" not in data: + raise RuntimeError("No antiSMASH job ID returned") + return str(data["id"]) + + +def antismash_job_is_done(job_id: str) -> bool: + """Determines if the antiSMASH job has completed by checking its status. + + This function queries the antiSMASH API to retrieve the current state + of the job and determines whether it has finished successfully, is still + in progress, or has encountered an error. + + Args: + job_id (str): The unique identifier of the antiSMASH job. + + Returns: + bool: True if the job is completed successfully, False if it is still + running or queued. + + Raises: + RuntimeError: If the job has failed or if the API response indicates an error. + ValueError: If the job state is missing or an unexpected state is encountered + in the API response. + requests.exceptions.HTTPError: If an HTTP error occurs during the API request. + """ + url = f"https://antismash.secondarymetabolites.org/api/v1.0/status/{job_id}" + + response = requests.get(url, timeout=10) + response.raise_for_status() # Raise exception for HTTP errors + response_data = response.json() + + if "state" not in response_data: + raise ValueError(f"Job state missing in response for job_id: {job_id}") + + job_state = response_data["state"] + if job_state in ("running", "queued"): + return False + if job_state == "done": + return True + if job_state == "failed": + job_status = response_data.get("status", "No error message provided") + raise RuntimeError(f"AntiSMASH job {job_id} failed with an error: {job_status}") + else: + raise ValueError( + f"Unexpected job state for antismash job ID {job_id}. 
Job state: {job_state}" + ) diff --git a/src/nplinker/genomics/antismash/antismash_downloader.py b/src/nplinker/genomics/antismash/antismash_downloader.py index a02728f4..2c27938e 100644 --- a/src/nplinker/genomics/antismash/antismash_downloader.py +++ b/src/nplinker/genomics/antismash/antismash_downloader.py @@ -4,7 +4,9 @@ import shutil from os import PathLike from pathlib import Path +import requests from nplinker.utils import download_and_extract_archive +from nplinker.utils import extract_archive from nplinker.utils import list_dirs from nplinker.utils import list_files @@ -15,10 +17,75 @@ ANTISMASH_DB_DOWNLOAD_URL = "https://antismash-db.secondarymetabolites.org/output/{}/{}" # The antiSMASH DBV2 is for the availability of the old version, better to keep it. ANTISMASH_DBV2_DOWNLOAD_URL = "https://antismash-dbv2.secondarymetabolites.org/output/{}/{}" +# antismash api to download results from submitted jobs +ANTISMASH_API_DOWNLOAD_URL = "https://antismash.secondarymetabolites.org/upload/{}/{}" def download_and_extract_antismash_data( - antismash_id: str, download_root: str | PathLike, extract_root: str | PathLike + url: str, antismash_id: str, download_root: str | PathLike, extract_root: str | PathLike +) -> None: + """Download and extract antiSMASH BGC archive for a specified genome. + + This function downloads a BGC archive from the specified URL, extracts its contents, + and organizes the extracted files into a structured directory under the given `extract_root`. + + Args: + url (str): The URL to download the BGC archive from. + antismash_id (str): The identifier for the antiSMASH genome, used to name the extraction directory. + download_root: Path to the directory where the downloaded archive will be stored. + extract_root: Path to the directory where the data files will be extracted. + Note that an `antismash` directory will be created in the specified `extract_root` if + it doesn't exist. The files will be extracted to `/antismash/` directory. 
+ + Raises: + ValueError: if `/antismash/` dir is not empty. + Exception: If any error occurs during the download or extraction process, the partially extracted + directory will be cleaned up, and the exception will be re-raised. + + Examples: + >>> download_and_extract_antismash_data( + "https://antismash-db.secondarymetabolites.org/output/GCF_001.1/GCF_001.1.zip", + "GCF_001.1", + "/data/download", + "/data/extracted" + ) + """ + extract_path = Path(extract_root) / "antismash" / antismash_id + + _prepare_extract_path(extract_path) + try: + download_and_extract_archive(url, download_root, extract_path, f"{antismash_id}.zip") + _cleanup_extracted_files(extract_path) + except Exception as e: + shutil.rmtree(extract_path) + raise e + + +def download_and_extract_from_antismash_api( + job_id: str, antismash_id: str, download_root: str | PathLike, extract_root: str | PathLike +) -> None: + """Downloads and extracts results from an antiSMASH API job. + + This function constructs the download URL using the provided job ID then + downloads the results as a ZIP file and extracts its contents to the specified directories. + + Args: + antismash_id (str): The unique identifier for the antiSMASH dataset. + job_id (str): The job ID for the antiSMASH API job. + download_root (str or PathLike): The root directory where the ZIP file will be downloaded. + extract_root (str or PathLike): The root directory where the contents of the ZIP file will be extracted. + + Raises: + requests.exceptions.RequestException: If there is an issue with the HTTP request. + zipfile.BadZipFile: If the downloaded file is not a valid ZIP file. + OSError: If there is an issue with file operations such as writing or extracting. 
+ """ + url = ANTISMASH_API_DOWNLOAD_URL.format(job_id, antismash_id + ".zip") + download_and_extract_antismash_data(url, antismash_id, download_root, extract_root) + + +def download_and_extract_from_antismash_db( + refseq_acc: str, download_root: str | PathLike, extract_root: str | PathLike ) -> None: """Download and extract antiSMASH BGC archive for a specified genome. @@ -27,7 +94,7 @@ def download_and_extract_antismash_data( of a genome as the id of the archive. Args: - antismash_id: The id used to download BGC archive from antiSMASH database. + refseq_acc: The id used to download BGC archive from antiSMASH database. If the id is versioned (e.g., "GCF_004339725.1") please be sure to specify the version as well. download_root: Path to the directory to place downloaded archive in. @@ -36,41 +103,53 @@ def download_and_extract_antismash_data( it doesn't exist. The files will be extracted to `/antismash/` directory. Raises: - ValueError: if `/antismash/` dir is not empty. + ValueError: if `/antismash/` dir is not empty. 
Examples: - >>> download_and_extract_antismash_metadata("GCF_004339725.1", "/data/download", "/data/extracted") + >>> download_and_extract_from_antismash_db("GCF_004339725.1", "/data/download", "/data/extracted") """ - download_root = Path(download_root) - extract_root = Path(extract_root) - extract_path = extract_root / "antismash" / antismash_id + for base_url in [ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL]: + url = base_url.format(refseq_acc, f"{refseq_acc}.zip") + if requests.head(url).status_code == 404: # not found + continue + download_and_extract_antismash_data(url, refseq_acc, download_root, extract_root) + return # Exit the loop once a valid URL is processed - try: - if extract_path.exists(): - _check_extract_path(extract_path) - else: - extract_path.mkdir(parents=True, exist_ok=True) + # if both urls give 404 not found + raise RuntimeError(f"No results in antiSMASH DB for {refseq_acc}") - for base_url in [ANTISMASH_DB_DOWNLOAD_URL, ANTISMASH_DBV2_DOWNLOAD_URL]: - url = base_url.format(antismash_id, antismash_id + ".zip") - download_and_extract_archive(url, download_root, extract_path, antismash_id + ".zip") - break - # delete subdirs - for subdir_path in list_dirs(extract_path): - shutil.rmtree(subdir_path) +def extract_antismash_data( + archive: str | PathLike, extract_root: str | PathLike, antismash_id: str +) -> None: + """Extracts antiSMASH results from a given archive into a specified directory. - # delete unnecessary files - files_to_keep = list_files(extract_path, suffix=(".json", ".gbk")) - for file in list_files(extract_path): - if file not in files_to_keep: - os.remove(file) + This function handles the extraction of antiSMASH results by preparing the + extraction path, extracting the archive, and performing cleanup of + unnecessary files. If an error occurs during the process, the partially + extracted files are removed, and the exception is re-raised. 
- logger.info("antiSMASH BGC data of %s is downloaded and extracted.", antismash_id) + Args: + archive (str | PathLike): The path to the archive file containing antiSMASH results. + extract_root (str | PathLike): The root directory where the data should + be extracted. + antismash_id (str): A unique identifier for the antiSMASH data, used to + create a subdirectory for the extracted files. + + Raises: + Exception: If any error occurs during the extraction process, the + exception is re-raised after cleaning up the extraction directory. + """ + extract_path = Path(extract_root) / "antismash" / antismash_id + + _prepare_extract_path(extract_path) + + try: + extract_archive(archive, extract_path, remove_finished=False) + _cleanup_extracted_files(extract_path) except Exception as e: shutil.rmtree(extract_path) - logger.warning(e) raise e @@ -78,3 +157,23 @@ def _check_extract_path(extract_path: Path): # check if extract_path is empty if any(extract_path.iterdir()): raise ValueError(f'Nonempty directory: "{extract_path}"') + + +def _cleanup_extracted_files(extract_path: str | PathLike) -> None: + # delete subdirs + for subdir_path in list_dirs(extract_path): + shutil.rmtree(subdir_path) + + # delete unnecessary files + files_to_keep = list_files(extract_path, suffix=(".json", ".gbk")) + for file in list_files(extract_path): + if file not in files_to_keep: + os.remove(file) + + +def _prepare_extract_path(extract_path: str | PathLike) -> None: + extract_path = Path(extract_path) + if extract_path.exists(): + _check_extract_path(extract_path) + else: + extract_path.mkdir(parents=True, exist_ok=True) diff --git a/src/nplinker/genomics/antismash/genome_accession_resolver.py b/src/nplinker/genomics/antismash/genome_accession_resolver.py new file mode 100644 index 00000000..e30433c3 --- /dev/null +++ b/src/nplinker/genomics/antismash/genome_accession_resolver.py @@ -0,0 +1,230 @@ +import logging +import re +from typing import Any +from typing import Callable +from typing import 
Literal +from typing import Mapping +import httpx +from bs4 import BeautifulSoup + + +JGI_GENOME_LOOKUP_URL = ( + "https://img.jgi.doe.gov/cgi-bin/m/main.cgi?section=TaxonDetail&page=taxonDetail&taxon_oid={}" +) +USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0" + +logger = logging.getLogger(__name__) + + +def get_latest_assembly_accession(acc: str) -> str: + """Retrieve the latest NCBI genome assembly accession for a given accession. + + This function retrieves the most recent genome assembly accession + from the revision history of the provided accession. It prioritizes + RefSeq accessions, and if unavailable, falls back to GenBank accessions. + + Args: + acc (str): The accession identifier to resolve. + + Returns: + str: The latest valid genome assembly accession. + + Raises: + ValueError: If no valid RefSeq or GenBank accession is found in + the assembly revision history. + """ + revision_history = _get_revision_history(acc) + + acc_priority = ("refseq_accession", "genbank_accession") + for acc_type in acc_priority: + assembly_revisions = revision_history.get("assembly_revisions", []) + revisions_with_acc = [entry for entry in assembly_revisions if acc_type in entry] + if revisions_with_acc: + latest_revision = max(revisions_with_acc, key=lambda x: x["release_date"]) + return str(latest_revision[acc_type]) + + raise ValueError("No valid genome accession found in assembly revision history") + + +def resolve_genome_accession(genome_id_data: Mapping[Any, Any]) -> str: + """Gets the NCBI genome assembly accession. + + Gets the latest RefSeq genome assembly accession, or if not available, + the latest GenBank genome assembly accession. + + This function gets the latest RefSeq genome assembly accession, or if not + available, the latest GenBank genome assembly accession. It attempts to get + a genome accession by checking the genome id date for specific ID types in + the following order: + 1. RefSeq_accession + 2. 
GenBank_accession + 3. JGI_Genome_ID + + For each ID type, it uses a corresponding resolver function to process + the ID. If a resolver fails, a warning is logged, and the function + proceeds to the next ID type. If no valid genome assembly accession can be + retrieved, a RuntimeError is raised. + + Args: + genome_id_data (dict): A dictionary containing genome ID types as keys + and their corresponding values. + + Returns: + str: The retrieved genome accession, prioritizing RefSeq if available, + otherwise GenBank. + + Raises: + RuntimeError: If no valid assembly accessions can be retrieved. + + Logs: + Warning messages if a resolver fails for a specific ID type. + """ + resolver_priority = ["RefSeq_accession", "GenBank_accession", "JGI_Genome_ID"] + resolvers: dict[str, Callable] = { + "RefSeq_accession": _resolve_refseq, + "GenBank_accession": _resolve_genbank, + "JGI_Genome_ID": _resolve_jgi, + } + + for id_type in resolver_priority: + if id_type not in genome_id_data: + continue + + resolver = resolvers[id_type] + try: + genome_id = genome_id_data[id_type].strip() + return str(resolver(genome_id)) + except Exception as e: + logger.warning(f"Failed to resolve {id_type}: {e}") + + raise RuntimeError("No valid assembly accessions found") + + +def validate_assembly_acc(acc: str, acc_type: Literal["RefSeq", "GenBank"]) -> None: + """Validates a NCBI genome assembly accession string based on its type. + + This function checks if the provided genome assembly accession string + adheres to the expected format for the specified accession type + ('RefSeq' or 'GenBank'). It ensures that the accession starts with the + correct prefix ('GCF_' for RefSeq or 'GCA_' for GenBank) and contains + a version number. + + Args: + acc (str): The genome assembly accession string to validate. + acc_type (Literal["RefSeq", "GenBank"]): The type of accession, either + 'RefSeq' or 'GenBank'. + + Raises: + ValueError: If the accession type is invalid. 
+ ValueError: If the accession does not start with the expected prefix. + ValueError: If the accession is missing a version number. + """ + if acc_type == "RefSeq": + prefix = "GCF_" + elif acc_type == "GenBank": + prefix = "GCA_" + else: + raise ValueError( + f"Invalid genome assembly accession type '{acc_type}'. Expected 'RefSeq' or 'GenBank'." + ) + if not acc.startswith(prefix): + raise ValueError(f"Invalid {acc_type} assembly accession (must start with {prefix}): {acc}") + if "." not in acc: + raise ValueError(f"Invalid assembly accession (missing version number): {acc}") + + +def _get_revision_history(assembly_acc: str) -> dict[str, Any]: + """Fetches the revision history of a genome assembly from the NCBI Datasets API. + + Args: + assembly_acc (str): The accession number of the genome assembly. + + Returns: + dict[str, Any]: A dictionary containing the revision history of the specified assembly. + + Raises: + httpx.HTTPStatusError: If the HTTP request fails or returns a non-success status code. + ValueError: If no revision history data is found for the given accession number. + """ + url = ( + f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{assembly_acc}/revision_history" + ) + resp = httpx.get(url, headers={"User-Agent": USER_AGENT}, timeout=10.0, follow_redirects=True) + resp.raise_for_status() + revision_history = resp.json() + if not revision_history: + raise ValueError(f"No Assembly Revision data found for {assembly_acc}") + if not isinstance(revision_history, dict): + raise ValueError(f"Unexpected response format: {type(revision_history)}") + return revision_history + + +def _resolve_refseq(acc: str) -> str: + """Retrieves the latest NCBI genome assembly accession for a given RefSeq accession. + + This function validates the provided RefSeq accession and retrieves the + most up-to-date RefSeq genome assembly accession, or if not found, + alternatively the most up-to-date RefSeq GenBank genome assembly accession. 
+ + Args: + acc (str): The RefSeq accession to resolve. + + Returns: + str: The latest NCBI assembly accession corresponding to the given RefSeq accession. + + Raises: + ValueError: If the provided accession is invalid or not a RefSeq accession. + """ + validate_assembly_acc(acc, "RefSeq") + return get_latest_assembly_accession(acc) + + +def _resolve_genbank(acc: str) -> str: + """Retrieves the latest NCBI genome assembly accession for a given GenBank accession. + + This function validates the provided GenBank accession and retrieves + the most up-to-date RefSeq genome assembly accession, or if not found, + alternatively the most up-to-date GenBank genome assembly accession. + + Args: + acc (str): The GenBank accession to resolve. + + Returns: + str: The latest NCBI assembly accession corresponding to the given GenBank accession. + + Raises: + ValueError: If the provided accession is invalid or not a GenBank accession. + """ + validate_assembly_acc(acc, "GenBank") + return get_latest_assembly_accession(acc) + + +def _resolve_jgi(jgi_genome_id: str) -> str: + """Resolves a JGI Genome ID to its corresponding NCBI assembly accession. + + This function queries a predefined JGI genome lookup URL using the provided + JGI Genome ID. It parses the HTML response to extract the NCBI assembly + accession link. If no valid link is found, an exception is raised. The + function then retrieves the most up-to-date NCBI assembly accession for the + found assembly accession. + + Args: + jgi_genome_id (str): The JGI Genome ID to resolve. + + Returns: + str: The latest NCBI assembly accession corresponding to the given JGI Genome ID. + + Raises: + ValueError: If no NCBI accessions can be found for the given JGI Genome ID. + httpx.HTTPStatusError: If the HTTP request to the JGI genome lookup URL fails. 
+ """ + url = JGI_GENOME_LOOKUP_URL.format(jgi_genome_id) + resp = httpx.get(url, headers={"User-Agent": USER_AGENT}, timeout=10.0, follow_redirects=True) + resp.raise_for_status() + + soup = BeautifulSoup(resp.content, "html.parser") + link = soup.find("a", href=re.compile("https://www.ncbi.nlm.nih.gov/datasets/genome/.*")) + if not link: + raise ValueError(f"Unable to find NCBI accessions for the JGI Genome ID: {jgi_genome_id}. ") + assembly_acc = link.text + return get_latest_assembly_accession(assembly_acc) diff --git a/src/nplinker/genomics/antismash/ncbi_downloader.py b/src/nplinker/genomics/antismash/ncbi_downloader.py new file mode 100644 index 00000000..fe016fd5 --- /dev/null +++ b/src/nplinker/genomics/antismash/ncbi_downloader.py @@ -0,0 +1,133 @@ +from __future__ import annotations +import logging +import os +import shutil +import time +from os import PathLike +from pathlib import Path +import httpx +import requests +from nplinker.utils import check_md5 +from nplinker.utils import download_url +from nplinker.utils import extract_archive + + +logger = logging.getLogger(__name__) + + +def download_and_extract_ncbi_genome( + genome_assembly_acc: str, + download_root: str | PathLike, + extract_root: str | PathLike, + max_attempts: int = 10, +) -> Path: + """Downloads and extracts an NCBI dataset for a given genome assembly accession. + + This function retrieves a dataset from the NCBI database using the provided + genome assembly accession. It retries the download process up to a specified maximum number + of attempts in case of errors. The function verifies the integrity of the + downloaded files using MD5 checksums, extracts the dataset, and renames the + GenBank file for easier access. Unnecessary files are removed after successful + processing. + + Args: + genome_assembly_acc (str): The NCBI accession of the genome assembly to be downloaded. + download_root (str | PathLike): The directory where the dataset will be downloaded. 
+ extract_root (str | PathLike): The directory where the dataset will be extracted. + max_attempts (int): The maximum number of download attempts. Defaults to 10. + + Returns: + Path: The path to the extracted GenBank file. + + Raises: + RuntimeError: If the maximum number of retries is reached and the dataset + could not be successfully downloaded and extracted. + """ + extract_path = Path(extract_root) / "ncbi_genomes" + extract_path.mkdir(parents=True, exist_ok=True) + + _check_genome_accession_validity(genome_assembly_acc) + archive = _download_genome(genome_assembly_acc, download_root, max_attempts) + extract_archive(archive, extract_path) + _verify_ncbi_dataset_md5_sums(extract_path) + + # Move and rename GenBank file + genbank_path = extract_path / "ncbi_dataset" / "data" / genome_assembly_acc / "genomic.gbff" + new_genbank_path = extract_path / f"{genome_assembly_acc}.gbff" + genbank_path.rename(new_genbank_path) + + # Delete unnecessary files + shutil.rmtree(extract_path / "ncbi_dataset") + os.remove(extract_path / "md5sum.txt") + os.remove(extract_path / "README.md") + + return new_genbank_path + + +def _check_genome_accession_validity(genome_assembly_acc, max_attempts=10): + """Check the validity of genome accession.""" + url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{genome_assembly_acc}/check" + + # Retry multiple times because NCBI currently has issues (500 Internal Server Error) + for attempt in range(1, max_attempts + 1): + try: + response = requests.get(url) + response.raise_for_status() + break + except Exception: + if attempt < max_attempts: + time.sleep(1) + + # Raise if no attempt was successful + response.raise_for_status() + # Raise if genome assembly is not successful + if "valid_assemblies" not in response.json(): + raise ValueError(f"Not a valid genome assembly accession: {genome_assembly_acc}") + + +def _download_genome(genome_assembly_acc, download_root, max_attempts): + url = ( + 
"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/" + f"{genome_assembly_acc}/download?include_annotation_type=GENOME_GB" + ) + download_root = Path(download_root) + filename = f"ncbi_{genome_assembly_acc}.zip" + + # Retry multiple times because NCBI has issues currently + for attempt in range(1, max_attempts + 1): + try: + download_url(url, download_root, filename) + return download_root / filename + except httpx.ReadTimeout as e: + logger.warning(f"Attempt {attempt}/{max_attempts} failed to download {url}. Error: {e}") + if attempt < max_attempts: + time.sleep(1) + else: + raise httpx.ReadTimeout( + f"Failed to download the genome {genome_assembly_acc} from NCBI. " + f"Maximum download retries ({max_attempts}) reached for {url}." + ) + + +def _verify_ncbi_dataset_md5_sums(extract_path: str | PathLike) -> None: + """Verify the integrity of files in a specified directory using MD5 checksums. + + This function reads an "md5sum.txt" file located in the given extraction path, + which contains MD5 checksums and corresponding file names. It then computes + the MD5 checksum for each file and compares it with the expected value. If any + file's checksum does not match, a `ValueError` is raised. + + Args: + extract_path (PathLike): Path to the directory containing the files and + the "md5sum.txt" file. + + Raises: + ValueError: If the MD5 checksum of any file does not match the expected value. 
+ """ + extract_path = Path(extract_path) + with open(extract_path / "md5sum.txt", "r") as f: + for line in f: + md5sum, file_name = line.strip().split() + file_path = extract_path / file_name + if not check_md5(file_path, md5sum): + raise ValueError(f"MD5 checksum mismatch for {file_path}") diff --git a/src/nplinker/genomics/antismash/podp_antismash_downloader.py b/src/nplinker/genomics/antismash/podp_antismash_downloader.py index 84e3cee4..b630b963 100644 --- a/src/nplinker/genomics/antismash/podp_antismash_downloader.py +++ b/src/nplinker/genomics/antismash/podp_antismash_downloader.py @@ -1,17 +1,21 @@ from __future__ import annotations import json import logging -import re +import time import warnings from collections.abc import Mapping from collections.abc import Sequence from os import PathLike from pathlib import Path -import httpx -from bs4 import BeautifulSoup from jsonschema import validate from nplinker.defaults import GENOME_STATUS_FILENAME -from nplinker.genomics.antismash import download_and_extract_antismash_data +from nplinker.genomics.antismash import antismash_job_is_done +from nplinker.genomics.antismash import download_and_extract_from_antismash_api +from nplinker.genomics.antismash import download_and_extract_from_antismash_db +from nplinker.genomics.antismash import download_and_extract_ncbi_genome +from nplinker.genomics.antismash import extract_antismash_data +from nplinker.genomics.antismash import resolve_genome_accession +from nplinker.genomics.antismash import submit_antismash_job from nplinker.schemas import GENOME_STATUS_SCHEMA @@ -33,24 +37,23 @@ class GenomeStatus: def __init__( self, original_id: str, - resolved_refseq_id: str = "", - resolve_attempted: bool = False, + resolved_id: str = "", + failed_previously: bool = False, bgc_path: str = "", ): """Initialize a GenomeStatus object for the given genome. Args: original_id: The original ID of the genome. - resolved_refseq_id: The resolved RefSeq ID of the - genome. Defaults to "". 
- resolve_attempted: A flag indicating whether an - attempt to resolve the RefSeq ID has been made. Defaults to False. + resolved_id: The resolved genome ID of the genome. Defaults to "". + failed_previously: Indicates whether a previous attempt to get BGC data + for the genome has failed. Defaults to False. bgc_path: The path to the downloaded BGC file for the genome. Defaults to "". """ self.original_id = original_id - self.resolved_refseq_id = "" if resolved_refseq_id == "None" else resolved_refseq_id - self.resolve_attempted = resolve_attempted + self.resolved_id = "" if resolved_id == "None" else resolved_id + self.failed_previously = failed_previously self.bgc_path = bgc_path @staticmethod @@ -114,8 +117,8 @@ def _to_dict(self) -> dict: """Convert the GenomeStatus object to a dict.""" return { "original_id": self.original_id, - "resolved_refseq_id": self.resolved_refseq_id, - "resolve_attempted": self.resolve_attempted, + "resolved_id": self.resolved_id, + "failed_previously": self.failed_previously, "bgc_path": self.bgc_path, } @@ -153,59 +156,110 @@ def podp_download_and_extract_antismash_data( gs_dict = GenomeStatus.read_json(gs_file) for i, genome_record in enumerate(genome_records): - # get the best available ID from the dict - genome_id_data = genome_record["genome_ID"] - raw_genome_id = get_best_available_genome_id(genome_id_data) - if raw_genome_id is None or len(raw_genome_id) == 0: - logger.warning(f'Invalid input genome record "{genome_record}"') - continue - - # check if genome ID exist in the genome status file - if raw_genome_id not in gs_dict: - gs_dict[raw_genome_id] = GenomeStatus(raw_genome_id) - - gs_obj = gs_dict[raw_genome_id] - logger.info( - f"Checking for antismash data {i + 1}/{len(genome_records)}, " - f"current genome ID={raw_genome_id}" + f"Getting antiSMASH BGC data for genome record {i + 1} of {len(genome_records)}." 
) - # first, check if BGC data is downloaded - if gs_obj.bgc_path and Path(gs_obj.bgc_path).exists(): - logger.info(f"Genome ID {raw_genome_id} already downloaded to {gs_obj.bgc_path}") + + # get the best available genome ID from the dict + original_genome_id = get_best_available_genome_id(genome_record["genome_ID"]) + if not original_genome_id: + logger.warning(f"Skipping invalid genome record: {genome_record}") continue - # second, check if lookup attempted previously - if gs_obj.resolve_attempted: - logger.info(f"Genome ID {raw_genome_id} skipped due to previous failed attempt") + # Retrieve or initialize the GenomeStatus object for the genome ID + gs = gs_dict.setdefault(original_genome_id, GenomeStatus(original_genome_id)) + + # Check if genomes already have antiSMASH BGC data + if gs.bgc_path and Path(gs.bgc_path).exists(): + logger.info( + f"antiSMASH BGC data for genome ID {gs.original_id} already downloaded to " + f"{gs.bgc_path}" + ) + try: + process_existing_antismash_data(gs, project_extract_root) + continue + except Exception as e: + logger.warning( + "Failed to process existing antiSMASH BGC data for genome ID " + f"{gs.original_id}. 
Error: {e}" + ) + gs.bgc_path = "" # Reset bgc path + + # Check if a previous attempt to get bgc data has failed + if gs.failed_previously: + logger.info(f"Genome ID {gs.original_id} skipped due to previous failed attempt") + GenomeStatus.to_json(gs_dict, gs_file) continue - # if not downloaded or lookup attempted, then try to resolve the ID - # and download - logger.info(f"Start lookup process for genome ID {raw_genome_id}") - gs_obj.resolved_refseq_id = _resolve_refseq_id(genome_id_data) - gs_obj.resolve_attempted = True - - if gs_obj.resolved_refseq_id == "": - # give up on this one - logger.warning(f"Failed lookup for genome ID {raw_genome_id}") + # resolve genome ID + try: + gs.resolved_id = resolve_genome_accession(genome_record["genome_ID"]) + except Exception as e: + logger.warning(f"Failed to resolve genome ID {gs.original_id}. Error: {e}") + gs.failed_previously = True + GenomeStatus.to_json(gs_dict, gs_file) continue - # if resolved id is valid, try to download and extract antismash data + # check if the antiSMASH BGC data is already in the downloads + bgc_path = Path(project_download_root, f"{gs.resolved_id}.zip").absolute() + if bgc_path.exists(): + logger.info( + f"antiSMASH BGC data for genome ID {gs.original_id} already downloaded to " + f"{bgc_path}" + ) + try: + gs.bgc_path = str(bgc_path) + process_existing_antismash_data(gs, project_extract_root) + GenomeStatus.to_json(gs_dict, gs_file) + continue + except Exception as e: + logger.warning( + "Failed to process existing antiSMASH BGC data for genome ID " + f"{gs.original_id}. 
Error: {e}" + ) + gs.bgc_path = "" + + # retrieve antiSMASH BGC data from antiSMASH-DB try: - download_and_extract_antismash_data( - gs_obj.resolved_refseq_id, project_download_root, project_extract_root + retrieve_antismash_db_data(gs, project_download_root, project_extract_root) + logger.info( + f"antiSMASH BGC data for genome ID {gs.original_id} is downloaded and extracted" ) - - gs_obj.bgc_path = str( - Path(project_download_root, gs_obj.resolved_refseq_id + ".zip").absolute() + GenomeStatus.to_json(gs_dict, gs_file) + continue + except Exception as e: + logger.info( + f"Unable to retrieve BGC data from antiSMASH-DB for genome ID {gs.original_id}. " + f"Error: {e}" ) - output_path = Path(project_extract_root, "antismash", gs_obj.resolved_refseq_id) - if output_path.exists(): - Path.touch(output_path / "completed", exist_ok=True) + # retrieve antismash BGC by submitting antismash job via API + try: + logger.info( + "Downloading genome assembly from NCBI and submitting antiSMASH job for " + f"genome ID {gs.original_id}." + ) + genome_path = download_and_extract_ncbi_genome( + gs.resolved_id, project_download_root, project_extract_root + ) + job_id = submit_antismash_job(genome_path) + logger.info(f"Waiting for antiSMASH job {job_id} to complete.") + while antismash_job_is_done(job_id) is False: + time.sleep(15) + retrieve_antismash_job_data(job_id, gs, project_download_root, project_extract_root) + logger.info( + f"antiSMASH BGC data for genome ID {gs.original_id} is downloaded and extracted" + ) + GenomeStatus.to_json(gs_dict, gs_file) + continue + except Exception as e: + logger.info( + f"Unable to retrieve BGC data via antiSMASH API for genome ID {gs.original_id}. 
" + f"Error: {e}" + ) - except Exception: - gs_obj.bgc_path = "" + logger.warning(f"Failed to retrieve BGC data for genome ID {gs.original_id}.") + gs.failed_previously = True + GenomeStatus.to_json(gs_dict, gs_file) # raise and log warning for failed downloads failed_ids = [gs.original_id for gs in gs_dict.values() if not gs.bgc_path] @@ -216,9 +270,6 @@ def podp_download_and_extract_antismash_data( logger.warning(warning_message) warnings.warn(warning_message, UserWarning) - # save updated genome status to json file - GenomeStatus.to_json(gs_dict, gs_file) - if len(failed_ids) == len(genome_records): raise ValueError("No antiSMASH data found for any genome") @@ -247,122 +298,100 @@ def get_best_available_genome_id(genome_id_data: Mapping[str, str]) -> str | Non return best_id -def _resolve_genbank_accession(genbank_id: str) -> str: - """Try to get RefSeq assembly id through given GenBank assembly id. +def process_existing_antismash_data(gs_obj: GenomeStatus, extract_root: str | PathLike) -> None: + """Processes already downloaded antiSMASH BGC data archive. - Note that GenBank assembly accession starts with "GCA_" and RefSeq assembly - accession starts with "GCF_". For more info, see - https://www.ncbi.nlm.nih.gov/datasets/docs/v2/troubleshooting/faq + This function ensures that the antiSMASH data archive associated with a given genomic sequence + object is properly extracted into a specified directory. If the data has already been extracted, + the function skips the extraction process. Args: - genbank_id: ID for GenBank assembly accession. + gs_obj: An object representing a genomic sequence, which contains the path + to the antiSMASH BGC data (accessible via `gs_obj.bgc_path`) and + an original identifier (`gs_obj.original_id`). + extract_root: The root directory where the antiSMASH data should be extracted. Raises: - httpx.ReadTimeout: If the request times out. - - Returns: - RefSeq assembly ID if the search is successful, otherwise an empty string. 
+ Any exceptions raised by the `extract_antismash_data` function if the extraction fails. """ - logger.info( - f"Attempting to resolve Genbank assembly accession {genbank_id} to RefSeq accession" - ) - # NCBI Datasets API https://www.ncbi.nlm.nih.gov/datasets/docs/v2/api/ - # Note that there is a API rate limit of 5 requests per second without using an API key - # For more info, see https://www.ncbi.nlm.nih.gov/datasets/docs/v2/troubleshooting/faq/ - - # API for getting revision history of a genome assembly - # For schema, see https://www.ncbi.nlm.nih.gov/datasets/docs/v2/api/rest-api/#get-/genome/accession/-accession-/revision_history - url = f"https://api.ncbi.nlm.nih.gov/datasets/v2/genome/accession/{genbank_id}/revision_history" - - refseq_id = "" - try: - resp = httpx.get( - url, headers={"User-Agent": USER_AGENT}, timeout=10.0, follow_redirects=True - ) - resp.raise_for_status() - - data = resp.json() - if not data: - raise ValueError("No Assembly Revision data found") - - assembly_entries = [ - entry for entry in data["assembly_revisions"] if "refseq_accession" in entry - ] - if not assembly_entries: - raise ValueError("No RefSeq assembly accession found") - - latest_entry = max(assembly_entries, key=lambda x: x["release_date"]) - refseq_id = latest_entry["refseq_accession"] - - except httpx.RequestError as exc: - logger.warning(f"An error occurred while requesting {exc.request.url!r}: {exc}") - except httpx.HTTPStatusError as exc: - logger.warning( - f"Error response {exc.response.status_code} while requesting {exc.request.url!r}" + antismash_id = Path(gs_obj.bgc_path).stem + extract_path = Path(extract_root, "antismash", antismash_id) + completed_marker = extract_path / "completed" + + # Check if archive is already successfully extracted + if completed_marker.exists(): + logger.info( + f"antiSMASH BGC data for {gs_obj.original_id} already extracted at {extract_path}." 
) - except httpx.ReadTimeout: - logger.warning("Timed out waiting for result of GenBank assembly lookup") - except ValueError as exc: - logger.warning(f"Error while resolving GenBank assembly accession {genbank_id}: {exc}") + return - return refseq_id + extract_antismash_data(gs_obj.bgc_path, extract_root, antismash_id) + completed_marker.touch(exist_ok=True) -def _resolve_jgi_accession(jgi_id: str) -> str: - """Try to get RefSeq id through given JGI id. +def retrieve_antismash_db_data( + genome_status: GenomeStatus, download_root: str | PathLike, extract_root: str | PathLike +) -> None: + """Retrieve antiSMASH database data for a given genome and update its status. + + This function downloads and extracts antiSMASH data for a genome identified + by its resolved genome ID. It updates the `genome_status` object with the + path to the downloaded data or sets it to an empty string if an error occurs. Args: - jgi_id: JGI_Genome_ID for GenBank accession. + genome_status (GenomeStatus): An object representing the genome's status, + including its resolved genome ID and BGC path. + download_root (str | PathLike): The root directory where the antiSMASH + data will be downloaded. + extract_root (str | PathLike): The root directory where the antiSMASH + data will be extracted. - Returns: - RefSeq ID if search is successful, otherwise an empty string. + Raises: + Exception: If an error occurs during the download or extraction process. """ - url = JGI_GENOME_LOOKUP_URL.format(jgi_id) - logger.info(f"Attempting to resolve JGI_Genome_ID {jgi_id} to GenBank accession via {url}") - # no User-Agent header produces a 403 Forbidden error on this site... - try: - resp = httpx.get( - url, headers={"User-Agent": USER_AGENT}, timeout=10.0, follow_redirects=True + if not genome_status.resolved_id.startswith("GCF_"): + raise ValueError( + f"Resolved genome ID '{genome_status.resolved_id}' is not a valid RefSeq assembly and " + "antiSMASH-DB only contains results for RefSeq assemblies." 
) - except httpx.ReadTimeout: - logger.warning("Timed out waiting for result of JGI_Genome_ID lookup") - return "" - - soup = BeautifulSoup(resp.content, "html.parser") - # Find the table entry giving the "NCBI Assembly Accession" ID - link = soup.find("a", href=re.compile("https://www.ncbi.nlm.nih.gov/datasets/genome/.*")) - if link is None: - return "" - - assembly_id = link.text - # check if the assembly ID is already a RefSeq ID - if assembly_id.startswith("GCF_"): - return assembly_id # type: ignore - else: - return _resolve_genbank_accession(assembly_id) + + antismash_id = genome_status.resolved_id + extract_path = Path(extract_root, "antismash", antismash_id) + download_path = Path(download_root, f"{antismash_id}.zip").absolute() + + download_and_extract_from_antismash_db(antismash_id, download_root, extract_root) + Path.touch(extract_path / "completed", exist_ok=True) + genome_status.bgc_path = str(download_path) -def _resolve_refseq_id(genome_id_data: Mapping[str, str]) -> str: - """Get the RefSeq ID to which the genome accession is linked. +def retrieve_antismash_job_data( + job_id: str, + genome_status: GenomeStatus, + download_root: str | PathLike, + extract_root: str | PathLike, +) -> None: + """Retrieve antiSMASH API data for a given genome and update its status. - Check https://pairedomicsdata.bioinformatics.nl/schema.json. + This function downloads and extracts antiSMASH data for a genome identified + by its resolved genome ID. It updates the `genome_status` object with the + path to the downloaded data or sets it to an empty string if an error occurs. Args: - genome_id_data: dictionary containing information - for each genome record present. + job_id (str): The job ID for the antiSMASH API job. + genome_status (GenomeStatus): An object representing the genome's status, + including its resolved genome ID and BGC path. + download_root (str | PathLike): The root directory where the antiSMASH + data will be downloaded. 
+ extract_root (str | PathLike): The root directory where the antiSMASH + data will be extracted. - Returns: - RefSeq ID if present, otherwise an empty string. + Raises: + Exception: If an error occurs during the download or extraction process. """ - if "RefSeq_accession" in genome_id_data: - # best case, can use this directly - return genome_id_data["RefSeq_accession"] - if "GenBank_accession" in genome_id_data: - # resolve via NCBI - return _resolve_genbank_accession(genome_id_data["GenBank_accession"]) - if "JGI_Genome_ID" in genome_id_data: - # resolve via JGI => NCBI - return _resolve_jgi_accession(genome_id_data["JGI_Genome_ID"]) - - logger.warning(f"Unable to resolve genome_ID: {genome_id_data}") - return "" + antismash_id = genome_status.resolved_id + extract_path = Path(extract_root, "antismash", antismash_id) + download_path = Path(download_root, f"{antismash_id}.zip").absolute() + + download_and_extract_from_antismash_api(job_id, antismash_id, download_root, extract_root) + Path.touch(extract_path / "completed", exist_ok=True) + genome_status.bgc_path = str(download_path) diff --git a/src/nplinker/genomics/utils.py b/src/nplinker/genomics/utils.py index ecf6521b..2b723dff 100644 --- a/src/nplinker/genomics/utils.py +++ b/src/nplinker/genomics/utils.py @@ -276,7 +276,7 @@ def extract_mappings_original_genome_id_resolved_genome_id( Generate strain mappings JSON file for PODP pipeline. 
""" gs_mappings_dict = GenomeStatus.read_json(genome_status_json_file) - return {gs.original_id: gs.resolved_refseq_id for gs in gs_mappings_dict.values()} + return {gs.original_id: gs.resolved_id for gs in gs_mappings_dict.values()} def extract_mappings_resolved_genome_id_bgc_id( diff --git a/src/nplinker/schemas/genome_status_schema.json b/src/nplinker/schemas/genome_status_schema.json index 470c74f4..b3dbdef7 100644 --- a/src/nplinker/schemas/genome_status_schema.json +++ b/src/nplinker/schemas/genome_status_schema.json @@ -17,8 +17,8 @@ "type": "object", "required": [ "original_id", - "resolved_refseq_id", - "resolve_attempted", + "resolved_id", + "failed_previously", "bgc_path" ], "properties": { @@ -28,15 +28,15 @@ "description": "The original ID of the genome", "minLength": 1 }, - "resolved_refseq_id": { + "resolved_id": { "type": "string", - "title": "Resolved RefSeq ID", - "description": "The RefSeq ID that was resolved for this genome" + "title": "Resolved genome ID", + "description": "The genome ID that was resolved for this genome" }, - "resolve_attempted": { + "failed_previously": { "type": "boolean", - "title": "Resolve Attempted", - "description": "Whether or not an attempt was made to resolve this genome" + "title": "Failed Attempt", + "description": "Indicates if a previous attempt to retrieve BGC data for this genome was unsuccessful" }, "bgc_path": { "type": "string", diff --git a/tests/unit/genomics/test_antismash_downloader.py b/tests/unit/genomics/test_antismash_downloader.py index 1dfeb4cf..553699ba 100644 --- a/tests/unit/genomics/test_antismash_downloader.py +++ b/tests/unit/genomics/test_antismash_downloader.py @@ -1,10 +1,10 @@ import pytest -from nplinker.genomics.antismash import download_and_extract_antismash_data +from nplinker.genomics.antismash import download_and_extract_from_antismash_db from nplinker.utils import extract_archive from nplinker.utils import list_files -class TestDownloadAndExtractAntismashData: +class 
TestDownloadAndExtractFromAntismashDb: antismash_id = "GCF_004339725.1" def test_default(self, tmp_path): @@ -14,7 +14,7 @@ def test_default(self, tmp_path): extract_root.mkdir() original_extract_root = tmp_path / "original" original_extract_root.mkdir() - download_and_extract_antismash_data(self.antismash_id, download_root, extract_root) + download_and_extract_from_antismash_db(self.antismash_id, download_root, extract_root) archive = download_root / "GCF_004339725.1.zip" extracted_folder = extract_root / "antismash" / "GCF_004339725.1" extracted_files = list_files(extracted_folder, keep_parent=False) @@ -32,7 +32,9 @@ def test_error_nonempty_path(self, tmp_path): nonempty_path = tmp_path / "extracted" / "antismash" / f"{self.antismash_id}" / "subdir" nonempty_path.mkdir(parents=True) with pytest.raises(ValueError, match="Nonempty directory"): - download_and_extract_antismash_data(self.antismash_id, tmp_path, tmp_path / "extracted") + download_and_extract_from_antismash_db( + self.antismash_id, tmp_path, tmp_path / "extracted" + ) # test a non-existent ID, which can be either a fake ID, non-existent in NCBI # or a valid NCBI genome ID but it does not have BGC data in antismash database @@ -44,6 +46,6 @@ def test_nonexisting_id(self, tmp_path): extract_root.mkdir() for test_id in nonexisting_ids: with pytest.raises(RuntimeError): - download_and_extract_antismash_data(test_id, download_root, extract_root) + download_and_extract_from_antismash_db(test_id, download_root, extract_root) extracted_folder = extract_root / "antismash" / test_id assert not extracted_folder.exists() diff --git a/tests/unit/genomics/test_genome_accession_resolver.py b/tests/unit/genomics/test_genome_accession_resolver.py new file mode 100644 index 00000000..5b76eedc --- /dev/null +++ b/tests/unit/genomics/test_genome_accession_resolver.py @@ -0,0 +1,203 @@ +from unittest.mock import patch +import pytest +from nplinker.genomics.antismash.genome_accession_resolver import 
get_latest_assembly_accession +from nplinker.genomics.antismash.genome_accession_resolver import resolve_genome_accession +from nplinker.genomics.antismash.genome_accession_resolver import validate_assembly_acc + + +USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:86.0) Gecko/20100101 Firefox/86.0" + + +@pytest.fixture +def mock_resolvers(): + with ( + patch( + "nplinker.genomics.antismash.genome_accession_resolver._resolve_refseq" + ) as mock_refseq, + patch( + "nplinker.genomics.antismash.genome_accession_resolver._resolve_genbank" + ) as mock_genbank, + patch("nplinker.genomics.antismash.genome_accession_resolver._resolve_jgi") as mock_jgi, + ): + yield mock_refseq, mock_genbank, mock_jgi + + +# test get_latest_assembly_accession + + +def test_get_latest_assembly_accession_refseq_success(): + with patch( + "nplinker.genomics.antismash.genome_accession_resolver._get_revision_history" + ) as mock_get_revision_history: + mock_get_revision_history.return_value = { + "assembly_revisions": [ + {"refseq_accession": "GCF_000001.2", "release_date": "2023-01-01"}, + {"refseq_accession": "GCF_000001.1", "release_date": "2022-01-01"}, + ] + } + + result = get_latest_assembly_accession("GCF_000001.1") + + assert result == "GCF_000001.2" + mock_get_revision_history.assert_called_once_with("GCF_000001.1") + + +def test_get_latest_assembly_accession_genbank_success(): + with patch( + "nplinker.genomics.antismash.genome_accession_resolver._get_revision_history" + ) as mock_get_revision_history: + mock_get_revision_history.return_value = { + "assembly_revisions": [ + {"genbank_accession": "GCA_000002.2", "release_date": "2023-01-01"}, + {"genbank_accession": "GCA_000002.1", "release_date": "2022-01-01"}, + ] + } + + result = get_latest_assembly_accession("GCA_000002.1") + + assert result == "GCA_000002.2" + mock_get_revision_history.assert_called_once_with("GCA_000002.1") + + +def test_get_latest_assembly_accession_refseq_and_genbank(): + with patch( + 
"nplinker.genomics.antismash.genome_accession_resolver._get_revision_history" + ) as mock_get_revision_history: + mock_get_revision_history.return_value = { + "assembly_revisions": [ + {"refseq_accession": "GCF_000001.2", "release_date": "2023-01-01"}, + {"genbank_accession": "GCA_000002.2", "release_date": "2022-01-01"}, + ] + } + + result = get_latest_assembly_accession("GCF_000001.1") + + assert result == "GCF_000001.2" + mock_get_revision_history.assert_called_once_with("GCF_000001.1") + + +def test_get_latest_assembly_accession_no_valid_accession(): + with patch( + "nplinker.genomics.antismash.genome_accession_resolver._get_revision_history" + ) as mock_get_revision_history: + mock_get_revision_history.return_value = {"assembly_revisions": []} + + with pytest.raises(ValueError, match="No valid genome accession found"): + get_latest_assembly_accession("GCF_000001.1") + + mock_get_revision_history.assert_called_once_with("GCF_000001.1") + + +# test resolve_for_genome_accession + + +def test_resolve_genome_accession_refseq_success(mock_resolvers): + mock_refseq, mock_genbank, mock_jgi = mock_resolvers + mock_refseq.return_value = "GCF_000001.2" + + genome_id_data = {"RefSeq_accession": "GCF_000001.1"} + result = resolve_genome_accession(genome_id_data) + + assert result == "GCF_000001.2" + mock_refseq.assert_called_once_with("GCF_000001.1") + mock_genbank.assert_not_called() + mock_jgi.assert_not_called() + + +def test_resolve_genome_accession_genbank_success(mock_resolvers): + mock_refseq, mock_genbank, mock_jgi = mock_resolvers + mock_refseq.side_effect = Exception("RefSeq resolution failed") + mock_genbank.return_value = "GCA_000002.2" + + genome_id_data = { + "RefSeq_accession": "GCF_000001.1", + "GenBank_accession": "GCA_000002.1", + } + result = resolve_genome_accession(genome_id_data) + + assert result == "GCA_000002.2" + mock_refseq.assert_called_once_with("GCF_000001.1") + mock_genbank.assert_called_once_with("GCA_000002.1") + 
mock_jgi.assert_not_called() + + +def test_resolve_genome_accession_jgi_success(mock_resolvers): + mock_refseq, mock_genbank, mock_jgi = mock_resolvers + mock_refseq.side_effect = Exception("RefSeq resolution failed") + mock_genbank.side_effect = Exception("GenBank resolution failed") + mock_jgi.return_value = "GCF_000001.2" + + genome_id_data = { + "RefSeq_accession": "GCF_000001.1", + "GenBank_accession": "GCA_000002.1", + "JGI_Genome_ID": "12345", + } + result = resolve_genome_accession(genome_id_data) + + assert result == "GCF_000001.2" + mock_refseq.assert_called_once_with("GCF_000001.1") + mock_genbank.assert_called_once_with("GCA_000002.1") + mock_jgi.assert_called_once_with("12345") + + +def test_resolve_genome_accession_no_valid_accession(mock_resolvers): + mock_refseq, mock_genbank, mock_jgi = mock_resolvers + mock_refseq.side_effect = Exception("RefSeq resolution failed") + mock_genbank.side_effect = Exception("GenBank resolution failed") + mock_jgi.side_effect = Exception("JGI resolution failed") + + genome_id_data = { + "RefSeq_accession": "GCF_000001.1", + "GenBank_accession": "GCA_000002.1", + "JGI_Genome_ID": "12345", + } + + with pytest.raises(RuntimeError, match="No valid assembly accessions found"): + resolve_genome_accession(genome_id_data) + + mock_refseq.assert_called_once_with("GCF_000001.1") + mock_genbank.assert_called_once_with("GCA_000002.1") + mock_jgi.assert_called_once_with("12345") + + +def test_resolve_genome_accession_missing_keys(mock_resolvers): + mock_refseq, mock_genbank, mock_jgi = mock_resolvers + + genome_id_data = {} + with pytest.raises(RuntimeError, match="No valid assembly accessions found"): + resolve_genome_accession(genome_id_data) + + mock_refseq.assert_not_called() + mock_genbank.assert_not_called() + mock_jgi.assert_not_called() + + +def test_validate_assembly_acc_valid(): + # Test valid GenBank accession + validate_assembly_acc("GCA_000002.1", "GenBank") + # Test valid RefSeq accession + 
validate_assembly_acc("GCF_000001.1", "RefSeq") + + +def test_validate_assembly_acc_invalid_type(): + # Test invalid accession type + with pytest.raises(ValueError, match="Invalid genome assembly accession type 'InvalidType'"): + validate_assembly_acc("GCF_000001.1", "InvalidType") + + +def test_validate_assembly_acc_invalid_prefix(): + # Test invalid prefix for GenBank accession + with pytest.raises(ValueError, match="Invalid GenBank assembly accession"): + validate_assembly_acc("GCF_000002.1", "GenBank") + # Test invalid prefix for RefSeq accession + with pytest.raises(ValueError, match="Invalid RefSeq assembly accession"): + validate_assembly_acc("GCA_000001.1", "RefSeq") + + +def test_validate_assembly_acc_missing_version(): + # Test missing version number for GenBank accession + with pytest.raises(ValueError, match="Invalid assembly accession \\(missing version number\\)"): + validate_assembly_acc("GCA_000002", "GenBank") + # Test missing version number for RefSeq accession + with pytest.raises(ValueError, match="Invalid assembly accession \\(missing version number\\)"): + validate_assembly_acc("GCF_000001", "RefSeq") diff --git a/tests/unit/genomics/test_ncbi_downloader.py b/tests/unit/genomics/test_ncbi_downloader.py new file mode 100644 index 00000000..bc1dda48 --- /dev/null +++ b/tests/unit/genomics/test_ncbi_downloader.py @@ -0,0 +1,45 @@ +from unittest.mock import patch +import httpx +import pytest +from nplinker.genomics.antismash.ncbi_downloader import download_and_extract_ncbi_genome + + +@pytest.fixture +def download_root(tmp_path): + return tmp_path / "download" + + +@pytest.fixture +def extract_root(tmp_path): + return tmp_path / "extracted" + + +def test_download_and_extract_ncbi_genome_success(download_root, extract_root): + assembly_accession = "GCF_000514775.1" + + genome_path = download_and_extract_ncbi_genome(assembly_accession, download_root, extract_root) + + assert genome_path == extract_root / "ncbi_genomes" / f"{assembly_accession}.gbff" 
+ assert not (extract_root / "ncbi_genomes" / "md5sum.txt").exists() + assert not (extract_root / "ncbi_genomes" / "README.md").exists() + assert not (extract_root / "ncbi_genomes" / "ncbi_dataset").exists() + + +def test_download_and_extract_ncbi_genome_max_retries(download_root, extract_root): + assembly_accession = "GCF_000514775.1" + + with patch( + "nplinker.genomics.antismash.ncbi_downloader.download_url", + side_effect=httpx.ReadTimeout("Download failed"), + ): + with pytest.raises(httpx.ReadTimeout, match="Maximum download retries"): + download_and_extract_ncbi_genome( + assembly_accession, download_root, extract_root, max_attempts=1 + ) + + +def test_download_and_extract_ncbi_genome_invalid_accession(download_root, extract_root): + assembly_accession = "invalid_ref_seq_id" + + with pytest.raises(ValueError, match="Not a valid genome assembly accession"): + download_and_extract_ncbi_genome(assembly_accession, download_root, extract_root) diff --git a/tests/unit/genomics/test_podp_antismash_downloader.py b/tests/unit/genomics/test_podp_antismash_downloader.py index 7fb9ec0c..6be23f7f 100644 --- a/tests/unit/genomics/test_podp_antismash_downloader.py +++ b/tests/unit/genomics/test_podp_antismash_downloader.py @@ -36,7 +36,7 @@ def genome_status_file(download_root): ) def test_genome_status_init(params, expected): gs = GenomeStatus(*params) - assert [gs.original_id, gs.resolved_refseq_id, gs.resolve_attempted, gs.bgc_path] == expected + assert [gs.original_id, gs.resolved_id, gs.failed_previously, gs.bgc_path] == expected def test_genome_status_read_json(tmp_path): @@ -44,14 +44,14 @@ def test_genome_status_read_json(tmp_path): "genome_status": [ { "original_id": "genome1", - "resolved_refseq_id": "refseq1", - "resolve_attempted": True, + "resolved_id": "refseq1", + "failed_previously": True, "bgc_path": "/path/to/bgc1", }, { "original_id": "genome2", - "resolved_refseq_id": "", - "resolve_attempted": False, + "resolved_id": "", + "failed_previously": False, 
"bgc_path": "", }, ], @@ -64,12 +64,12 @@ def test_genome_status_read_json(tmp_path): assert len(genome_status_dict) == 2 assert genome_status_dict["genome1"].original_id == "genome1" - assert genome_status_dict["genome1"].resolved_refseq_id == "refseq1" - assert genome_status_dict["genome1"].resolve_attempted is True + assert genome_status_dict["genome1"].resolved_id == "refseq1" + assert genome_status_dict["genome1"].failed_previously is True assert genome_status_dict["genome1"].bgc_path == "/path/to/bgc1" assert genome_status_dict["genome2"].original_id == "genome2" - assert genome_status_dict["genome2"].resolved_refseq_id == "" - assert genome_status_dict["genome2"].resolve_attempted is False + assert genome_status_dict["genome2"].resolved_id == "" + assert genome_status_dict["genome2"].failed_previously is False assert genome_status_dict["genome2"].bgc_path == "" @@ -86,12 +86,12 @@ def test_genome_status_to_json(tmp_path): assert loaded_data["version"] == "1.0" assert len(loaded_data["genome_status"]) == 2 assert loaded_data["genome_status"][0]["original_id"] == "genome1" - assert loaded_data["genome_status"][0]["resolved_refseq_id"] == "refseq1" - assert loaded_data["genome_status"][0]["resolve_attempted"] is True + assert loaded_data["genome_status"][0]["resolved_id"] == "refseq1" + assert loaded_data["genome_status"][0]["failed_previously"] is True assert loaded_data["genome_status"][0]["bgc_path"] == "/path/to/bgc1" assert loaded_data["genome_status"][1]["original_id"] == "genome2" - assert loaded_data["genome_status"][1]["resolved_refseq_id"] == "" - assert loaded_data["genome_status"][1]["resolve_attempted"] is False + assert loaded_data["genome_status"][1]["resolved_id"] == "" + assert loaded_data["genome_status"][1]["failed_previously"] is False assert loaded_data["genome_status"][1]["bgc_path"] == "" @@ -105,10 +105,10 @@ def test_genome_status_to_json_nofile(): assert isinstance(result, str) assert ( result == '{"genome_status": ' - 
'[{"original_id": "genome1", "resolved_refseq_id": "refseq1", ' - '"resolve_attempted": true, "bgc_path": "/path/to/bgc1"}, ' - '{"original_id": "genome2", "resolved_refseq_id": "", ' - '"resolve_attempted": false, "bgc_path": ""}], "version": "1.0"}' + '[{"original_id": "genome1", "resolved_id": "refseq1", ' + '"failed_previously": true, "bgc_path": "/path/to/bgc1"}, ' + '{"original_id": "genome2", "resolved_id": "", ' + '"failed_previously": false, "bgc_path": ""}], "version": "1.0"}' ) @@ -215,10 +215,10 @@ def test_caching(download_root, extract_root, genome_status_file, caplog): genome_status_old = GenomeStatus.read_json(genome_status_file) genome_obj = genome_status_old["GCF_000016425.1"] assert Path(genome_obj.bgc_path).exists() - assert genome_obj.resolve_attempted + assert genome_obj.failed_previously is False podp_download_and_extract_antismash_data(genome_records, download_root, extract_root) assert ( - f"Genome ID {genome_obj.original_id} already downloaded to {genome_obj.bgc_path}" + f"antiSMASH BGC data for genome ID {genome_obj.original_id} already downloaded to {genome_obj.bgc_path}" in caplog.text ) assert ( @@ -271,8 +271,8 @@ def test_refseq_id(download_root, extract_root, genome_status_file): genome_status = GenomeStatus.read_json(genome_status_file) genome_obj = genome_status["GCF_000016425.1"] - archive = download_root / Path(str(genome_obj.resolved_refseq_id) + ".zip") - extracted_folder = extract_root / "antismash" / genome_obj.resolved_refseq_id + archive = download_root / Path(str(genome_obj.resolved_id) + ".zip") + extracted_folder = extract_root / "antismash" / genome_obj.resolved_id extracted_files = list_files(extracted_folder, keep_parent=False) assert archive.exists() @@ -298,8 +298,8 @@ def test_genbank_id(download_root, extract_root, genome_status_file): genome_status = GenomeStatus.read_json(genome_status_file) genome_obj = genome_status["GCA_000016425.1"] - archive = download_root / Path(str(genome_obj.resolved_refseq_id) + 
".zip") - extracted_folder = extract_root / "antismash" / genome_obj.resolved_refseq_id + archive = download_root / Path(str(genome_obj.resolved_id) + ".zip") + extracted_folder = extract_root / "antismash" / genome_obj.resolved_id extracted_files = list_files(extracted_folder, keep_parent=False) assert archive.exists() @@ -325,8 +325,8 @@ def test_jgi_id(download_root, extract_root, genome_status_file): genome_status = GenomeStatus.read_json(genome_status_file) genome_obj = genome_status["640427140"] - archive = download_root / Path(str(genome_obj.resolved_refseq_id) + ".zip") - extracted_folder = extract_root / "antismash" / genome_obj.resolved_refseq_id + archive = download_root / Path(str(genome_obj.resolved_id) + ".zip") + extracted_folder = extract_root / "antismash" / genome_obj.resolved_id extracted_files = list_files(extracted_folder, keep_parent=False) assert archive.exists() @@ -357,8 +357,8 @@ def test_refseq_jgi_id(download_root, extract_root, genome_status_file): genome_status = GenomeStatus.read_json(genome_status_file) genome_obj = genome_status["GCF_000016425.1"] - archive = download_root / Path(str(genome_obj.resolved_refseq_id) + ".zip") - extracted_folder = extract_root / "antismash" / genome_obj.resolved_refseq_id + archive = download_root / Path(str(genome_obj.resolved_id) + ".zip") + extracted_folder = extract_root / "antismash" / genome_obj.resolved_id extracted_files = list_files(extracted_folder, keep_parent=False) assert archive.exists() @@ -389,8 +389,8 @@ def test_refseq_genbank_id(download_root, extract_root, genome_status_file): genome_status = GenomeStatus.read_json(genome_status_file) genome_obj = genome_status["GCF_000016425.1"] - archive = download_root / Path(str(genome_obj.resolved_refseq_id) + ".zip") - extracted_folder = extract_root / "antismash" / genome_obj.resolved_refseq_id + archive = download_root / Path(str(genome_obj.resolved_id) + ".zip") + extracted_folder = extract_root / "antismash" / genome_obj.resolved_id 
extracted_files = list_files(extracted_folder, keep_parent=False) assert archive.exists() @@ -421,8 +421,8 @@ def test_genbank_jgi_id(download_root, extract_root, genome_status_file): genome_status = GenomeStatus.read_json(genome_status_file) genome_obj = genome_status["GCA_000016425.1"] - archive = download_root / Path(str(genome_obj.resolved_refseq_id) + ".zip") - extracted_folder = extract_root / "antismash" / genome_obj.resolved_refseq_id + archive = download_root / Path(str(genome_obj.resolved_id) + ".zip") + extracted_folder = extract_root / "antismash" / genome_obj.resolved_id extracted_files = list_files(extracted_folder, keep_parent=False) assert archive.exists() diff --git a/tests/unit/genomics/test_utils.py b/tests/unit/genomics/test_utils.py index 8a17aa69..7d68eb66 100644 --- a/tests/unit/genomics/test_utils.py +++ b/tests/unit/genomics/test_utils.py @@ -192,20 +192,20 @@ def test_extract_mappings_original_genome_id_resolved_genome_id(tmp_path): "genome_status": [ { "original_id": "id1", - "resolved_refseq_id": "refseq1", - "resolve_attempted": True, + "resolved_id": "refseq1", + "failed_previously": True, "bgc_path": "", }, { "original_id": "id2", - "resolved_refseq_id": "refseq2", - "resolve_attempted": True, + "resolved_id": "refseq2", + "failed_previously": True, "bgc_path": "", }, { "original_id": "id3", - "resolved_refseq_id": "refseq3", - "resolve_attempted": True, + "resolved_id": "refseq3", + "failed_previously": True, "bgc_path": "", }, ], diff --git a/tests/unit/schemas/test_genome_status_schema.py b/tests/unit/schemas/test_genome_status_schema.py index de0679a5..ddde0fa9 100644 --- a/tests/unit/schemas/test_genome_status_schema.py +++ b/tests/unit/schemas/test_genome_status_schema.py @@ -10,9 +10,7 @@ data_empty_genome_status = {"genome_status": [], "version": "1.0"} data_no_original_id = { - "genome_status": [ - {"resolved_refseq_id": "id1_refseq", "resolve_attempted": True, "bgc_path": ""} - ], + "genome_status": [{"resolved_id": 
"id1_refseq", "failed_previously": True, "bgc_path": ""}], "version": "1.0", } @@ -20,8 +18,8 @@ "genome_status": [ { "original_id": "", - "resolved_refseq_id": "id1_refseq", - "resolve_attempted": True, + "resolved_id": "id1_refseq", + "failed_previously": True, "bgc_path": "", } ], @@ -32,37 +30,37 @@ "genome_status": [ { "original_id": 1, - "resolved_refseq_id": "id1_refseq", - "resolve_attempted": True, + "resolved_id": "id1_refseq", + "failed_previously": True, "bgc_path": "", } ], "version": "1.0", } -data_no_resolved_refseq_id = { - "genome_status": [{"original_id": "id1", "resolve_attempted": True, "bgc_path": ""}], +data_no_resolved_id = { + "genome_status": [{"original_id": "id1", "failed_previously": True, "bgc_path": ""}], "version": "1.0", } -data_invalid_resolved_refseq_id = { +data_invalid_resolved_id = { "genome_status": [ - {"original_id": "id1", "resolved_refseq_id": 1, "resolve_attempted": True, "bgc_path": ""} + {"original_id": "id1", "resolved_id": 1, "failed_previously": True, "bgc_path": ""} ], "version": "1.0", } -data_no_resolve_attempted = { - "genome_status": [{"original_id": "id1", "resolved_refseq_id": "id1_refseq", "bgc_path": ""}], +data_no_failed_previously = { + "genome_status": [{"original_id": "id1", "resolved_id": "id1_refseq", "bgc_path": ""}], "version": "1.0", } -data_invalid_resolve_attempted = { +data_invalid_failed_previously = { "genome_status": [ { "original_id": "id1", - "resolved_refseq_id": "id1_refseq", - "resolve_attempted": 1, + "resolved_id": "id1_refseq", + "failed_previously": 1, "bgc_path": "", } ], @@ -71,7 +69,7 @@ data_no_bgc_path = { "genome_status": [ - {"original_id": "id1", "resolved_refseq_id": "id1_refseq", "resolve_attempted": True} + {"original_id": "id1", "resolved_id": "id1_refseq", "failed_previously": True} ], "version": "1.0", } @@ -80,8 +78,8 @@ "genome_status": [ { "original_id": "id1", - "resolved_refseq_id": "id1_refseq", - "resolve_attempted": True, + "resolved_id": "id1_refseq", + 
"failed_previously": True, "bgc_path": 1, } ], @@ -94,7 +92,7 @@ data_empty_version = { "genome_status": [{"strain_id": "strain1", "strain_alias": ["alias1", "alias2"]}], - "version": "" "", + "version": "", } data_invalid_version = { @@ -112,10 +110,10 @@ [data_no_original_id, "'original_id' is a required property"], [data_empty_original_id, "'' should be non-empty"], [data_invalid_original_id, "1 is not of type 'string'"], - [data_no_resolved_refseq_id, "'resolved_refseq_id' is a required property"], - [data_invalid_resolved_refseq_id, "1 is not of type 'string'"], - [data_no_resolve_attempted, "'resolve_attempted' is a required property"], - [data_invalid_resolve_attempted, "1 is not of type 'boolean'"], + [data_no_resolved_id, "'resolved_id' is a required property"], + [data_invalid_resolved_id, "1 is not of type 'string'"], + [data_no_failed_previously, "'failed_previously' is a required property"], + [data_invalid_failed_previously, "1 is not of type 'boolean'"], [data_no_bgc_path, "'bgc_path' is a required property"], [data_invalid_bgc_path, "1 is not of type 'string'"], [data_no_version, "'version' is a required property"], @@ -135,14 +133,14 @@ def test_valid_data(): "genome_status": [ { "original_id": "id1", - "resolved_refseq_id": "id1_refseq", - "resolve_attempted": True, + "resolved_id": "id1_refseq", + "failed_previously": True, "bgc_path": "", }, { "original_id": "id2", - "resolved_refseq_id": "id2_refseq", - "resolve_attempted": False, + "resolved_id": "id2_refseq", + "failed_previously": False, "bgc_path": "", }, ],