diff --git a/.github/workflows/pytest_codecov.yml b/.github/workflows/pytest_codecov.yml
index bf35616e4..25ac64acf 100644
--- a/.github/workflows/pytest_codecov.yml
+++ b/.github/workflows/pytest_codecov.yml
@@ -32,19 +32,19 @@ jobs:
       PYTHON: ${{ matrix.python-version }}
     name: pytest & codecov
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Cache conda
-        uses: actions/cache@v2
+        uses: actions/cache@v4
        with:
          path: ~/conda_pkgs_dir
          key: ${{ runner.os }}-conda-py${{ matrix.python-version }}-${{ hashFiles('tests/environment.yml') }}
      - name: Cache test data
-        uses: actions/cache@v2
+        uses: actions/cache@v4
        with:
          path: tests/data/test_data.json
          key: ${{ runner.os }}-test-data
      - name: Setup mamba
-        uses: conda-incubator/setup-miniconda@v2
+        uses: conda-incubator/setup-miniconda@v3
        with:
          python-version: ${{ matrix.python-version }}
          mamba-version: "*"
@@ -70,7 +70,7 @@ jobs:
        shell: bash -l {0}
        run: python -m pytest --cov-report=xml --cov=autometa tests/
      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v2
+        uses: codecov/codecov-action@v5
        with:
          env_vars: OS,PYTHON
          flags: unittests
diff --git a/Makefile b/Makefile
index 71b0112ba..2a3b47556 100644
--- a/Makefile
+++ b/Makefile
@@ -28,6 +28,10 @@ clean:
	find . -type d -name "Autometa.egg-info" -exec rm -r {} +
	find . -type d -name "dist" -exec rm -r {} +
	find . -type d -name "build" -exec rm -r {} +
+	find . -name ".nextflow.log.*" -exec rm {} +
+	find . -name ".nextflow.log" -exec rm {} +
+	find . -type d -name ".nextflow" -exec rm -r {} +
+	find . -type d -name "work" -exec rm -r {} +

 ## Apply black formatting
 black:
diff --git a/autometa/config/databases.py b/autometa/config/databases.py
index 74b59e87c..14ea42c3e 100644
--- a/autometa/config/databases.py
+++ b/autometa/config/databases.py
@@ -9,6 +9,7 @@

 import logging
 import os
+from pathlib import Path
 import requests
 import sys
 import subprocess
@@ -33,8 +34,15 @@
 from autometa.config.utilities import DEFAULT_CONFIG
 from autometa.config.utilities import AUTOMETA_DIR
 from autometa.config.utilities import put_config, get_config
-from autometa.taxonomy.gtdb import create_gtdb_db
-
+from autometa.taxonomy.download_gtdb_files import (
+    create_combined_gtdb_fasta,
+    download_gtdb_taxdump,
+    download_proteins_aa_reps,
+    get_latest_gtdb_version,
+    unpack_gtdb_taxdump,
+)
+
 logger = logging.getLogger(__name__)
 urllib_logger = logging.getLogger("urllib3")
@@ -404,29 +412,65 @@ def download_ncbi_files(self, options: Iterable) -> None:
         if "nr" in options:
             self.format_nr()

-    def download_gtdb_files(self) -> None:
-        gtdb_taxdump_url = self.config.get("database_urls", "gtdb_taxdmp")
-        proteins_aa_reps_url = self.config.get("database_urls", "proteins_aa_reps")
-
-        # User path:
-        gtdb_taxdump_filepath = self.config.get("gtdb", "gtdb_taxdmp")
-        proteins_aa_reps_filepath = self.config.get("gtdb", "proteins_aa_reps")
-
-        urls = [gtdb_taxdump_url, proteins_aa_reps_url]
-        filepaths = [gtdb_taxdump_filepath, proteins_aa_reps_filepath]
-
-        logger.debug(f"starting GTDB databases download")
-        for url, filepath in zip(urls, filepaths):
-            cmd = ["wget", url, "-O", filepath]
-            full_path = os.path.abspath(filepath)
-            dir_path = os.path.dirname(full_path)
-            if not os.path.exists(dir_path):
-                os.makedirs(dir_path)
-                logger.debug(f"Created missing database directory: {dir_path}")
-            logger.debug(" ".join(cmd))
-            subprocess.run(
-                cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True
-            )
+    def download_and_format_gtdb_files(self) -> dict:
+        # remote host and release
+        gtdb_host = self.config.get(
+            "gtdb", "host"
+        )  # e.g. data.ace.uq.edu.au/public/gtdb/data
+        gtdb_version = self.config.get("gtdb", "release")  # e.g. latest, 220
+        # local file parent directories
+        gtdb_taxdmp_directory = self.config.get("gtdb", "gtdb_taxdmp")
+        proteins_aa_reps_directory = self.config.get("gtdb", "proteins_aa_reps")
+        # ensure the directories exist
+        if not Path(gtdb_taxdmp_directory).exists():
+            logger.info(f"Creating directory: {gtdb_taxdmp_directory}")
+            Path(gtdb_taxdmp_directory).mkdir(parents=True)
+        if not Path(proteins_aa_reps_directory).exists():
+            logger.info(f"Creating directory: {proteins_aa_reps_directory}")
+            Path(proteins_aa_reps_directory).mkdir(parents=True)
+
+        if gtdb_version == "latest":
+            gtdb_version = get_latest_gtdb_version(gtdb_host)
+            logger.info(f"Using 'latest' GTDB version: {gtdb_version}")
+            self.config.set("gtdb", "release", gtdb_version)
+
+        if "." in gtdb_version:
+            # split the subversion off *before* overwriting gtdb_version
+            gtdb_version, gtdb_subversion = gtdb_version.split(".", maxsplit=1)
+        else:
+            gtdb_subversion = "0"
+        if int(gtdb_version) < 220:
+            raise ValueError("GTDB versions <220 cannot be used due to file differences")
+        gtdb_taxdmp_path = Path(
+            gtdb_taxdmp_directory, f"gtdb-taxdump-version-{gtdb_version}.tar.gz"
+        )
+        aa_reps_path = Path(
+            proteins_aa_reps_directory,
+            f"gtdb_proteins_aa_reps-version-{gtdb_version}.{gtdb_subversion}.tar.gz",
+        )
+        gtdb_taxdmp_path = download_gtdb_taxdump(
+            gtdb_version=gtdb_version, outpath=gtdb_taxdmp_path
+        )
+        taxdmp_dir = unpack_gtdb_taxdump(
+            tar_file=gtdb_taxdmp_path, gtdb_version=gtdb_version
+        )
+        combined_faa_path = Path(
+            self.config.get("databases", "gtdb"),
+            f"autometa_formatted_gtdb-version-{gtdb_version}.{gtdb_subversion}.faa.gz",
+        )
+        aa_reps_path = download_proteins_aa_reps(
+            host=gtdb_host,
+            version=gtdb_version,
+            subversion=gtdb_subversion,
+            outpath=aa_reps_path,
+        )
+        create_combined_gtdb_fasta(tar_file=aa_reps_path, outpath=combined_faa_path)
+        return {
+            "taxdmp_dir": taxdmp_dir,
+            "gtdb_aa_reps_path": aa_reps_path,
+            "combined_faa_path": combined_faa_path,
+        }

     def press_hmms(self) -> None:
         """hmmpress markers hmm database files.
@@ -809,19 +853,15 @@ def main():
     elif args.update_ncbi:
         section = "ncbi"
     elif args.update_gtdb:
-        if not os.path.exists(
-            dbs.config.get("gtdb", "proteins_aa_reps")
-        ) and not os.path.exists(dbs.config.get("gtdb", "gtdb_taxdmp")):
-            logger.info(f"GTDB database downloading: ")
-            dbs.download_gtdb_files()
-        # Format GTDB amino acid database
-        gtdb_combined = create_gtdb_db(
-            reps_faa=dbs.config.get("gtdb", "proteins_aa_reps"),
-            dbdir=dbs.config.get("databases", "gtdb"),
-        )
+        paths = dbs.download_and_format_gtdb_files()
+
+        database_path = str(paths.get("combined_faa_path")).replace(".faa.gz", ".dmnd")
+        if os.path.exists(database_path):
+            logger.info(f"GTDB DIAMOND database already exists: {database_path}")
+            sys.exit(0)
         diamond.makedatabase(
-            fasta=gtdb_combined,
-            database=gtdb_combined.replace(".faa", ".dmnd"),
+            fasta=str(paths.get("combined_faa_path")),
+            database=database_path,
             cpus=args.nproc,
         )
         sys.exit(0)
diff --git a/autometa/config/default.config b/autometa/config/default.config
index 60a474f64..02880fb4d 100644
--- a/autometa/config/default.config
+++ b/autometa/config/default.config
@@ -61,9 +61,6 @@ bacteria_single_copy = https://${markers:host}/KwanLab/Autometa/main/autometa/da
 bacteria_single_copy_cutoffs = https://${markers:host}/KwanLab/Autometa/main/autometa/databases/markers/bacteria.single_copy.cutoffs
 archaea_single_copy = https://${markers:host}/KwanLab/Autometa/main/autometa/databases/markers/archaea.single_copy.hmm
 archaea_single_copy_cutoffs = https://${markers:host}/KwanLab/Autometa/main/autometa/databases/markers/archaea.single_copy.cutoffs
-proteins_aa_reps = https://${gtdb:host}/releases/${gtdb:release}/genomic_files_reps/gtdb_proteins_aa_reps.tar.gz
-gtdb_taxdmp = https://github.com/shenwei356/gtdb-taxdump/releases/latest/download/gtdb-taxdump.tar.gz
-

 [checksums]
 taxdump = ftp://${ncbi:host}/pub/taxonomy/taxdump.tar.gz.md5
@@ -85,10 +82,10 @@
 accession2taxid = ${databases:ncbi}/prot.accession2taxid.gz
 nr = ${databases:ncbi}/nr.gz

 [gtdb]
-host = data.gtdb.ecogenomic.org
-release = latest
-proteins_aa_reps = ${databases:gtdb}/gtdb_proteins_aa_reps.tar.gz
-gtdb_taxdmp = ${databases:gtdb}/gtdb-taxdump.tar.gz
+host = data.ace.uq.edu.au/public/gtdb/data
+release = 220
+proteins_aa_reps = ${databases:gtdb}
+gtdb_taxdmp = ${databases:gtdb}

 [markers]
 host = raw.githubusercontent.com
diff --git a/autometa/taxonomy/download_gtdb_files.py b/autometa/taxonomy/download_gtdb_files.py
new file mode 100644
index 000000000..ceabc7225
--- /dev/null
+++ b/autometa/taxonomy/download_gtdb_files.py
@@ -0,0 +1,314 @@
+import gzip
+import hashlib
+import logging
+import math
+import re
+import tarfile
+from pathlib import Path
+
+import requests
+from tqdm import tqdm
+
+# Set up logger
+logger = logging.getLogger(__name__)
+
+
+# --------------------- MD5 Checksum Calculation ---------------------
+def calculate_md5(filepath, chunk_size=1024 * 1024):
+    """Calculates the MD5 checksum of a file"""
+    md5 = hashlib.md5()
+    with open(filepath, "rb") as f:
+        while chunk := f.read(chunk_size):
+            md5.update(chunk)
+    return md5.hexdigest()
+
+
+# --------------------- GTDB Version Handling ---------------------
+def get_latest_gtdb_version(host):
+    """Fetches the latest GTDB version number from the GTDB server"""
+    try:
+        response = requests.get(f"https://{host}/releases/latest/VERSION.txt")
+        response.raise_for_status()
+        version = response.text.splitlines()[0]
+        version = version[1:] if version.startswith("v") else version
+        return version
+    except requests.exceptions.RequestException as e:
+        raise RuntimeError(f"Failed to fetch GTDB version: {e}")
+
+
+# --------------------- Taxdump Release URL Fetch ---------------------
+def get_gtdb_taxdump_release_url(gtdb_version):
+    """Finds the download URL for the GTDB taxdump file for a specific GTDB release"""
+    if gtdb_version == "latest":
+        raise ValueError(
+            "Latest version not supported. Please specify a version number."
+        )
+    try:
+        releases_url = "https://api.github.com/repos/shenwei356/gtdb-taxdump/releases"
+        response = requests.get(releases_url)
+        response.raise_for_status()
+        releases = response.json()
+        version_used = str(math.floor(float(gtdb_version)))
+
+        for release in releases:
+            if f"r{version_used}" in release["name"]:
+                for asset in release["assets"]:
+                    if "gtdb-taxdump.tar.gz" in asset["name"]:
+                        download_url = asset["browser_download_url"]
+                        logger.info(f"Download URL found: {download_url}")
+                        return download_url
+        logger.error(f"Version r{version_used} not found.")
+        return None
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Failed to fetch releases: {e}")
+        return None
+
+
+# --------------------- GTDB Taxdump Download ---------------------
+def download_gtdb_taxdump(gtdb_version, outpath, force=False):
+    """Downloads the GTDB taxdump file for a specific GTDB release"""
+    if not force and Path(outpath).exists():
+        logger.info(f"File already exists: {outpath}")
+        return outpath
+    download_url = get_gtdb_taxdump_release_url(gtdb_version)
+    if not download_url:
+        # Fail loudly rather than return a path to a file that was never written
+        raise RuntimeError(f"No download URL found for GTDB version {gtdb_version}")
+    try:
+        response = requests.get(download_url, stream=True)
+        response.raise_for_status()
+        total_size = int(response.headers.get("content-length", 0))
+        chunk_size = 1024
+        with tqdm(
+            total=total_size, unit="B", unit_scale=True, desc=str(outpath)
+        ) as pbar:
+            with open(outpath, "wb") as f:
+                for chunk in response.iter_content(chunk_size=chunk_size):
+                    if chunk:
+                        f.write(chunk)
+                        pbar.update(len(chunk))
+        logger.info(f"Download complete. File saved to: {outpath}")
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Failed to download GTDB taxdump: {e}")
+    except IOError as e:
+        logger.error(f"File write error: {e}")
+    return outpath
+
+
+def unpack_gtdb_taxdump(tar_file, gtdb_version, outdir=None, force=False):
+    """Extracts the GTDB taxdump file and renames the directory to include the GTDB version"""
+    if not outdir:
+        outdir = Path(tar_file).parent
+    target_dir = f"gtdb-taxdump/R{gtdb_version}"
+    new_dir_prefix = f"gtdb_taxdump-version-{gtdb_version}"
+    if not force and Path(outdir, new_dir_prefix).exists():
+        logger.info(
+            f"Directory already exists: {outdir}/{new_dir_prefix}, use --force to overwrite."
+        )
+        return Path(outdir, new_dir_prefix)
+    with tarfile.open(tar_file, "r:gz") as tar:
+        members = []
+        for member in tar.getmembers():
+            if member.name.startswith(target_dir):
+                # Adjust the path to rename the folder on extraction
+                member.name = member.name.replace(target_dir, new_dir_prefix, 1)
+                members.append(member)
+        if members:
+            tar.extractall(outdir, members=members)
+            logger.info(
+                f"Extracted and renamed {target_dir} to {new_dir_prefix} in {outdir}"
+            )
+        else:
+            logger.error(f"Directory {target_dir} not found in the archive.")
+    return Path(outdir, new_dir_prefix)
+
+
+# --------------------- Proteins AA Reps Download with MD5 Verification ---------------------
+def download_proteins_aa_reps(host, version, subversion, outpath, force=False):
+    """Downloads the GTDB proteins_aa_reps tarball for a specific GTDB release, with MD5 checksum verification"""
+    if not force and Path(outpath).exists():
+        logger.info(f"File already exists: {outpath}")
+        return outpath
+    if version == "latest":
+        try:
+            version = get_latest_gtdb_version(host)
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Failed to fetch GTDB version number: {e}")
+            raise
+    logger.info(f"Downloading gtdb_proteins_aa_reps.tar.gz, version {version}")
+    try:
+        md5sum_url = f"https://{host}/releases/release{version}/{version}.{subversion}/MD5SUM.txt"
+        response = requests.get(md5sum_url)
+        response.raise_for_status()
+        md5sum_lines = response.text.splitlines()
+        expected_md5 = None
+        filename = f"genomic_files_reps/gtdb_proteins_aa_reps_r{version}.tar.gz"
+        for line in md5sum_lines:
+            if filename in line:
+                expected_md5 = line.split()[0]
+                break
+        if not expected_md5:
+            raise RuntimeError(
+                f"MD5 checksum for version {version} not found in {md5sum_url}."
+            )
+    except requests.exceptions.RequestException as e:
+        raise RuntimeError(f"Failed to fetch MD5SUM.txt: {e}")
+    url = f"https://{host}/releases/release{version}/{version}.{subversion}/genomic_files_reps/gtdb_proteins_aa_reps_r{version}.tar.gz"
+    try:
+        with requests.get(url, stream=True) as r:
+            r.raise_for_status()
+            logger.info(f"Downloading from {url}")
+            total_size = int(r.headers.get("content-length", 0))
+            chunk_size = 1024 * 1024
+            md5 = hashlib.md5()
+
+            with tqdm(
+                total=total_size, unit="B", unit_scale=True, desc=str(outpath)
+            ) as pbar:
+                with open(outpath, "wb") as f:
+                    for chunk in r.iter_content(chunk_size=chunk_size):
+                        if chunk:
+                            f.write(chunk)
+                            pbar.update(len(chunk))
+                            md5.update(chunk)
+        calculated_md5 = md5.hexdigest()
+        if calculated_md5 == expected_md5:
+            logger.info(f"MD5 checksum verification passed for {outpath}.")
+        else:
+            logger.error(
+                f"MD5 checksum verification failed for {outpath}. Expected {expected_md5}, got {calculated_md5}."
+            )
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Failed to download the file: {e}")
+    except IOError as e:
+        logger.error(f"File write error: {e}")
+    return outpath
+
+
+# --------------------- Combined GTDB FASTA Creation ---------------------
+def create_combined_gtdb_fasta(tar_file: str, outpath: str, force=False):
+    """
+    Generate a combined faa file used to create the GTDB DIAMOND database.
+
+    Parameters
+    ----------
+    tar_file : str
+        The downloaded gtdb_proteins_aa_reps_*.tar.gz file.
+    outpath : str
+        Path to the combined FASTA file to be written.
+    force : bool, optional
+        If True, overwrite the output file if it already exists, by default False
+
+    Returns
+    -------
+    str
+        Path to combined faa file. This can be used to make a diamond database.
+    """
+    # Check if the output file already exists
+    if not force and Path(outpath).exists():
+        logger.info(f"File already exists: {outpath}")
+        return outpath
+    # Open the combined output file
+    with gzip.open(outpath, "wt") as f_out:
+        # Open the tar.gz archive
+        with tarfile.open(tar_file, "r:gz") as tar:
+            # Initialize tqdm progress bars
+            with tqdm(
+                desc="Files read", unit="file", position=0, leave=True
+            ) as file_pbar, tqdm(
+                desc="Sequences written", unit="seq", position=1, leave=True
+            ) as seq_pbar:
+                # Iterate over members in the tar file
+                for member in tar:
+                    # Check if the member is a file and ends with .faa.gz
+                    if member.isfile() and member.name.endswith(".faa.gz"):
+                        # Search for genome accession in the file name
+                        genome_acc_search = re.search(
+                            r"(GCA_\d+\.\d+|GCF_\d+\.\d+)", member.name
+                        )
+                        if genome_acc_search:
+                            genome_acc = genome_acc_search.group()
+                        else:
+                            raise ValueError(
+                                f"Could not find genome accession for {member.name}"
+                            )
+                        # Extract and read the content of the .faa.gz file
+                        with tar.extractfile(member) as f_in:
+                            with gzip.GzipFile(fileobj=f_in) as gz_in:
+                                for line in gz_in:
+                                    line = line.decode("utf-8")
+                                    if line.startswith(">"):
+                                        seqheader = line.lstrip(">").strip()
+                                        outline = f">{genome_acc} {seqheader}\n"
+                                        # Count each record as its header is seen
+                                        seq_pbar.update(1)
+                                    else:
+                                        outline = line
+                                    f_out.write(outline)
+                        # Update file progress bar after processing each file
+                        file_pbar.update(1)
+    logger.debug(f"Combined GTDB faa file written to {outpath}")
+    return outpath
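+
+
+# Illustrative usage (a sketch; the paths are examples, not enforced by this module):
+#
+#     create_combined_gtdb_fasta(
+#         tar_file="gtdb_proteins_aa_reps-version-220.0.tar.gz",
+#         outpath="autometa_formatted_gtdb-version-220.0.faa.gz",
+#     )
+#
+# The resulting gzipped FASTA has each header prefixed with its genome accession,
+# so downstream steps can map each protein back to its source genome.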
+
+
+def download_and_format(gtdb_host, gtdb_version, single_dir, force=False):
+    """
+    Download and format the GTDB files.
+
+    Parameters
+    ----------
+    gtdb_host : str
+        The GTDB host to download files from.
+    gtdb_version : str
+        The GTDB version to download.
+    single_dir : str
+        The single directory to download and format files into.
+    force : bool, optional
+        If True, re-download and overwrite existing files, by default False
+    """
+    if gtdb_version == "latest":
+        gtdb_version = get_latest_gtdb_version(gtdb_host)
+        logger.info(f"Using 'latest' GTDB version: {gtdb_version}")
+
+    if "." in gtdb_version:
+        # split the subversion off *before* overwriting gtdb_version
+        gtdb_version, gtdb_subversion = gtdb_version.split(".", maxsplit=1)
+    else:
+        gtdb_subversion = "0"
+    gtdb_taxdmp_path = Path(single_dir, f"gtdb-taxdump-version-{gtdb_version}.tar.gz")
+    # have to rename because the GTDB file doesn't have the subversion in its name
+    aa_reps_path = Path(
+        single_dir,
+        f"gtdb_proteins_aa_reps-version-{gtdb_version}.{gtdb_subversion}.tar.gz",
+    )
+    gtdb_taxdmp_path = download_gtdb_taxdump(
+        gtdb_version=gtdb_version, outpath=gtdb_taxdmp_path, force=force
+    )
+    taxdmp_dir = unpack_gtdb_taxdump(
+        tar_file=gtdb_taxdmp_path, gtdb_version=gtdb_version, force=force
+    )
+    aa_reps_path = download_proteins_aa_reps(
+        host=gtdb_host,
+        version=gtdb_version,
+        subversion=gtdb_subversion,
+        outpath=aa_reps_path,
+        force=force,
+    )
+    combined_gtdb_fasta = create_combined_gtdb_fasta(
+        tar_file=aa_reps_path,
+        outpath=Path(
+            single_dir,
+            f"autometa_formatted_gtdb-version-{gtdb_version}.{gtdb_subversion}.faa.gz",
+        ),
+        force=force,
+    )
+    return {
+        "gtdb_taxdmp_path": gtdb_taxdmp_path,
+        "taxdmp_dir": taxdmp_dir,
+        "aa_reps_path": aa_reps_path,
+        "combined_gtdb_fasta": combined_gtdb_fasta,
+    }
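+
+
+if __name__ == "__main__":
+    # Minimal manual-run sketch (an illustration only; this module is not a
+    # supported CLI): fetch and format release 220 into a local ./gtdb directory.
+    logging.basicConfig(level=logging.INFO)
+    Path("gtdb").mkdir(exist_ok=True)
+    paths = download_and_format(
+        gtdb_host="data.ace.uq.edu.au/public/gtdb/data",
+        gtdb_version="220",
+        single_dir="gtdb",
+    )
+    logger.info(paths)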
diff --git a/autometa/taxonomy/gtdb.py b/autometa/taxonomy/gtdb.py
index 36590b9b0..6af8fd92e 100644
--- a/autometa/taxonomy/gtdb.py
+++ b/autometa/taxonomy/gtdb.py
@@ -9,14 +9,12 @@
 import gzip
 import logging
 import os
-import re
-import tarfile
-import glob

 from typing import Dict, Set, Tuple
 from itertools import chain
 from tqdm import tqdm
 from typing import Dict
+from configparser import ConfigParser

 import pandas as pd
 import multiprocessing as mp
@@ -24,91 +22,47 @@
 from autometa.common.utilities import file_length, is_gz_file
 from autometa.common.external import diamond
 from autometa.taxonomy.database import TaxonomyDatabase
-
+from autometa.taxonomy.download_gtdb_files import (
+    create_combined_gtdb_fasta,
+    get_latest_gtdb_version,
+)
+from autometa.config.utilities import DEFAULT_CONFIG

 logger = logging.getLogger(__name__)


-def create_gtdb_db(reps_faa: str, dbdir: str) -> str:
-    """
-    Generate a combined faa file to create the GTDB-t database.
-
-    Parameters
-    ----------
-    reps_faa : str
-        Directory having faa file of all representative genomes. Can be tarballed.
-    dbdir : str
-        Path to output directory.
-
-    Returns
-    -------
-    str
-        Path to combined faa file. This can be used to make a diamond database.
-    """
-
-    if reps_faa.endswith(".tar.gz"):
-        logger.debug(
-            f"Extracting tarball containing GTDB ref genome animo acid data sequences to: {dbdir}/protein_faa_reps"
-        )
-        tar = tarfile.open(reps_faa)
-        tar.extractall(path=dbdir)
-        tar.close()
-        logger.debug("Extraction done.")
-        reps_faa = dbdir
-
-    genome_protein_faa_filepaths = glob.glob(
-        os.path.join(reps_faa, "**", "*_protein.faa*"),
-        recursive=True
-        # To find *_protein.faa and *_protein.faa.gz files
-    )
-
-    faa_index: Dict[str, str] = {}
-    for genome_protein_faa_filepath in genome_protein_faa_filepaths:
-        # Regex to get the genome accession from the file path
-        genome_acc_search = re.search(
-            r"GCA_\d+.\d?|GCF_\d+.\d?", genome_protein_faa_filepath
-        )
-        if genome_acc_search:
-            faa_index[genome_protein_faa_filepath] = genome_acc_search.group()
-
-    # Create dbdir if it doesn't exist
-    if not os.path.isdir(dbdir):
-        os.makedirs(dbdir)
-
-    logger.debug(f"Merging {len(faa_index):,} faa files.")
-    combined_faa = os.path.join(dbdir, "gtdb.faa")
-    with open(combined_faa, "w") as f_out:
-        for faa_file, acc in faa_index.items():
-            with gzip.open(faa_file, "rb") as f_in:
-                for line in f_in:
-                    line = line.decode("utf-8")
-                    if line.startswith(">"):
-                        seqheader = line.lstrip(">").strip()
-                        outline = f">{acc} {seqheader}\n"
-                    else:
-                        outline = line
-                    f_out.write(outline)
-    logger.debug(f"Combined GTDB faa file written to {combined_faa}")
-    return combined_faa
-
-
 class GTDB(TaxonomyDatabase):
     """Taxonomy utilities for GTDB databases."""

-    def __init__(self, dbdir: str, verbose: bool = True):
+    def __init__(self, dbdir: str, verbose: bool = True, config=DEFAULT_CONFIG):
         """
         Instantiates the GTDB class
         """
+        if not isinstance(config, ConfigParser):
+            raise TypeError(f"config is not ConfigParser : {type(config)}")
+        self.config = config
+        # before instantiating the class, check if the GTDB database is present
+        gtdb_version = self.config.get("gtdb", "release")
+        if gtdb_version == "latest":
+            gtdb_version = get_latest_gtdb_version(self.config.get("gtdb", "host"))
+            logger.info(f"Using 'latest' GTDB version: {gtdb_version}")
+        if "." in gtdb_version:
+            # split the subversion off *before* overwriting gtdb_version
+            gtdb_version, gtdb_subversion = gtdb_version.split(".", maxsplit=1)
+        else:
+            gtdb_subversion = "0"
         self.dbdir = dbdir
         self.verbose = verbose
         self.disable = not self.verbose
-        self.dmnd_db = os.path.join(self.dbdir, "gtdb.dmnd")
-        self.accession2taxid = os.path.join(self.dbdir, "taxid.map")
-        self.nodes_fpath = os.path.join(self.dbdir, "nodes.dmp")
-        self.names_fpath = os.path.join(self.dbdir, "names.dmp")
-        self.merged_fpath = os.path.join(self.dbdir, "merged.dmp")
-        self.delnodes_fpath = os.path.join(self.dbdir, "delnodes.dmp")
+        self.dmnd_db = os.path.join(
+            self.config.get("databases", "gtdb"),
+            f"autometa_formatted_gtdb-version-{gtdb_version}.{gtdb_subversion}.dmnd",
+        )
+        self.accession2taxid = os.path.join(dbdir, "taxid.map")
+        self.nodes_fpath = os.path.join(dbdir, "nodes.dmp")
+        self.names_fpath = os.path.join(dbdir, "names.dmp")
+        self.merged_fpath = os.path.join(dbdir, "merged.dmp")
+        self.delnodes_fpath = os.path.join(dbdir, "delnodes.dmp")
         self.verify_databases()
         self.names = self.parse_names()
         self.nodes = self.parse_nodes()
@@ -341,7 +295,7 @@ def main():
     parser.add_argument(
         "--reps-faa",
-        help="Path to directory containing GTDB ref genome animo acid data sequences. Can be tarballed.",
+        help="Path to the tarballed GTDB reference genome amino acid data sequences (gtdb_proteins_aa_reps_*.tar.gz).",
         required=True,
     )
     parser.add_argument(
@@ -355,7 +309,8 @@ def main():

     args = parser.parse_args()

-    gtdb_combined = create_gtdb_db(reps_faa=args.reps_faa, dbdir=args.dbdir)
+    gtdb_combined = create_combined_gtdb_fasta(
+        tar_file=args.reps_faa,
+        outpath=os.path.join(args.dbdir, "gtdb.faa.gz"),
+    )
+
     diamond.makedatabase(
         fasta=gtdb_combined,
-        database=gtdb_combined.replace(".faa", ".dmnd"),
+        database=gtdb_combined.replace(".faa.gz", ".dmnd"),
         cpus=args.nproc,
diff --git a/docs/source/databases.rst b/docs/source/databases.rst
index 6850d7550..68a0aee16 100644
--- a/docs/source/databases.rst
+++ b/docs/source/databases.rst
@@ -13,17 +13,17 @@ Markers
 #######

 .. code-block:: bash
-    
+
     # Point Autometa to where you would like your markers database directory
     autometa-config \
         --section databases --option markers \
         --value <path/to/markers/database/directory>
-    
+
     # Update your markers database directory
     autometa-update-databases --update-markers

 .. alert::
-    
+
     Do NOT use a trailing slash, e.g. NO ``/`` for the database directory paths!

 Links to these markers files and their associated cutoff values are below:
@@ -68,7 +68,7 @@ Genome Taxonomy Database (GTDB)
 ###############################

 If you would like to incorporate the benefits of using the Genome Taxonomy Database,
-you can either run the following script or manually download the respective databases.
+you can either run the following script or manually download the respective databases. GTDB version 220 or later is required.

 .. code-block:: bash
@@ -81,7 +81,7 @@ you can either run the following script or manually download the respective data
     autometa-config \
         --section gtdb --option release \
         --value latest
-    # Or --value r207 or --value r202, etc.
+    # Or a version number like ``--value 220``, or ``--value 220.0``, etc.

     # Download and format the configured GTDB databases release
     autometa-update-databases --update-gtdb
@@ -93,28 +93,21 @@
 See ``autometa-update-databases -h`` and ``autometa-config -h`` for full list of options.

-The previous command will download the following GTDB databases and format the `gtdb_proteins_aa_reps.tar.gz` to generate `gtdb.dmnd` to be used by Autometa:
+The previous command will download the following GTDB databases and format them for use by Autometa. The filenames will be modified to include the release version number for reproducibility.
+
+The original files:

 - Amino acid sequences of representative genome
-  - `gtdb_proteins_aa_reps.tar.gz <https://data.gtdb.ecogenomic.org/releases/latest/genomic_files_reps/gtdb_proteins_aa_reps.tar.gz>`_
+  - `gtdb_proteins_aa_reps.tar.gz <https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/genomic_files_reps/gtdb_proteins_aa_reps.tar.gz>`_
 - gtdb-taxdump.tar.gz from `shenwei356/gtdb-taxdump <https://github.com/shenwei356/gtdb-taxdump>`_
   - `gtdb-taxdump.tar.gz <https://github.com/shenwei356/gtdb-taxdump/releases/latest/download/gtdb-taxdump.tar.gz>`_

-Once unzipped `gtdb-taxdump.tar.gz` will have the taxdump files of all the respective GTDB releases.
-Make sure that the release you use is in line with the `gtdb_proteins_aa_reps.tar.gz` release version.
-It's better to always use the latest version.
-
-All the taxonomy files for a specific taxonomy database should be in a single directory.
-You can now copy the taxdump files of the desired release version in the sample directory as `gtdb.dmnd`
+The initial download and formatting of the GTDB databases can take some time. The GTDB databases are large, and downloading/formatting requires ~283 GB of hard disk space.

-Alternatively if you have manually downloaded `gtdb_proteins_aa_reps.tar.gz` and `gtdb-taxdump.tar.gz` you can run the
-following command to format the `gtdb_proteins_aa_reps.tar.gz` to generate `gtdb.dmnd` and make it ready for Autometa.
-
-.. code-block:: bash
-
-   autometa-setup-gtdb --reps-faa <reps_faa> --dbdir <dbdir> --cpus 20
-
-.. note::
-
-    Again Make sure that the formatted `gtdb_proteins_aa_reps.tar.gz` database and gtdb taxdump files are in the same directory.
+For version 220, the file sizes are approximately:
+
+- 77 MB gtdb-taxdump-version-220.tar.gz
+- 67 GB gtdb_proteins_aa_reps-version-220.tar.gz
+- 149 GB autometa_formatted_gtdb-version-220.0.dmnd
+- 103 MB gtdb_taxdump-version-220/
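+
+After the downloads finish, ``autometa-update-databases --update-gtdb`` also builds the
+DIAMOND database from the combined FASTA, skipping the build if the ``.dmnd`` file
+already exists. As a rough sketch, a formatted database directory for release 220 is
+expected to contain files along these lines (names depend on the configured release):
+
+.. code-block:: bash
+
+    ls <path/to/gtdb/database/directory>
+    # gtdb-taxdump-version-220.tar.gz
+    # gtdb_proteins_aa_reps-version-220.0.tar.gz
+    # gtdb_taxdump-version-220/
+    # autometa_formatted_gtdb-version-220.0.faa.gz
+    # autometa_formatted_gtdb-version-220.0.dmnd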
diff --git a/tests/unit_tests/test_gtdb_download.py b/tests/unit_tests/test_gtdb_download.py
new file mode 100644
index 000000000..9f9a9c08d
--- /dev/null
+++ b/tests/unit_tests/test_gtdb_download.py
@@ -0,0 +1,173 @@
+import pytest
+from unittest import mock
+import requests
+import logging
+
+from autometa.taxonomy.download_gtdb_files import (
+    get_latest_gtdb_version,
+    get_gtdb_taxdump_release_url,
+    download_gtdb_taxdump,
+    download_proteins_aa_reps,
+)
+
+# Module-level logger for these tests
+logger = logging.getLogger(__name__)
+
+
+@pytest.fixture
+def mock_requests_get():
+    with mock.patch("requests.get") as mock_get:
+        yield mock_get
+
+
+def test_get_latest_gtdb_version_success(mock_requests_get):
+    # Mock a successful request response
+    mock_response = mock.Mock()
+    mock_response.status_code = 200
+    mock_response.text = "v220\n"
+    mock_requests_get.return_value = mock_response
+
+    host = "data.ace.uq.edu.au/public/gtdb/data"
+    version = get_latest_gtdb_version(host)
+
+    assert version == "220"
+    mock_requests_get.assert_called_once_with(
+        f"https://{host}/releases/latest/VERSION.txt"
+    )
+
+
+def test_get_latest_gtdb_version_fail(mock_requests_get):
+    # Mock a failed request
+    mock_requests_get.side_effect = requests.exceptions.RequestException(
+        "Error occurred"
+    )
+
+    host = "data.ace.uq.edu.au/public/gtdb/data"
+    with pytest.raises(
+        RuntimeError, match="Failed to fetch GTDB version: Error occurred"
+    ):
+        get_latest_gtdb_version(host)
+
+
+def test_get_gtdb_taxdump_release_url_success(mock_requests_get):
+    # Mock a response from GitHub API with a valid release
+    mock_response = mock.Mock()
+    mock_response.status_code = 200
+    mock_response.json.return_value = [
+        {
+            "name": "r220",
+            "assets": [
+                {
+                    "name": "gtdb-taxdump.tar.gz",
+                    "browser_download_url": "https://example.com/download",
+                }
+            ],
+        }
+    ]
+    mock_requests_get.return_value = mock_response
+
+    gtdb_version = "220"
+    url = get_gtdb_taxdump_release_url(gtdb_version)
+
+    assert url == "https://example.com/download"
+    mock_requests_get.assert_called_once_with(
+        "https://api.github.com/repos/shenwei356/gtdb-taxdump/releases"
+    )
+
+
+def test_get_gtdb_taxdump_release_url_not_found(mock_requests_get):
+    # Mock a response with no matching version
+    mock_response = mock.Mock()
+    mock_response.status_code = 200
+    mock_response.json.return_value = [
+        {"name": "r219", "assets": [{"name": "gtdb-taxdump.tar.gz"}]}
+    ]
+    mock_requests_get.return_value = mock_response
+
+    gtdb_version = "220"
+    url = get_gtdb_taxdump_release_url(gtdb_version)
+
+    assert url is None
+    mock_requests_get.assert_called_once_with(
+        "https://api.github.com/repos/shenwei356/gtdb-taxdump/releases"
+    )
+
+
+def test_download_gtdb_taxdump_file_exists():
+    # Mock the file already existing
+    with mock.patch("pathlib.Path.exists", return_value=True):
+        with mock.patch("requests.get") as mock_get:
+            download_gtdb_taxdump(
+                "220", "/some/dir/gtdb-taxdump-R220.tar.gz", force=False
+            )
+            mock_get.assert_not_called()
+
+
+def test_download_gtdb_taxdump_success(mock_requests_get):
+    # Mock a successful file download
+    mock_response = mock.Mock()
+    mock_response.status_code = 200
+    mock_response.headers = {"content-length": "1024"}
+    mock_response.iter_content = mock.Mock(return_value=[b"chunk1", b"chunk2"])
+    # Mock the JSON response for the releases
+    mock_response.json.return_value = [
+        {
+            "name": "r220",
+            "assets": [
+                {
+                    "name": "gtdb-taxdump.tar.gz",
+                    "browser_download_url": "https://example.com/download",
+                }
+            ],
+        }
+    ]
+    mock_requests_get.return_value = mock_response
+    with mock.patch("pathlib.Path.exists", return_value=False):
+        with mock.patch("builtins.open", mock.mock_open()) as mock_file:
+            with mock.patch("autometa.taxonomy.download_gtdb_files.tqdm") as mock_tqdm:
+                download_gtdb_taxdump(
+                    "220", "/some/dir/gtdb-taxdump-R220.tar.gz", force=False
+                )
+                mock_requests_get.assert_called()
+                mock_file.assert_called_once_with(
+                    "/some/dir/gtdb-taxdump-R220.tar.gz", "wb"
+                )
+                mock_tqdm.assert_called_once()
+
+
+def test_download_proteins_aa_reps_success(mock_requests_get):
+    # Mock successful file download for proteins_aa_reps
+    mock_response = mock.Mock()
+    mock_response.status_code = 200
+    mock_response.headers = {"content-length": "1024"}
+    mock_response.iter_content = mock.Mock(return_value=[b"chunk1", b"chunk2"])
+    # Configure the mock to support the context manager protocol
+    mock_response.__enter__ = mock.Mock(return_value=mock_response)
+    mock_response.__exit__ = mock.Mock(return_value=None)
+
+    # Mock MD5SUM response with expected checksum
+    md5sum_response = mock.Mock()
+    md5sum_response.status_code = 200
+    md5sum_response.text = "d41d8cd98f00b204e9800998ecf8427e genomic_files_reps/gtdb_proteins_aa_reps_r220.tar.gz"
+    with mock.patch("pathlib.Path.exists", return_value=False):
+        with mock.patch(
+            "requests.get", side_effect=[md5sum_response, mock_response]
+        ) as mock_get:
+            with mock.patch("builtins.open", mock.mock_open()) as mock_file:
+                with mock.patch(
+                    "autometa.taxonomy.download_gtdb_files.tqdm"
+                ) as mock_tqdm:
+                    download_proteins_aa_reps(
+                        "data.ace.uq.edu.au/public/gtdb/data",
+                        "220",
+                        "1",
+                        "/some/dir/gtdb_proteins_aa_reps-R220.tar.gz",
+                        force=False,
+                    )
+                    assert (
+                        mock_get.call_count == 2
+                    )  # One for MD5, one for the actual download
+                    mock_file.assert_called_once_with(
+                        "/some/dir/gtdb_proteins_aa_reps-R220.tar.gz", "wb"
+                    )
+                    mock_tqdm.assert_called_once()
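+
+
+# The test below is an illustrative addition (a sketch, not part of the original
+# suite): it exercises create_combined_gtdb_fasta end-to-end on a tiny, locally
+# built tarball and checks that each FASTA header gets its genome accession prefix.
+def test_create_combined_gtdb_fasta_prefixes_accessions(tmp_path):
+    import gzip
+    import tarfile
+
+    from autometa.taxonomy.download_gtdb_files import create_combined_gtdb_fasta
+
+    # Build a gzipped member FASTA named like a GTDB protein file
+    member = tmp_path / "GCA_000123456.1_protein.faa.gz"
+    with gzip.open(member, "wt") as fh:
+        fh.write(">seq1 hypothetical protein\nMKT\n")
+
+    # Wrap it in a tar.gz laid out like gtdb_proteins_aa_reps
+    tar_path = tmp_path / "gtdb_proteins_aa_reps.tar.gz"
+    with tarfile.open(tar_path, "w:gz") as tar:
+        tar.add(
+            member, arcname="protein_faa_reps/bacteria/GCA_000123456.1_protein.faa.gz"
+        )
+
+    outpath = tmp_path / "combined.faa.gz"
+    create_combined_gtdb_fasta(tar_file=str(tar_path), outpath=str(outpath))
+
+    with gzip.open(outpath, "rt") as fh:
+        lines = fh.read().splitlines()
+    assert lines[0] == ">GCA_000123456.1 seq1 hypothetical protein"
+    assert lines[1] == "MKT"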