diff --git a/.github/workflows/pytest_codecov.yml b/.github/workflows/pytest_codecov.yml index bf35616e4..25ac64acf 100644 --- a/.github/workflows/pytest_codecov.yml +++ b/.github/workflows/pytest_codecov.yml @@ -32,19 +32,19 @@ jobs: PYTHON: ${{ matrix.python-version }} name: pytest & codecov steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Cache conda - uses: actions/cache@v2 + uses: actions/cache@v4 with: path: ~/conda_pkgs_dir key: ${{ runner.os }}-conda-py${{ matrix.python-version }}-${{ hashFiles('tests/environment.yml') }} - name: Cache test data - uses: actions/cache@v2 + uses: actions/cache@v4 with: path: tests/data/test_data.json key: ${{ runner.os }}-test-data - name: Setup mamba - uses: conda-incubator/setup-miniconda@v2 + uses: conda-incubator/setup-miniconda@v3 with: python-version: ${{ matrix.python-version }} mamba-version: "*" @@ -70,7 +70,7 @@ jobs: shell: bash -l {0} run: python -m pytest --cov-report=xml --cov=autometa tests/ - name: Upload coverage to Codecov - uses: codecov/codecov-action@v2 + uses: codecov/codecov-action@v5 with: env_vars: OS,PYTHON flags: unittests diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md index f4fd052f1..20bcc4f64 100644 --- a/CODE_OF_CONDUCT.md +++ b/CODE_OF_CONDUCT.md @@ -1,111 +1,133 @@ -# Code of Conduct at nf-core (v1.0) -## Our Pledge - -In the interest of fostering an open, collaborative, and welcoming environment, we as contributors and maintainers of nf-core, pledge to making participation in our projects and community a harassment-free experience for everyone, regardless of: +# Contributor Covenant Code of Conduct -- Age -- Body size -- Familial status -- Gender identity and expression -- Geographical location -- Level of experience -- Nationality and national origins -- Native language -- Physical and neurological ability -- Race or ethnicity -- Religion -- Sexual identity and orientation -- Socioeconomic status - -Please note that the list above is alphabetised and is 
therefore not ranked in any order of preference or importance. +## Our Pledge -## Preamble +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, caste, color, religion, or sexual +identity and orientation. -> Note: This Code of Conduct (CoC) has been drafted by the nf-core Safety Officer and been edited after input from members of the nf-core team and others. "We", in this document, refers to the Safety Officer and members of the nf-core core team, both of whom are deemed to be members of the nf-core community and are therefore required to abide by this Code of Conduct. This document will amended periodically to keep it up-to-date, and in case of any dispute, the most current version will apply. +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. -An up-to-date list of members of the nf-core core team can be found [here](https://nf-co.re/about). Our current safety officer is Renuka Kudva. +## Our Standards -nf-core is a young and growing community that welcomes contributions from anyone with a shared vision for [Open Science Policies](https://www.fosteropenscience.eu/taxonomy/term/8). Open science policies encompass inclusive behaviours and we strive to build and maintain a safe and inclusive environment for all individuals. +Examples of behavior that contributes to a positive environment for our +community include: -We have therefore adopted this code of conduct (CoC), which we require all members of our community and attendees in nf-core events to adhere to in all our workspaces at all times. Workspaces include but are not limited to Slack, meetings on Zoom, Jitsi, YouTube live etc. 
+* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the overall + community -Our CoC will be strictly enforced and the nf-core team reserve the right to exclude participants who do not comply with our guidelines from our workspaces and future nf-core activities. +Examples of unacceptable behavior include: -We ask all members of our community to help maintain a supportive and productive workspace and to avoid behaviours that can make individuals feel unsafe or unwelcome. Please help us maintain and uphold this CoC. +* The use of sexualized language or imagery, and sexual attention or advances of + any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email address, + without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting -Questions, concerns or ideas on what we can include? Contact safety [at] nf-co [dot] re +## Enforcement Responsibilities -## Our Responsibilities +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. -The safety officer is responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behaviour. 
+Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. -The safety officer in consultation with the nf-core core team have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. +## Scope -Members of the core team or the safety officer who violate the CoC will be required to recuse themselves pending investigation. They will not have access to any reports of the violations and be subject to the same actions as others in violation of the CoC. +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official email address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. -## When are where does this Code of Conduct apply? +## Enforcement -Participation in the nf-core community is contingent on following these guidelines in all our workspaces and events. This includes but is not limited to the following listed alphabetically and therefore in no order of preference: +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +[INSERT CONTACT METHOD]. +All complaints will be reviewed and investigated promptly and fairly. -- Communicating with an official project email address. -- Communicating with community members within the nf-core Slack channel. 
-- Participating in hackathons organised by nf-core (both online and in-person events). -- Participating in collaborative work on GitHub, Google Suite, community calls, mentorship meetings, email correspondence. -- Participating in workshops, training, and seminar series organised by nf-core (both online and in-person events). This applies to events hosted on web-based platforms such as Zoom, Jitsi, YouTube live etc. -- Representing nf-core on social media. This includes both official and personal accounts. +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. -## nf-core cares 😊 +## Enforcement Guidelines -nf-core's CoC and expectations of respectful behaviours for all participants (including organisers and the nf-core team) include but are not limited to the following (listed in alphabetical order): +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: -- Ask for consent before sharing another community member’s personal information (including photographs) on social media. -- Be respectful of differing viewpoints and experiences. We are all here to learn from one another and a difference in opinion can present a good learning opportunity. -- Celebrate your accomplishments at events! (Get creative with your use of emojis šŸŽ‰ 🄳 šŸ’Æ šŸ™Œ !) -- Demonstrate empathy towards other community members. (We don’t all have the same amount of time to dedicate to nf-core. If tasks are pending, don’t hesitate to gently remind members of your team. If you are leading a task, ask for help if you feel overwhelmed.) -- Engage with and enquire after others. (This is especially important given the geographically remote nature of the nf-core community, so let’s do this the best we can) -- Focus on what is best for the team and the community. 
(When in doubt, ask) -- Graciously accept constructive criticism, yet be unafraid to question, deliberate, and learn. -- Introduce yourself to members of the community. (We’ve all been outsiders and we know that talking to strangers can be hard for some, but remember we’re interested in getting to know you and your visions for open science!) -- Show appreciation and **provide clear feedback**. (This is especially important because we don’t see each other in person and it can be harder to interpret subtleties. Also remember that not everyone understands a certain language to the same extent as you do, so **be clear in your communications to be kind.**) -- Take breaks when you feel like you need them. -- Using welcoming and inclusive language. (Participants are encouraged to display their chosen pronouns on Zoom or in communication on Slack.) +### 1. Correction -## nf-core frowns on šŸ˜• +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. -The following behaviours from any participants within the nf-core community (including the organisers) will be considered unacceptable under this code of conduct. Engaging or advocating for any of the following could result in expulsion from nf-core workspaces. +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. -- Deliberate intimidation, stalking or following and sustained disruption of communication among participants of the community. This includes hijacking shared screens through actions such as using the annotate tool in conferencing software such as Zoom. -- ā€œDoxingā€ i.e. posting (or threatening to post) another person’s personal identifying information online. -- Spamming or trolling of individuals on social media. 
-- Use of sexual or discriminatory imagery, comments, or jokes and unwelcome sexual attention. -- Verbal and text comments that reinforce social structures of domination related to gender, gender identity and expression, sexual orientation, ability, physical appearance, body size, race, age, religion or work experience. +### 2. Warning -### Online Trolling +**Community Impact**: A violation through a single incident or series of +actions. -The majority of nf-core interactions and events are held online. Unfortunately, holding events online comes with the added issue of online trolling. This is unacceptable, reports of such behaviour will be taken very seriously, and perpetrators will be excluded from activities immediately. +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or permanent +ban. -All community members are required to ask members of the group they are working within for explicit consent prior to taking screenshots of individuals during video calls. +### 3. Temporary Ban -## Procedures for Reporting CoC violations +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. -If someone makes you feel uncomfortable through their behaviours or actions, report it as soon as possible. +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. 
-You can reach out to members of the [nf-core core team](https://nf-co.re/about) and they will forward your concerns to the safety officer(s). +### 4. Permanent Ban -Issues directly concerning members of the core team will be dealt with by other members of the core team and the safety manager, and possible conflicts of interest will be taken into account. nf-core is also in discussions about having an ombudsperson, and details will be shared in due course. +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. -All reports will be handled with utmost discretion and confidentially. +**Consequence**: A permanent ban from any sort of public interaction within the +community. -## Attribution and Acknowledgements +## Attribution -- The [Contributor Covenant, version 1.4](http://contributor-covenant.org/version/1/4) -- The [OpenCon 2017 Code of Conduct](http://www.opencon2017.org/code_of_conduct) (CC BY 4.0 OpenCon organisers, SPARC and Right to Research Coalition) -- The [eLife innovation sprint 2020 Code of Conduct](https://sprint.elifesciences.org/code-of-conduct/) -- The [Mozilla Community Participation Guidelines v3.1](https://www.mozilla.org/en-US/about/governance/policies/participation/) (version 3.1, CC BY-SA 3.0 Mozilla) +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.1, available at +[https://www.contributor-covenant.org/version/2/1/code_of_conduct.html][v2.1]. -## Changelog +Community Impact Guidelines were inspired by +[Mozilla's code of conduct enforcement ladder][Mozilla CoC]. -### v1.0 - March 12th, 2021 +For answers to common questions about this code of conduct, see the FAQ at +[https://www.contributor-covenant.org/faq][FAQ]. Translations are available at +[https://www.contributor-covenant.org/translations][translations]. 
-- Complete rewrite from original [Contributor Covenant](http://contributor-covenant.org/) CoC. +[homepage]: https://www.contributor-covenant.org +[v2.1]: https://www.contributor-covenant.org/version/2/1/code_of_conduct.html +[Mozilla CoC]: https://github.com/mozilla/diversity +[FAQ]: https://www.contributor-covenant.org/faq +[translations]: https://www.contributor-covenant.org/translations diff --git a/Makefile b/Makefile index 71b0112ba..2a3b47556 100644 --- a/Makefile +++ b/Makefile @@ -28,6 +28,10 @@ clean: find . -type d -name "Autometa.egg-info" -exec rm -r {} + find . -type d -name "dist" -exec rm -r {} + find . -type d -name "build" -exec rm -r {} + + find . -name ".nextflow.log.*" -exec rm -r {} + + find . -name ".nextflow.log" -exec rm {} + + find . -type d -name ".nextflow" -exec rm -r {} + + find . -type d -name "work" -exec rm -r {} + ## Apply black formatting black: diff --git a/autometa/config/databases.py b/autometa/config/databases.py index 74b59e87c..14ea42c3e 100644 --- a/autometa/config/databases.py +++ b/autometa/config/databases.py @@ -9,6 +9,7 @@ import logging import os +from pathlib import Path import requests import sys import subprocess @@ -33,8 +34,15 @@ from autometa.config.utilities import DEFAULT_CONFIG from autometa.config.utilities import AUTOMETA_DIR from autometa.config.utilities import put_config, get_config -from autometa.taxonomy.gtdb import create_gtdb_db - +from autometa.taxonomy.download_gtdb_files import ( + create_combined_gtdb_fasta, + unpack_gtdb_taxdump, +) +from autometa.taxonomy.download_gtdb_files import ( + download_gtdb_taxdump, + download_proteins_aa_reps, + get_latest_gtdb_version, +) logger = logging.getLogger(__name__) urllib_logger = logging.getLogger("urllib3") @@ -404,29 +412,65 @@ def download_ncbi_files(self, options: Iterable) -> None: if "nr" in options: self.format_nr() - def download_gtdb_files(self) -> None: - gtdb_taxdump_url = self.config.get("database_urls", "gtdb_taxdmp") - proteins_aa_reps_url = 
self.config.get("database_urls", "proteins_aa_reps") - - # User path: - gtdb_taxdump_filepath = self.config.get("gtdb", "gtdb_taxdmp") - proteins_aa_reps_filepath = self.config.get("gtdb", "proteins_aa_reps") - - urls = [gtdb_taxdump_url, proteins_aa_reps_url] - filepaths = [gtdb_taxdump_filepath, proteins_aa_reps_filepath] - - logger.debug(f"starting GTDB databases download") - for url, filepath in zip(urls, filepaths): - cmd = ["wget", url, "-O", filepath] - full_path = os.path.abspath(filepath) - dir_path = os.path.dirname(full_path) - if not os.path.exists(dir_path): - os.makedirs(dir_path) - logger.debug(f"Created missing database directory: {dir_path}") - logger.debug(" ".join(cmd)) - subprocess.run( - cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True - ) + def download_and_format_gtdb_files(self) -> None: + + # urls + gtdb_taxdump_url = self.config.get( + "gtdb", "host" + ) # e.g. data.ace.uq.edu.au/public/gtdb/data + gtdb_version = self.config.get("gtdb", "release") # e.g. latest, 220 + # local file parent directories + gtdb_taxdmp_directory = self.config.get("gtdb", "gtdb_taxdmp") + proteins_aa_reps_directory = self.config.get("gtdb", "proteins_aa_reps") + # ensure the directories exist + if not Path(gtdb_taxdmp_directory).exists(): + logger.info(f"Creating directory: {gtdb_taxdmp_directory}") + Path(gtdb_taxdmp_directory).mkdir(parents=True) + if not Path(proteins_aa_reps_directory).exists(): + logger.info(f"Creating directory: {proteins_aa_reps_directory}") + Path(proteins_aa_reps_directory).mkdir(parents=True) + + if gtdb_version == "latest": + gtdb_version = get_latest_gtdb_version(gtdb_taxdump_url) + logger.info(f"Using 'latest' GTDB version: {gtdb_version}") + self.config.set("gtdb", "release", gtdb_version) + + if "." 
in gtdb_version: + gtdb_version = gtdb_version.split(".")[0] + gtdb_subversion = gtdb_version.split(".")[1] + else: + gtdb_subversion = "0" + if int(gtdb_version) < 220: + raise ValueError("GTDB versions <220 cannot be used due file differences") + gtdb_taxdmp_path = Path( + gtdb_taxdmp_directory, f"gtdb-taxdump-version-{gtdb_version}.tar.gz" + ) + aa_reps_path = Path( + proteins_aa_reps_directory, + f"gtdb_proteins_aa_reps-version-{gtdb_version}.{gtdb_subversion}.tar.gz", + ) + gtdb_taxdmp_path = download_gtdb_taxdump( + gtdb_version=gtdb_version, outpath=gtdb_taxdmp_path + ) + taxdmp_dir = unpack_gtdb_taxdump( + tar_file=gtdb_taxdmp_path, gtdb_version=gtdb_version + ) + combined_faa_path = Path( + self.config.get("databases", "gtdb"), + f"autometa_formatted_gtdb-version-{gtdb_version}.{gtdb_subversion}.faa.gz", + ) + aa_reps_path = download_proteins_aa_reps( + host=gtdb_taxdump_url, + version=gtdb_version, + subversion=gtdb_subversion, + outpath=aa_reps_path, + ) + create_combined_gtdb_fasta(tar_file=aa_reps_path, outpath=combined_faa_path) + return { + "taxdmp_dir": taxdmp_dir, + "gtdb_aa_reps_path": aa_reps_path, + "combined_faa_path": combined_faa_path, + } def press_hmms(self) -> None: """hmmpress markers hmm database files. 
@@ -809,19 +853,15 @@ def main(): elif args.update_ncbi: section = "ncbi" elif args.update_gtdb: - if not os.path.exists( - dbs.config.get("gtdb", "proteins_aa_reps") - ) and not os.path.exists(dbs.config.get("gtdb", "gtdb_taxdmp")): - logger.info(f"GTDB database downloading: ") - dbs.download_gtdb_files() - # Format GTDB amino acid database - gtdb_combined = create_gtdb_db( - reps_faa=dbs.config.get("gtdb", "proteins_aa_reps"), - dbdir=dbs.config.get("databases", "gtdb"), - ) + paths = dbs.download_and_format_gtdb_files() + + database_path = str(paths.get("combined_faa_path")).replace(".faa.gz", ".dmnd") + if os.path.exists(database_path): + logger.info(f"GTDB DIAMOND database already exists: {database_path}") + sys.exit(0) diamond.makedatabase( - fasta=gtdb_combined, - database=gtdb_combined.replace(".faa", ".dmnd"), + fasta=str(paths.get("combined_faa_path")), + database=str(paths.get("combined_faa_path")).replace(".faa.gz", ".dmnd"), cpus=args.nproc, ) sys.exit(0) diff --git a/autometa/config/default.config b/autometa/config/default.config index 60a474f64..02880fb4d 100644 --- a/autometa/config/default.config +++ b/autometa/config/default.config @@ -61,9 +61,6 @@ bacteria_single_copy = https://${markers:host}/KwanLab/Autometa/main/autometa/da bacteria_single_copy_cutoffs = https://${markers:host}/KwanLab/Autometa/main/autometa/databases/markers/bacteria.single_copy.cutoffs archaea_single_copy = https://${markers:host}/KwanLab/Autometa/main/autometa/databases/markers/archaea.single_copy.hmm archaea_single_copy_cutoffs = https://${markers:host}/KwanLab/Autometa/main/autometa/databases/markers/archaea.single_copy.cutoffs -proteins_aa_reps = https://${gtdb:host}/releases/${gtdb:release}/genomic_files_reps/gtdb_proteins_aa_reps.tar.gz -gtdb_taxdmp = https://github.com/shenwei356/gtdb-taxdump/releases/latest/download/gtdb-taxdump.tar.gz - [checksums] taxdump = ftp://${ncbi:host}/pub/taxonomy/taxdump.tar.gz.md5 @@ -85,10 +82,10 @@ accession2taxid = 
${databases:ncbi}/prot.accession2taxid.gz nr = ${databases:ncbi}/nr.gz [gtdb] -host = data.gtdb.ecogenomic.org -release = latest -proteins_aa_reps = ${databases:gtdb}/gtdb_proteins_aa_reps.tar.gz -gtdb_taxdmp = ${databases:gtdb}/gtdb-taxdump.tar.gz +host = data.ace.uq.edu.au/public/gtdb/data +release = 220 +proteins_aa_reps = ${databases:gtdb} +gtdb_taxdmp = ${databases:gtdb} [markers] host = raw.githubusercontent.com diff --git a/autometa/taxonomy/download_gtdb_files.py b/autometa/taxonomy/download_gtdb_files.py new file mode 100644 index 000000000..ceabc7225 --- /dev/null +++ b/autometa/taxonomy/download_gtdb_files.py @@ -0,0 +1,314 @@ +import requests +import logging +import math +from pathlib import Path +import gzip +import hashlib +import tarfile +import re + +from tqdm import tqdm + + +# Set up logger +logger = logging.getLogger(__name__) + +# --------------------- MD5 Checksum Calculation --------------------- +def calculate_md5(filepath, chunk_size=1024 * 1024): + """Calculates the MD5 checksum of a file""" + md5 = hashlib.md5() + with open(filepath, "rb") as f: + while chunk := f.read(chunk_size): + md5.update(chunk) + return md5.hexdigest() + + +# --------------------- GTDB Version Handling --------------------- +def get_latest_gtdb_version(host): + """Fetches the latest GTDB version number from the GTDB server""" + try: + response = requests.get(f"https://{host}/releases/latest/VERSION.txt") + response.raise_for_status() + version = response.text.splitlines()[0] + version = version[1:] if version.startswith("v") else version + return version + except requests.exceptions.RequestException as e: + raise RuntimeError(f"Failed to fetch GTDB version: {e}") + + +# --------------------- Taxdump Release URL Fetch --------------------- +def get_gtdb_taxdump_release_url(gtdb_version): + """Finds the download URL for the GTDB taxdump file for a specific GTDB release""" + if gtdb_version == "latest": + raise ValueError( + "Latest version not supported. 
Please specify a version number." + ) + try: + releases_url = "https://api.github.com/repos/shenwei356/gtdb-taxdump/releases" + response = requests.get(releases_url) + response.raise_for_status() + releases = response.json() + version_used = str(math.floor(float(gtdb_version))) + + for release in releases: + if f"r{version_used}" in release["name"]: + for asset in release["assets"]: + if "gtdb-taxdump.tar.gz" in asset["name"]: + download_url = asset["browser_download_url"] + logger.info(f"Download URL found: {download_url}") + return download_url + logger.error(f"Version R{gtdb_version} not found.") + return None + except requests.exceptions.RequestException as e: + logger.error(f"Failed to fetch releases: {e}") + return None + + +# --------------------- GTDB Taxdump Download --------------------- +def download_gtdb_taxdump(gtdb_version, outpath, force=False): + """Downloads the GTDB taxdump file for a specific GTDB release""" + if not force and Path(outpath).exists(): + logger.info(f"File already exists: {outpath}") + return outpath + try: + download_url = get_gtdb_taxdump_release_url(gtdb_version) + if download_url: + response = requests.get(download_url, stream=True) + response.raise_for_status() + total_size = int(response.headers.get("content-length", 0)) + chunk_size = 1024 + with tqdm( + total=total_size, unit="B", unit_scale=True, desc=str(outpath) + ) as pbar: + with open(outpath, "wb") as f: + for chunk in response.iter_content(chunk_size=chunk_size): + if chunk: + f.write(chunk) + pbar.update(len(chunk)) + logger.info(f"Download complete. 
File saved to: {outpath}") + else: + logger.error("Download URL was not found.") + except requests.exceptions.RequestException as e: + logger.error(f"Failed to download GTDB taxdump: {e}") + except IOError as e: + logger.error(f"File write error: {e}") + return outpath + + +def unpack_gtdb_taxdump(tar_file, gtdb_version, outdir=None, force=False): + """Extracts the GTDB taxdump file and renames the directory to include the GTDB version""" + if not outdir: + outdir = Path(tar_file).parent + target_dir = f"gtdb-taxdump/R{gtdb_version}" + new_dir_prefix = f"gtdb_taxdump-version-{gtdb_version}" + if not force and Path(outdir, new_dir_prefix).exists(): + logger.info( + f"Directory already exists: {outdir}/{new_dir_prefix}, use --force to overwrite." + ) + return Path(outdir, new_dir_prefix) + with tarfile.open(tar_file, "r:gz") as tar: + members = [] + for member in tar.getmembers(): + if member.name.startswith(target_dir): + # Adjust the path to rename the folder on extraction + member.name = member.name.replace(target_dir, new_dir_prefix, 1) + members.append(member) + if members: + tar.extractall(outdir, members=members) + print(f"Extracted and renamed {target_dir} to {new_dir_prefix} in {outdir}") + else: + print(f"Directory {target_dir} not found in the archive.") + return Path(outdir, new_dir_prefix) + + +# --------------------- Proteins AA Reps Download with MD5 Verification --------------------- +def download_proteins_aa_reps(host, version, subversion, outpath, force=False): + """Downloads the GTDB proteins_aa_reps tarball for a specific GTDB release, with MD5 checksum verification""" + if not force and Path(outpath).exists(): + logger.info(f"File already exists: {outpath}") + return + if version == "latest": + try: + version = get_latest_gtdb_version(host) + except requests.exceptions.RequestException as e: + logger.error(f"Failed to fetch GTDB version number: {e}") + raise + logger.info(f"Downloading gtdb_proteins_aa_reps.tar.gz, version {version}") + try: + 
md5sum_url = f"https://{host}/releases/release{version}/{version}.{subversion}/MD5SUM.txt" + response = requests.get(md5sum_url) + response.raise_for_status() + md5sum_lines = response.text.splitlines() + expected_md5 = None + filename = f"genomic_files_reps/gtdb_proteins_aa_reps_r{version}.tar.gz" + for line in md5sum_lines: + if filename in line: + expected_md5 = line.split()[0] + break + if not expected_md5: + logger.error( + f"MD5 checksum for version {version} not found in {md5sum_url}." + ) + return + except requests.exceptions.RequestException as e: + logger.error(f"Failed to fetch MD5SUM.txt: {e}") + return + url = f"https://{host}/releases/release{version}/{version}.{subversion}/genomic_files_reps/gtdb_proteins_aa_reps_r{version}.tar.gz" + try: + with requests.get(url, stream=True) as r: + r.raise_for_status() + logger.info(f"Downloading from {url}") + total_size = int(r.headers.get("content-length", 0)) + chunk_size = 1024 * 1024 + md5 = hashlib.md5() + + with tqdm( + total=total_size, unit="B", unit_scale=True, desc=str(outpath) + ) as pbar: + with open(outpath, "wb") as f: + for chunk in r.iter_content(chunk_size=chunk_size): + if chunk: + f.write(chunk) + pbar.update(len(chunk)) + md5.update(chunk) + calculated_md5 = md5.hexdigest() + if calculated_md5 == expected_md5: + logger.info(f"MD5 checksum verification passed for {outpath}.") + else: + logger.error( + f"MD5 checksum verification failed for {outpath}. Expected {expected_md5}, got {calculated_md5}." + ) + except requests.exceptions.RequestException as e: + logger.error(f"Failed to download the file: {e}") + except IOError as e: + logger.error(f"File write error: {e}") + return outpath + + +# --------------------- Combined GTDB FASTA Creation --------------------- +def create_combined_gtdb_fasta(tar_file: str, outpath: str, force=False): + """ + Generate a combined faa file to create the GTDB-t database. 
+ + Parameters + ---------- + tar_file : str + The downloaded gtdb_proteins_aa_reps_*.tar.gz file. + outpath : str + Path to the combined FASTA file to be written. + force : bool, optional + If True, overwrite the output file if it already exists, by default False + + Returns + ------- + str + Path to combined faa file. This can be used to make a diamond database. + """ + # Check if the output file already exists + if not force and Path(outpath).exists(): + logger.info(f"File already exists: {outpath}") + return outpath + # Open the combined output file + with gzip.open(outpath, "wt") as f_out: + # Open the tar.gz archive + with tarfile.open(tar_file, "r:gz") as tar: + # Initialize tqdm progress bars + with tqdm( + desc="Files read", unit="file", position=0, leave=True + ) as file_pbar, tqdm( + desc="Sequences written", unit="seq", position=1, leave=True + ) as seq_pbar: + # Iterate over members in the tar file + for member in tar: + # Check if the member is a file and ends with .faa.gz + if member.isfile() and member.name.endswith(".faa.gz"): + # Search for genome accession in the file name + genome_acc_search = re.search( + r"(GCA_\d+\.\d+|GCF_\d+\.\d+)", member.name + ) + if genome_acc_search: + genome_acc = genome_acc_search.group() + else: + raise ValueError( + f"Could not find genome accession for {member.name}" + ) + # Extract and read the content of the .faa.gz file + with tar.extractfile(member) as f_in: + with gzip.GzipFile(fileobj=f_in) as gz_in: + seq_count = ( + 0 # Initialize sequence counter for the file + ) + for line in gz_in: + line = line.decode("utf-8") + if line.startswith(">"): + seqheader = line.lstrip(">").strip() + outline = f">{genome_acc} {seqheader}\n" + seq_pbar.update(seq_count) + else: + outline = line + f_out.write(outline) + seq_count += 1 # Increment sequence count + file_pbar.update( + 1 + ) # Update file progress bar after processing each file + logger.debug(f"Combined GTDB faa file written to {outpath}") + return outpath + + 
+def download_and_format(gtdb_host, gtdb_version, single_dir, force=False):
+    """
+    Download and format GTDB and NCBI files.
+
+    Parameters
+    ----------
+    gtdb_host : str
+        The GTDB host to download files from.
+    gtdb_version : str
+        The GTDB version to download.
+    single_dir : str
+        The single directory to download and format files into.
+    force : bool, optional
+        If True, re-download and re-format files even if they already exist, by default False
+    """
+    if gtdb_version == "latest":
+        gtdb_version = get_latest_gtdb_version(gtdb_host)
+        logger.info(f"Using 'latest' GTDB version: {gtdb_version}")
+
+    if "." in gtdb_version:
+        # Extract the subversion BEFORE truncating gtdb_version; splitting the
+        # already-truncated string would raise IndexError.
+        gtdb_subversion = gtdb_version.split(".")[1]
+        gtdb_version = gtdb_version.split(".")[0]
+    else:
+        gtdb_subversion = "0"
+    gtdb_taxdmp_path = Path(single_dir, f"gtdb-taxdump-version-{gtdb_version}.tar.gz")
+    # have to rename because GTDB file doesn't have subversion in the name
+    aa_reps_path = Path(
+        single_dir,
+        f"gtdb_proteins_aa_reps-version-{gtdb_version}.{gtdb_subversion}.tar.gz",
+    )
+    gtdb_taxdmp_path = download_gtdb_taxdump(
+        gtdb_version=gtdb_version, outpath=gtdb_taxdmp_path, force=force
+    )
+    taxdmp_dir = unpack_gtdb_taxdump(
+        tar_file=gtdb_taxdmp_path, gtdb_version=gtdb_version, force=force
+    )
+    aa_reps_path = download_proteins_aa_reps(
+        host=gtdb_host,
+        version=gtdb_version,
+        subversion=gtdb_subversion,
+        outpath=aa_reps_path,
+        force=force,
+    )
+    combined_gtdb_fasta = create_combined_gtdb_fasta(
+        tar_file=aa_reps_path,
+        outpath=Path(
+            single_dir,
+            f"autometa_formatted_gtdb-version-{gtdb_version}.{gtdb_subversion}.faa.gz",
+        ),
+        force=force,
+    )
+    return {
+        "gtdb_taxdmp_path": gtdb_taxdmp_path,
+        "taxdmp_dir": taxdmp_dir,
+        "aa_reps_path": aa_reps_path,
+        "combined_gtdb_fasta": combined_gtdb_fasta,
+    }
diff --git a/autometa/taxonomy/gtdb.py b/autometa/taxonomy/gtdb.py
index 36590b9b0..6af8fd92e 100644
--- a/autometa/taxonomy/gtdb.py
+++ b/autometa/taxonomy/gtdb.py
@@ -9,14 +9,12 @@
 import gzip
 import logging
 import os
-import re
-import 
tarfile -import glob from typing import Dict, Set, Tuple from itertools import chain from tqdm import tqdm from typing import Dict +from configparser import ConfigParser import pandas as pd import multiprocessing as mp @@ -24,91 +22,47 @@ from autometa.common.utilities import file_length, is_gz_file from autometa.common.external import diamond from autometa.taxonomy.database import TaxonomyDatabase - +from autometa.taxonomy.download_gtdb_files import ( + create_combined_gtdb_fasta, + get_latest_gtdb_version, +) +from autometa.config.utilities import DEFAULT_CONFIG logger = logging.getLogger(__name__) -def create_gtdb_db(reps_faa: str, dbdir: str) -> str: - """ - Generate a combined faa file to create the GTDB-t database. - - Parameters - ---------- - reps_faa : str - Directory having faa file of all representative genomes. Can be tarballed. - dbdir : str - Path to output directory. - - Returns - ------- - str - Path to combined faa file. This can be used to make a diamond database. - """ - - if reps_faa.endswith(".tar.gz"): - logger.debug( - f"Extracting tarball containing GTDB ref genome animo acid data sequences to: {dbdir}/protein_faa_reps" - ) - tar = tarfile.open(reps_faa) - tar.extractall(path=dbdir) - tar.close() - logger.debug("Extraction done.") - reps_faa = dbdir - - genome_protein_faa_filepaths = glob.glob( - os.path.join(reps_faa, "**", "*_protein.faa*"), - recursive=True - # To find *_protein.faa and *_protein.faa.gz files - ) - - faa_index: Dict[str, str] = {} - for genome_protein_faa_filepath in genome_protein_faa_filepaths: - # Regex to get the genome accession from the file path - genome_acc_search = re.search( - r"GCA_\d+.\d?|GCF_\d+.\d?", genome_protein_faa_filepath - ) - if genome_acc_search: - faa_index[genome_protein_faa_filepath] = genome_acc_search.group() - - # Create dbdir if it doesn't exist - if not os.path.isdir(dbdir): - os.makedirs(dbdir) - - logger.debug(f"Merging {len(faa_index):,} faa files.") - combined_faa = os.path.join(dbdir, 
"gtdb.faa") - with open(combined_faa, "w") as f_out: - for faa_file, acc in faa_index.items(): - with gzip.open(faa_file, "rb") as f_in: - for line in f_in: - line = line.decode("utf-8") - if line.startswith(">"): - seqheader = line.lstrip(">").strip() - outline = f">{acc} {seqheader}\n" - else: - outline = line - f_out.write(outline) - logger.debug(f"Combined GTDB faa file written to {combined_faa}") - return combined_faa - - class GTDB(TaxonomyDatabase): """Taxonomy utilities for GTDB databases.""" - def __init__(self, dbdir: str, verbose: bool = True): + def __init__(self, dbdir: str, verbose: bool = True, config=DEFAULT_CONFIG): """ Instantiates the GTDB class - """ + if not isinstance(config, ConfigParser): + raise TypeError(f"config is not ConfigParser : {type(config)}") + self.config = config + # before instantiating the class, check if the GTDB database is present + gtdb_version = self.config.get("gtdb", "release") + if gtdb_version == "latest": + gtdb_version = get_latest_gtdb_version() + logger.info(f"Using 'latest' GTDB version: {gtdb_version}") + if "." 
in gtdb_version:
+            # Extract the subversion BEFORE truncating gtdb_version; splitting
+            # the already-truncated string would raise IndexError.
+            gtdb_subversion = gtdb_version.split(".")[1]
+            gtdb_version = gtdb_version.split(".")[0]
+        else:
+            gtdb_subversion = "0"
         self.dbdir = dbdir
         self.verbose = verbose
         self.disable = not self.verbose
-        self.dmnd_db = os.path.join(self.dbdir, "gtdb.dmnd")
-        self.accession2taxid = os.path.join(self.dbdir, "taxid.map")
-        self.nodes_fpath = os.path.join(self.dbdir, "nodes.dmp")
-        self.names_fpath = os.path.join(self.dbdir, "names.dmp")
-        self.merged_fpath = os.path.join(self.dbdir, "merged.dmp")
-        self.delnodes_fpath = os.path.join(self.dbdir, "delnodes.dmp")
+        # NOTE(review): this attribute must point at the formatted diamond
+        # database (.dmnd), not the gzipped FASTA it is built from.
+        self.dmnd_db = os.path.join(
+            self.config.get("databases", "gtdb"),
+            f"autometa_formatted_gtdb-version-{gtdb_version}.{gtdb_subversion}.dmnd",
+        )
+        self.accession2taxid = os.path.join(dbdir, "taxid.map")
+        self.nodes_fpath = os.path.join(dbdir, "nodes.dmp")
+        self.names_fpath = os.path.join(dbdir, "names.dmp")
+        self.merged_fpath = os.path.join(dbdir, "merged.dmp")
+        self.delnodes_fpath = os.path.join(dbdir, "delnodes.dmp")
         self.verify_databases()
         self.names = self.parse_names()
         self.nodes = self.parse_nodes()
@@ -341,7 +295,7 @@ def main():
     parser.add_argument(
         "--reps-faa",
-        help="Path to directory containing GTDB ref genome animo acid data sequences. Can be tarballed.",
+        help="Path to directory containing the tarballed GTDB ref genome amino acid data sequences.",
         required=True,
     )
     parser.add_argument(
@@ -355,7 +309,8 @@
     args = parser.parse_args()
 
-    gtdb_combined = create_gtdb_db(reps_faa=args.reps_faa, dbdir=args.dbdir)
+    gtdb_combined = create_combined_gtdb_fasta(
+        tar_file=args.reps_faa,
+        outpath=os.path.join(args.dbdir, "gtdb.faa.gz"),
+    )
+
     diamond.makedatabase(
         fasta=gtdb_combined,
-        database=gtdb_combined.replace(".faa", ".dmnd"),
+        database=gtdb_combined.replace(".faa.gz", ".dmnd"),
diff --git a/docs/source/databases.rst b/docs/source/databases.rst
index 6850d7550..68a0aee16 100644
--- a/docs/source/databases.rst
+++ b/docs/source/databases.rst
@@ -13,17 +13,17 @@ Markers
 #######
 
 .. 
code-block:: bash - + # Point Autometa to where you would like your markers database directory autometa-config \ --section databases --option markers \ --value - + # Update your markers database directory autometa-update-databases --update-markers .. alert:: - + Do NOT use a trailing slash, e.g. NO ``/`` for the database directory paths! Links to these markers files and their associated cutoff values are below: @@ -68,7 +68,7 @@ Genome Taxonomy Database (GTDB) ############################### If you would like to incorporate the benefits of using the Genome Taxonomy Database, -you can either run the following script or manually download the respective databases. +you can either run the following script or manually download the respective databases. GTDB version 220 or later is required. .. code-block:: bash @@ -81,7 +81,7 @@ you can either run the following script or manually download the respective data autometa-config \ --section gtdb --option release \ --value latest - # Or --value r207 or --value r202, etc. + # Or a version number like `--value 220`, or `--value 220.0`, etc. # Download and format the configured GTDB databases release autometa-update-databases --update-gtdb @@ -93,28 +93,21 @@ you can either run the following script or manually download the respective data See ``autometa-update-databases -h`` and ``autometa-config -h`` for full list of options. -The previous command will download the following GTDB databases and format the `gtdb_proteins_aa_reps.tar.gz` to generate `gtdb.dmnd` to be used by Autometa: +The previous command will download the following GTDB databases and format them for use by Autometa. The filenames will be modified to include the release version number for reproducibility. 
+ +The original files - Amino acid sequences of representative genome - - `gtdb_proteins_aa_reps.tar.gz `_ + - `gtdb_proteins_aa_reps.tar.gz `_ - gtdb-taxdump.tar.gz from `shenwei356/gtdb-taxdump `_ - `gtdb-taxdump.tar.gz `_ -Once unzipped `gtdb-taxdump.tar.gz` will have the taxdump files of all the respective GTDB releases. -Make sure that the release you use is in line with the `gtdb_proteins_aa_reps.tar.gz` release version. -It's better to always use the latest version. - -All the taxonomy files for a specific taxonomy database should be in a single directory. -You can now copy the taxdump files of the desired release version in the sample directory as `gtdb.dmnd` +The initial download and formatting of the GTDB databases can take some time. The GTDB databases are large, and downloading/formatting requires ~283 GB of hard disk space. -Alternatively if you have manually downloaded `gtdb_proteins_aa_reps.tar.gz` and `gtdb-taxdump.tar.gz` you can run the -following command to format the `gtdb_proteins_aa_reps.tar.gz` to generate `gtdb.dmnd` and make it ready for Autometa. - -.. code-block:: bash - - autometa-setup-gtdb --reps-faa --dbdir --cpus 20 - -.. note:: +For version 220, the file sizes are approximately: - Again Make sure that the formatted `gtdb_proteins_aa_reps.tar.gz` database and gtdb taxdump files are in the same directory. 
+- 77 MB gtdb-taxdump-version-220.tar.gz +- 67 GB gtdb_proteins_aa_reps-version-220.tar.gz +- 149 GB autometa_formatted_gtdb-version-220.0.dmnd +- 103 MB ./gtdb_taxdump-version-220/ diff --git a/tests/unit_tests/test_gtdb_download.py b/tests/unit_tests/test_gtdb_download.py new file mode 100644 index 000000000..9f9a9c08d --- /dev/null +++ b/tests/unit_tests/test_gtdb_download.py @@ -0,0 +1,173 @@ +import pytest +from unittest import mock +import requests +import logging + +from autometa.taxonomy.download_gtdb_files import ( + get_latest_gtdb_version, + get_gtdb_taxdump_release_url, + download_gtdb_taxdump, + download_proteins_aa_reps, +) + +# Mock logger to avoid unnecessary logging during tests +logger = logging.getLogger(__name__) + + +@pytest.fixture +def mock_requests_get(): + with mock.patch("requests.get") as mock_get: + yield mock_get + + +def test_get_latest_gtdb_version_success(mock_requests_get): + # Mock a successful request response + mock_response = mock.Mock() + mock_response.status_code = 200 + mock_response.text = "v220\n" + mock_requests_get.return_value = mock_response + + host = "data.ace.uq.edu.au/public/gtdb/data" + version = get_latest_gtdb_version(host) + + assert version == "220" + mock_requests_get.assert_called_once_with( + f"https://{host}/releases/latest/VERSION.txt" + ) + + +def test_get_latest_gtdb_version_fail(mock_requests_get): + # Mock a failed request + mock_requests_get.side_effect = requests.exceptions.RequestException( + "Error occurred" + ) + + host = "data.ace.uq.edu.au/public/gtdb/data" + with pytest.raises( + RuntimeError, match="Failed to fetch GTDB version: Error occurred" + ): + get_latest_gtdb_version(host) + + +def test_get_gtdb_taxdump_release_url_success(mock_requests_get): + # Mock a response from GitHub API with a valid release + mock_response = mock.Mock() + mock_response.status_code = 200 + mock_response.json.return_value = [ + { + "name": "r220", + "assets": [ + { + "name": "gtdb-taxdump.tar.gz", + 
"browser_download_url": "https://example.com/download", + } + ], + } + ] + mock_requests_get.return_value = mock_response + + gtdb_version = "220" + url = get_gtdb_taxdump_release_url(gtdb_version) + + assert url == "https://example.com/download" + mock_requests_get.assert_called_once_with( + "https://api.github.com/repos/shenwei356/gtdb-taxdump/releases" + ) + + +def test_get_gtdb_taxdump_release_url_not_found(mock_requests_get): + # Mock a response with no matching version + mock_response = mock.Mock() + mock_response.status_code = 200 + mock_response.json.return_value = [ + {"name": "r219", "assets": [{"name": "gtdb-taxdump.tar.gz"}]} + ] + mock_requests_get.return_value = mock_response + + gtdb_version = "220" + url = get_gtdb_taxdump_release_url(gtdb_version) + + assert url is None + mock_requests_get.assert_called_once_with( + "https://api.github.com/repos/shenwei356/gtdb-taxdump/releases" + ) + + +def test_download_gtdb_taxdump_file_exists(): + # Mock the file already existing + with mock.patch("pathlib.Path.exists", return_value=True): + with mock.patch("requests.get") as mock_get: + download_gtdb_taxdump( + "220", "/some/dir/gtdb-taxdump-R220.tar.gz", force=False + ) + mock_get.assert_not_called() + + +def test_download_gtdb_taxdump_success(mock_requests_get): + # Mock a successful file download + mock_response = mock.Mock() + mock_response.status_code = 200 + mock_response.headers = {"content-length": "1024"} + mock_response.iter_content = mock.Mock(return_value=[b"chunk1", b"chunk2"]) + # Mock the JSON response for the releases + mock_response.json.return_value = [ + { + "name": "r220", + "assets": [ + { + "name": "gtdb-taxdump.tar.gz", + "browser_download_url": "https://example.com/download", + } + ], + } + ] + mock_requests_get.return_value = mock_response + with mock.patch("pathlib.Path.exists", return_value=False): + with mock.patch("builtins.open", mock.mock_open()) as mock_file: + with mock.patch("autometa.taxonomy.download_gtdb_files.tqdm") as 
mock_tqdm: + download_gtdb_taxdump( + "220", "/some/dir/gtdb-taxdump-R220.tar.gz", force=False + ) + mock_requests_get.assert_called() + mock_file.assert_called_once_with( + "/some/dir/gtdb-taxdump-R220.tar.gz", "wb" + ) + mock_tqdm.assert_called_once() + + +def test_download_proteins_aa_reps_success(mock_requests_get): + # Mock successful file download for proteins_aa_reps + mock_response = mock.Mock() + mock_response.status_code = 200 + mock_response.headers = {"content-length": "1024"} + mock_response.iter_content = mock.Mock(return_value=[b"chunk1", b"chunk2"]) + # Configure the mock to support the context manager protocol + mock_response.__enter__ = mock.Mock(return_value=mock_response) + mock_response.__exit__ = mock.Mock(return_value=None) + + # Mock MD5SUM response with expected checksum + md5sum_response = mock.Mock() + md5sum_response.status_code = 200 + md5sum_response.text = "d41d8cd98f00b204e9800998ecf8427e genomic_files_reps/gtdb_proteins_aa_reps_r220.tar.gz" + with mock.patch("pathlib.Path.exists", return_value=False): + with mock.patch( + "requests.get", side_effect=[md5sum_response, mock_response] + ) as mock_get: + with mock.patch("builtins.open", mock.mock_open()) as mock_file: + with mock.patch( + "autometa.taxonomy.download_gtdb_files.tqdm" + ) as mock_tqdm: + download_proteins_aa_reps( + "data.ace.uq.edu.au/public/gtdb/data", + "220", + "1", + "/some/dir/gtdb_proteins_aa_reps-R220.tar.gz", + force=False, + ) + assert ( + mock_get.call_count == 2 + ) # One for MD5, one for the actual download + mock_file.assert_called_once_with( + "/some/dir/gtdb_proteins_aa_reps-R220.tar.gz", "wb" + ) + mock_tqdm.assert_called_once()