Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions .github/workflows/pytest_codecov.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,19 +32,19 @@ jobs:
PYTHON: ${{ matrix.python-version }}
name: pytest & codecov
steps:
- uses: actions/checkout@v2
- uses: actions/checkout@v4
- name: Cache conda
uses: actions/cache@v2
uses: actions/cache@v4
with:
path: ~/conda_pkgs_dir
key: ${{ runner.os }}-conda-py${{ matrix.python-version }}-${{ hashFiles('tests/environment.yml') }}
- name: Cache test data
uses: actions/cache@v2
uses: actions/cache@v4
with:
path: tests/data/test_data.json
key: ${{ runner.os }}-test-data
- name: Setup mamba
uses: conda-incubator/setup-miniconda@v2
uses: conda-incubator/setup-miniconda@v3
with:
python-version: ${{ matrix.python-version }}
mamba-version: "*"
Expand All @@ -70,7 +70,7 @@ jobs:
shell: bash -l {0}
run: python -m pytest --cov-report=xml --cov=autometa tests/
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v2
uses: codecov/codecov-action@v5
with:
env_vars: OS,PYTHON
flags: unittests
Expand Down
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ clean:
find . -type d -name "Autometa.egg-info" -exec rm -r {} +
find . -type d -name "dist" -exec rm -r {} +
find . -type d -name "build" -exec rm -r {} +
find . -name ".nextflow.log.*" -exec rm -r {} +
find . -name ".nextflow.log" -exec rm {} +
find . -type d -name ".nextflow" -exec rm -r {} +
find . -type d -name "work" -exec rm -r {} +

## Apply black formatting
black:
Expand Down
114 changes: 77 additions & 37 deletions autometa/config/databases.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import logging
import os
from pathlib import Path
import requests
import sys
import subprocess
Expand All @@ -33,8 +34,15 @@
from autometa.config.utilities import DEFAULT_CONFIG
from autometa.config.utilities import AUTOMETA_DIR
from autometa.config.utilities import put_config, get_config
from autometa.taxonomy.gtdb import create_gtdb_db

from autometa.taxonomy.download_gtdb_files import (
create_combined_gtdb_fasta,
unpack_gtdb_taxdump,
)
from autometa.taxonomy.download_gtdb_files import (
download_gtdb_taxdump,
download_proteins_aa_reps,
get_latest_gtdb_version,
)

logger = logging.getLogger(__name__)
urllib_logger = logging.getLogger("urllib3")
Expand Down Expand Up @@ -404,29 +412,65 @@ def download_ncbi_files(self, options: Iterable) -> None:
if "nr" in options:
self.format_nr()

def download_gtdb_files(self) -> None:
    """Download the GTDB taxdump and protein representatives archives.

    The remote locations are read from the ``database_urls`` config section
    and the local destinations from the ``gtdb`` config section. Each file is
    retrieved with ``wget``; a missing destination directory is created first.

    Raises
    ------
    subprocess.CalledProcessError
        ``wget`` exited with a non-zero status (``check=True``).
    """
    # Remote sources
    taxdump_source = self.config.get("database_urls", "gtdb_taxdmp")
    aa_reps_source = self.config.get("database_urls", "proteins_aa_reps")

    # User-configured destinations
    taxdump_dest = self.config.get("gtdb", "gtdb_taxdmp")
    aa_reps_dest = self.config.get("gtdb", "proteins_aa_reps")

    downloads = (
        (taxdump_source, taxdump_dest),
        (aa_reps_source, aa_reps_dest),
    )

    logger.debug(f"starting GTDB databases download")
    for source, dest in downloads:
        wget_cmd = ["wget", source, "-O", dest]
        parent_dir = os.path.dirname(os.path.abspath(dest))
        if not os.path.exists(parent_dir):
            os.makedirs(parent_dir)
            logger.debug(f"Created missing database directory: {parent_dir}")
        logger.debug(" ".join(wget_cmd))
        # wget output is discarded; failures surface through check=True
        subprocess.run(
            wget_cmd,
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            check=True,
        )
def download_and_format_gtdb_files(self) -> dict:
    """Download the GTDB taxdump and protein representatives and format them.

    The host, release and local parent directories are read from the ``gtdb``
    config section. A release of ``latest`` is resolved with
    ``get_latest_gtdb_version`` and written back to the config. The taxdump
    archive is downloaded and unpacked, the protein representatives archive
    is downloaded, and a combined FASTA is written under the
    ``databases:gtdb`` directory.

    Returns
    -------
    dict
        ``taxdmp_dir``: result of ``unpack_gtdb_taxdump``,
        ``gtdb_aa_reps_path``: result of ``download_proteins_aa_reps``,
        ``combined_faa_path``: path passed to ``create_combined_gtdb_fasta``.

    Raises
    ------
    ValueError
        The release is older than 220, or its major part is not an integer.
    """
    # host serving the GTDB releases, e.g. data.ace.uq.edu.au/public/gtdb/data
    gtdb_taxdump_url = self.config.get("gtdb", "host")
    gtdb_version = self.config.get("gtdb", "release")  # e.g. latest, 220
    # local file parent directories
    gtdb_taxdmp_directory = self.config.get("gtdb", "gtdb_taxdmp")
    proteins_aa_reps_directory = self.config.get("gtdb", "proteins_aa_reps")
    # ensure the directories exist (exist_ok avoids a check-then-create race)
    for directory in (gtdb_taxdmp_directory, proteins_aa_reps_directory):
        directory = Path(directory)
        if not directory.exists():
            logger.info(f"Creating directory: {directory}")
        directory.mkdir(parents=True, exist_ok=True)

    if gtdb_version == "latest":
        gtdb_version = get_latest_gtdb_version(gtdb_taxdump_url)
        logger.info(f"Using 'latest' GTDB version: {gtdb_version}")
        self.config.set("gtdb", "release", gtdb_version)

    # Split "<version>.<subversion>" once, *before* rebinding gtdb_version.
    # Splitting the already-truncated version raised IndexError for "220.0".
    release_parts = gtdb_version.split(".")
    gtdb_version = release_parts[0]
    if len(release_parts) > 1 and release_parts[1]:
        gtdb_subversion = release_parts[1]
    else:
        gtdb_subversion = "0"
    if int(gtdb_version) < 220:
        raise ValueError("GTDB versions <220 cannot be used due file differences")

    gtdb_taxdmp_path = Path(
        gtdb_taxdmp_directory, f"gtdb-taxdump-version-{gtdb_version}.tar.gz"
    )
    aa_reps_path = Path(
        proteins_aa_reps_directory,
        f"gtdb_proteins_aa_reps-version-{gtdb_version}.{gtdb_subversion}.tar.gz",
    )
    gtdb_taxdmp_path = download_gtdb_taxdump(
        gtdb_version=gtdb_version, outpath=gtdb_taxdmp_path
    )
    taxdmp_dir = unpack_gtdb_taxdump(
        tar_file=gtdb_taxdmp_path, gtdb_version=gtdb_version
    )
    combined_faa_path = Path(
        self.config.get("databases", "gtdb"),
        f"autometa_formatted_gtdb-version-{gtdb_version}.{gtdb_subversion}.faa.gz",
    )
    aa_reps_path = download_proteins_aa_reps(
        host=gtdb_taxdump_url,
        version=gtdb_version,
        subversion=gtdb_subversion,
        outpath=aa_reps_path,
    )
    create_combined_gtdb_fasta(tar_file=aa_reps_path, outpath=combined_faa_path)
    return {
        "taxdmp_dir": taxdmp_dir,
        "gtdb_aa_reps_path": aa_reps_path,
        "combined_faa_path": combined_faa_path,
    }

def press_hmms(self) -> None:
"""hmmpress markers hmm database files.
Expand Down Expand Up @@ -809,19 +853,15 @@ def main():
elif args.update_ncbi:
section = "ncbi"
elif args.update_gtdb:
if not os.path.exists(
dbs.config.get("gtdb", "proteins_aa_reps")
) and not os.path.exists(dbs.config.get("gtdb", "gtdb_taxdmp")):
logger.info(f"GTDB database downloading: ")
dbs.download_gtdb_files()
# Format GTDB amino acid database
gtdb_combined = create_gtdb_db(
reps_faa=dbs.config.get("gtdb", "proteins_aa_reps"),
dbdir=dbs.config.get("databases", "gtdb"),
)
paths = dbs.download_and_format_gtdb_files()

database_path = str(paths.get("combined_faa_path")).replace(".faa.gz", ".dmnd")
if os.path.exists(database_path):
logger.info(f"GTDB DIAMOND database already exists: {database_path}")
sys.exit(0)
diamond.makedatabase(
fasta=gtdb_combined,
database=gtdb_combined.replace(".faa", ".dmnd"),
fasta=str(paths.get("combined_faa_path")),
database=str(paths.get("combined_faa_path")).replace(".faa.gz", ".dmnd"),
cpus=args.nproc,
)
sys.exit(0)
Expand Down
11 changes: 4 additions & 7 deletions autometa/config/default.config
Original file line number Diff line number Diff line change
Expand Up @@ -61,9 +61,6 @@ bacteria_single_copy = https://${markers:host}/KwanLab/Autometa/main/autometa/da
bacteria_single_copy_cutoffs = https://${markers:host}/KwanLab/Autometa/main/autometa/databases/markers/bacteria.single_copy.cutoffs
archaea_single_copy = https://${markers:host}/KwanLab/Autometa/main/autometa/databases/markers/archaea.single_copy.hmm
archaea_single_copy_cutoffs = https://${markers:host}/KwanLab/Autometa/main/autometa/databases/markers/archaea.single_copy.cutoffs
proteins_aa_reps = https://${gtdb:host}/releases/${gtdb:release}/genomic_files_reps/gtdb_proteins_aa_reps.tar.gz
gtdb_taxdmp = https://github.com/shenwei356/gtdb-taxdump/releases/latest/download/gtdb-taxdump.tar.gz


[checksums]
taxdump = ftp://${ncbi:host}/pub/taxonomy/taxdump.tar.gz.md5
Expand All @@ -85,10 +82,10 @@ accession2taxid = ${databases:ncbi}/prot.accession2taxid.gz
nr = ${databases:ncbi}/nr.gz

[gtdb]
host = data.gtdb.ecogenomic.org
release = latest
proteins_aa_reps = ${databases:gtdb}/gtdb_proteins_aa_reps.tar.gz
gtdb_taxdmp = ${databases:gtdb}/gtdb-taxdump.tar.gz
host = data.ace.uq.edu.au/public/gtdb/data
release = 220
proteins_aa_reps = ${databases:gtdb}
gtdb_taxdmp = ${databases:gtdb}

[markers]
host = raw.githubusercontent.com
Expand Down
Loading
Loading