diff --git a/.github/workflows/pytest_codecov.yml b/.github/workflows/pytest_codecov.yml
index bf35616e4..25ac64acf 100644
--- a/.github/workflows/pytest_codecov.yml
+++ b/.github/workflows/pytest_codecov.yml
@@ -32,19 +32,19 @@ jobs:
       PYTHON: ${{ matrix.python-version }}
     name: pytest & codecov
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
       - name: Cache conda
-        uses: actions/cache@v2
+        uses: actions/cache@v4
        with:
          path: ~/conda_pkgs_dir
          key: ${{ runner.os }}-conda-py${{ matrix.python-version }}-${{ hashFiles('tests/environment.yml') }}
      - name: Cache test data
-        uses: actions/cache@v2
+        uses: actions/cache@v4
        with:
          path: tests/data/test_data.json
          key: ${{ runner.os }}-test-data
      - name: Setup mamba
-        uses: conda-incubator/setup-miniconda@v2
+        uses: conda-incubator/setup-miniconda@v3
        with:
          python-version: ${{ matrix.python-version }}
          mamba-version: "*"
@@ -70,7 +70,7 @@ jobs:
        shell: bash -l {0}
        run: python -m pytest --cov-report=xml --cov=autometa tests/
      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v2
+        uses: codecov/codecov-action@v5
        with:
          env_vars: OS,PYTHON
          flags: unittests
diff --git a/Makefile b/Makefile
index 71b0112ba..2a3b47556 100644
--- a/Makefile
+++ b/Makefile
@@ -28,6 +28,10 @@ clean:
	find . -type d -name "Autometa.egg-info" -exec rm -r {} +
	find . -type d -name "dist" -exec rm -r {} +
	find . -type d -name "build" -exec rm -r {} +
+	find . -name ".nextflow.log.*" -exec rm {} +
+	find . -name ".nextflow.log" -exec rm {} +
+	find . -type d -name ".nextflow" -exec rm -r {} +
+	find . -type d -name "work" -exec rm -r {} +

 ## Apply black formatting
 black:
diff --git a/autometa/config/databases.py b/autometa/config/databases.py
index 74b59e87c..14ea42c3e 100644
--- a/autometa/config/databases.py
+++ b/autometa/config/databases.py
@@ -9,6 +9,7 @@

 import logging
 import os
+from pathlib import Path
 import requests
 import sys
 import subprocess
@@ -33,8 +34,15 @@
 from autometa.config.utilities import DEFAULT_CONFIG
 from autometa.config.utilities import AUTOMETA_DIR
 from autometa.config.utilities import put_config, get_config
-from autometa.taxonomy.gtdb import create_gtdb_db
-
+from autometa.taxonomy.download_gtdb_files import (
+    create_combined_gtdb_fasta,
+    download_gtdb_taxdump,
+    download_proteins_aa_reps,
+    get_latest_gtdb_version,
+    unpack_gtdb_taxdump,
+)
+
 logger = logging.getLogger(__name__)
 urllib_logger = logging.getLogger("urllib3")
@@ -404,29 +412,65 @@ def download_ncbi_files(self, options: Iterable) -> None:
         if "nr" in options:
             self.format_nr()

-    def download_gtdb_files(self) -> None:
-        gtdb_taxdump_url = self.config.get("database_urls", "gtdb_taxdmp")
-        proteins_aa_reps_url = self.config.get("database_urls", "proteins_aa_reps")
-
-        # User path:
-        gtdb_taxdump_filepath = self.config.get("gtdb", "gtdb_taxdmp")
-        proteins_aa_reps_filepath = self.config.get("gtdb", "proteins_aa_reps")
-
-        urls = [gtdb_taxdump_url, proteins_aa_reps_url]
-        filepaths = [gtdb_taxdump_filepath, proteins_aa_reps_filepath]
-
-        logger.debug(f"starting GTDB databases download")
-        for url, filepath in zip(urls, filepaths):
-            cmd = ["wget", url, "-O", filepath]
-            full_path = os.path.abspath(filepath)
-            dir_path = os.path.dirname(full_path)
-            if not os.path.exists(dir_path):
-                os.makedirs(dir_path)
-                logger.debug(f"Created missing database directory: {dir_path}")
-            logger.debug(" ".join(cmd))
-            subprocess.run(
-                cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, check=True
-            )
+    def download_and_format_gtdb_files(self) -> dict:
+        # remote host and release
+        gtdb_host = self.config.get(
+            "gtdb", "host"
+        )  # e.g. data.ace.uq.edu.au/public/gtdb/data
+        gtdb_version = self.config.get("gtdb", "release")  # e.g. latest, 220
+        # local file parent directories
+        gtdb_taxdmp_directory = self.config.get("gtdb", "gtdb_taxdmp")
+        proteins_aa_reps_directory = self.config.get("gtdb", "proteins_aa_reps")
+        # ensure the directories exist
+        if not Path(gtdb_taxdmp_directory).exists():
+            logger.info(f"Creating directory: {gtdb_taxdmp_directory}")
+            Path(gtdb_taxdmp_directory).mkdir(parents=True)
+        if not Path(proteins_aa_reps_directory).exists():
+            logger.info(f"Creating directory: {proteins_aa_reps_directory}")
+            Path(proteins_aa_reps_directory).mkdir(parents=True)
+
+        if gtdb_version == "latest":
+            gtdb_version = get_latest_gtdb_version(gtdb_host)
+            logger.info(f"Using 'latest' GTDB version: {gtdb_version}")
+            self.config.set("gtdb", "release", gtdb_version)
+
+        if "." in gtdb_version:
+            # split the subversion off *before* overwriting gtdb_version
+            gtdb_version, gtdb_subversion = gtdb_version.split(".", maxsplit=1)
+        else:
+            gtdb_subversion = "0"
+        if int(gtdb_version) < 220:
+            raise ValueError("GTDB versions <220 cannot be used due to file differences")
+        gtdb_taxdmp_path = Path(
+            gtdb_taxdmp_directory, f"gtdb-taxdump-version-{gtdb_version}.tar.gz"
+        )
+        aa_reps_path = Path(
+            proteins_aa_reps_directory,
+            f"gtdb_proteins_aa_reps-version-{gtdb_version}.{gtdb_subversion}.tar.gz",
+        )
+        gtdb_taxdmp_path = download_gtdb_taxdump(
+            gtdb_version=gtdb_version, outpath=gtdb_taxdmp_path
+        )
+        taxdmp_dir = unpack_gtdb_taxdump(
+            tar_file=gtdb_taxdmp_path, gtdb_version=gtdb_version
+        )
+        combined_faa_path = Path(
+            self.config.get("databases", "gtdb"),
+            f"autometa_formatted_gtdb-version-{gtdb_version}.{gtdb_subversion}.faa.gz",
+        )
+        aa_reps_path = download_proteins_aa_reps(
+            host=gtdb_host,
+            version=gtdb_version,
+            subversion=gtdb_subversion,
+            outpath=aa_reps_path,
+        )
+        create_combined_gtdb_fasta(tar_file=aa_reps_path, outpath=combined_faa_path)
+        return {
+            "taxdmp_dir": taxdmp_dir,
+            "gtdb_aa_reps_path": aa_reps_path,
+            "combined_faa_path": combined_faa_path,
+        }

     def press_hmms(self) -> None:
         """hmmpress markers hmm database files.
@@ -809,19 +853,15 @@ def main():
     elif args.update_ncbi:
         section = "ncbi"
     elif args.update_gtdb:
-        if not os.path.exists(
-            dbs.config.get("gtdb", "proteins_aa_reps")
-        ) and not os.path.exists(dbs.config.get("gtdb", "gtdb_taxdmp")):
-            logger.info(f"GTDB database downloading: ")
-            dbs.download_gtdb_files()
-        # Format GTDB amino acid database
-        gtdb_combined = create_gtdb_db(
-            reps_faa=dbs.config.get("gtdb", "proteins_aa_reps"),
-            dbdir=dbs.config.get("databases", "gtdb"),
-        )
+        paths = dbs.download_and_format_gtdb_files()
+
+        database_path = str(paths.get("combined_faa_path")).replace(".faa.gz", ".dmnd")
+        if os.path.exists(database_path):
+            logger.info(f"GTDB DIAMOND database already exists: {database_path}")
+            sys.exit(0)
         diamond.makedatabase(
-            fasta=gtdb_combined,
-            database=gtdb_combined.replace(".faa", ".dmnd"),
+            fasta=str(paths.get("combined_faa_path")),
+            database=database_path,
             cpus=args.nproc,
         )
         sys.exit(0)
diff --git a/autometa/config/default.config b/autometa/config/default.config
index 60a474f64..02880fb4d 100644
--- a/autometa/config/default.config
+++ b/autometa/config/default.config
@@ -61,9 +61,6 @@ bacteria_single_copy = https://${markers:host}/KwanLab/Autometa/main/autometa/da
 bacteria_single_copy_cutoffs = https://${markers:host}/KwanLab/Autometa/main/autometa/databases/markers/bacteria.single_copy.cutoffs
 archaea_single_copy = https://${markers:host}/KwanLab/Autometa/main/autometa/databases/markers/archaea.single_copy.hmm
 archaea_single_copy_cutoffs = https://${markers:host}/KwanLab/Autometa/main/autometa/databases/markers/archaea.single_copy.cutoffs
-proteins_aa_reps = https://${gtdb:host}/releases/${gtdb:release}/genomic_files_reps/gtdb_proteins_aa_reps.tar.gz
-gtdb_taxdmp = https://github.com/shenwei356/gtdb-taxdump/releases/latest/download/gtdb-taxdump.tar.gz
-

 [checksums]
 taxdump = ftp://${ncbi:host}/pub/taxonomy/taxdump.tar.gz.md5
@@ -85,10 +82,10 @@
 accession2taxid = ${databases:ncbi}/prot.accession2taxid.gz
 nr = ${databases:ncbi}/nr.gz

 [gtdb]
-host = data.gtdb.ecogenomic.org
-release = latest
-proteins_aa_reps = ${databases:gtdb}/gtdb_proteins_aa_reps.tar.gz
-gtdb_taxdmp = ${databases:gtdb}/gtdb-taxdump.tar.gz
+host = data.ace.uq.edu.au/public/gtdb/data
+release = 220
+proteins_aa_reps = ${databases:gtdb}
+gtdb_taxdmp = ${databases:gtdb}

 [markers]
 host = raw.githubusercontent.com
diff --git a/autometa/taxonomy/download_gtdb_files.py b/autometa/taxonomy/download_gtdb_files.py
new file mode 100644
index 000000000..ceabc7225
--- /dev/null
+++ b/autometa/taxonomy/download_gtdb_files.py
@@ -0,0 +1,314 @@
+import gzip
+import hashlib
+import logging
+import math
+import re
+import tarfile
+from pathlib import Path
+
+import requests
+from tqdm import tqdm
+
+# Set up logger
+logger = logging.getLogger(__name__)
+
+
+# --------------------- MD5 Checksum Calculation ---------------------
+def calculate_md5(filepath, chunk_size=1024 * 1024):
+    """Calculates the MD5 checksum of a file"""
+    md5 = hashlib.md5()
+    with open(filepath, "rb") as f:
+        while chunk := f.read(chunk_size):
+            md5.update(chunk)
+    return md5.hexdigest()
+
+
+# --------------------- GTDB Version Handling ---------------------
+def get_latest_gtdb_version(host):
+    """Fetches the latest GTDB version number from the GTDB server"""
+    try:
+        response = requests.get(f"https://{host}/releases/latest/VERSION.txt")
+        response.raise_for_status()
+        version = response.text.splitlines()[0]
+        version = version[1:] if version.startswith("v") else version
+        return version
+    except requests.exceptions.RequestException as e:
+        raise RuntimeError(f"Failed to fetch GTDB version: {e}")
+
+
+# --------------------- Taxdump Release URL Fetch ---------------------
+def get_gtdb_taxdump_release_url(gtdb_version):
+    """Finds the download URL for the GTDB taxdump file for a specific GTDB release"""
+    if gtdb_version == "latest":
+        raise ValueError(
+            "Latest version not supported. Please specify a version number."
+        )
+    try:
+        releases_url = "https://api.github.com/repos/shenwei356/gtdb-taxdump/releases"
+        response = requests.get(releases_url)
+        response.raise_for_status()
+        releases = response.json()
+        version_used = str(math.floor(float(gtdb_version)))
+
+        for release in releases:
+            if f"r{version_used}" in release["name"]:
+                for asset in release["assets"]:
+                    if "gtdb-taxdump.tar.gz" in asset["name"]:
+                        download_url = asset["browser_download_url"]
+                        logger.info(f"Download URL found: {download_url}")
+                        return download_url
+        logger.error(f"Version r{version_used} not found.")
+        return None
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Failed to fetch releases: {e}")
+        return None
+
+
+# --------------------- GTDB Taxdump Download ---------------------
+def download_gtdb_taxdump(gtdb_version, outpath, force=False):
+    """Downloads the GTDB taxdump file for a specific GTDB release"""
+    if not force and Path(outpath).exists():
+        logger.info(f"File already exists: {outpath}")
+        return outpath
+    download_url = get_gtdb_taxdump_release_url(gtdb_version)
+    if not download_url:
+        # Fail loudly rather than return a path to a file that was never written
+        raise RuntimeError(f"No download URL found for GTDB version {gtdb_version}")
+    try:
+        response = requests.get(download_url, stream=True)
+        response.raise_for_status()
+        total_size = int(response.headers.get("content-length", 0))
+        chunk_size = 1024
+        with tqdm(
+            total=total_size, unit="B", unit_scale=True, desc=str(outpath)
+        ) as pbar:
+            with open(outpath, "wb") as f:
+                for chunk in response.iter_content(chunk_size=chunk_size):
+                    if chunk:
+                        f.write(chunk)
+                        pbar.update(len(chunk))
+        logger.info(f"Download complete. File saved to: {outpath}")
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Failed to download GTDB taxdump: {e}")
+    except IOError as e:
+        logger.error(f"File write error: {e}")
+    return outpath
+
+
+def unpack_gtdb_taxdump(tar_file, gtdb_version, outdir=None, force=False):
+    """Extracts the GTDB taxdump file and renames the directory to include the GTDB version"""
+    if not outdir:
+        outdir = Path(tar_file).parent
+    target_dir = f"gtdb-taxdump/R{gtdb_version}"
+    new_dir_prefix = f"gtdb_taxdump-version-{gtdb_version}"
+    if not force and Path(outdir, new_dir_prefix).exists():
+        logger.info(
+            f"Directory already exists: {outdir}/{new_dir_prefix}, use --force to overwrite."
+        )
+        return Path(outdir, new_dir_prefix)
+    with tarfile.open(tar_file, "r:gz") as tar:
+        members = []
+        for member in tar.getmembers():
+            if member.name.startswith(target_dir):
+                # Adjust the path to rename the folder on extraction
+                member.name = member.name.replace(target_dir, new_dir_prefix, 1)
+                members.append(member)
+        if members:
+            tar.extractall(outdir, members=members)
+            logger.info(
+                f"Extracted and renamed {target_dir} to {new_dir_prefix} in {outdir}"
+            )
+        else:
+            logger.error(f"Directory {target_dir} not found in the archive.")
+    return Path(outdir, new_dir_prefix)
+
+
+# --------------------- Proteins AA Reps Download with MD5 Verification ---------------------
+def download_proteins_aa_reps(host, version, subversion, outpath, force=False):
+    """Downloads the GTDB proteins_aa_reps tarball for a specific GTDB release, with MD5 checksum verification"""
+    if not force and Path(outpath).exists():
+        logger.info(f"File already exists: {outpath}")
+        return outpath
+    if version == "latest":
+        try:
+            version = get_latest_gtdb_version(host)
+        except requests.exceptions.RequestException as e:
+            logger.error(f"Failed to fetch GTDB version number: {e}")
+            raise
+    logger.info(f"Downloading gtdb_proteins_aa_reps.tar.gz, version {version}")
+    try:
+        md5sum_url = f"https://{host}/releases/release{version}/{version}.{subversion}/MD5SUM.txt"
+        response = requests.get(md5sum_url)
+        response.raise_for_status()
+        md5sum_lines = response.text.splitlines()
+        expected_md5 = None
+        filename = f"genomic_files_reps/gtdb_proteins_aa_reps_r{version}.tar.gz"
+        for line in md5sum_lines:
+            if filename in line:
+                expected_md5 = line.split()[0]
+                break
+        if not expected_md5:
+            raise RuntimeError(
+                f"MD5 checksum for version {version} not found in {md5sum_url}."
+            )
+    except requests.exceptions.RequestException as e:
+        raise RuntimeError(f"Failed to fetch MD5SUM.txt: {e}")
+    url = f"https://{host}/releases/release{version}/{version}.{subversion}/genomic_files_reps/gtdb_proteins_aa_reps_r{version}.tar.gz"
+    try:
+        with requests.get(url, stream=True) as r:
+            r.raise_for_status()
+            logger.info(f"Downloading from {url}")
+            total_size = int(r.headers.get("content-length", 0))
+            chunk_size = 1024 * 1024
+            md5 = hashlib.md5()
+
+            with tqdm(
+                total=total_size, unit="B", unit_scale=True, desc=str(outpath)
+            ) as pbar:
+                with open(outpath, "wb") as f:
+                    for chunk in r.iter_content(chunk_size=chunk_size):
+                        if chunk:
+                            f.write(chunk)
+                            pbar.update(len(chunk))
+                            md5.update(chunk)
+        calculated_md5 = md5.hexdigest()
+        if calculated_md5 == expected_md5:
+            logger.info(f"MD5 checksum verification passed for {outpath}.")
+        else:
+            logger.error(
+                f"MD5 checksum verification failed for {outpath}. Expected {expected_md5}, got {calculated_md5}."
+            )
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Failed to download the file: {e}")
+    except IOError as e:
+        logger.error(f"File write error: {e}")
+    return outpath
+
+
+# --------------------- Combined GTDB FASTA Creation ---------------------
+def create_combined_gtdb_fasta(tar_file: str, outpath: str, force=False):
+    """
+    Generate a combined faa file used to create the GTDB DIAMOND database.
+
+    Parameters
+    ----------
+    tar_file : str
+        The downloaded gtdb_proteins_aa_reps_*.tar.gz file.
+    outpath : str
+        Path to the combined FASTA file to be written.
+    force : bool, optional
+        If True, overwrite the output file if it already exists, by default False
+
+    Returns
+    -------
+    str
+        Path to combined faa file. This can be used to make a diamond database.
+    """
+    # Check if the output file already exists
+    if not force and Path(outpath).exists():
+        logger.info(f"File already exists: {outpath}")
+        return outpath
+    # Open the combined output file
+    with gzip.open(outpath, "wt") as f_out:
+        # Open the tar.gz archive
+        with tarfile.open(tar_file, "r:gz") as tar:
+            # Initialize tqdm progress bars
+            with tqdm(
+                desc="Files read", unit="file", position=0, leave=True
+            ) as file_pbar, tqdm(
+                desc="Sequences written", unit="seq", position=1, leave=True
+            ) as seq_pbar:
+                # Iterate over members in the tar file
+                for member in tar:
+                    # Check if the member is a file and ends with .faa.gz
+                    if member.isfile() and member.name.endswith(".faa.gz"):
+                        # Search for genome accession in the file name
+                        genome_acc_search = re.search(
+                            r"(GCA_\d+\.\d+|GCF_\d+\.\d+)", member.name
+                        )
+                        if genome_acc_search:
+                            genome_acc = genome_acc_search.group()
+                        else:
+                            raise ValueError(
+                                f"Could not find genome accession for {member.name}"
+                            )
+                        # Extract and read the content of the .faa.gz file
+                        with tar.extractfile(member) as f_in:
+                            with gzip.GzipFile(fileobj=f_in) as gz_in:
+                                for line in gz_in:
+                                    line = line.decode("utf-8")
+                                    if line.startswith(">"):
+                                        seqheader = line.lstrip(">").strip()
+                                        outline = f">{genome_acc} {seqheader}\n"
+                                        # Count each record as its header is seen
+                                        seq_pbar.update(1)
+                                    else:
+                                        outline = line
+                                    f_out.write(outline)
+                        # Update file progress bar after processing each file
+                        file_pbar.update(1)
+    logger.debug(f"Combined GTDB faa file written to {outpath}")
+    return outpath
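+
+
+# Illustrative usage (a sketch; the paths are examples, not enforced by this module):
+#
+#     create_combined_gtdb_fasta(
+#         tar_file="gtdb_proteins_aa_reps-version-220.0.tar.gz",
+#         outpath="autometa_formatted_gtdb-version-220.0.faa.gz",
+#     )
+#
+# The resulting gzipped FASTA has each header prefixed with its genome accession,
+# so downstream steps can map each protein back to its source genome.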
+
+
+def download_and_format(gtdb_host, gtdb_version, single_dir, force=False):
+    """
+    Download and format the GTDB files.
+
+    Parameters
+    ----------
+    gtdb_host : str
+        The GTDB host to download files from.
+    gtdb_version : str
+        The GTDB version to download.
+    single_dir : str
+        The single directory to download and format files into.
+    force : bool, optional
+        If True, re-download and overwrite existing files, by default False
+    """
+    if gtdb_version == "latest":
+        gtdb_version = get_latest_gtdb_version(gtdb_host)
+        logger.info(f"Using 'latest' GTDB version: {gtdb_version}")
+
+    if "." in gtdb_version:
+        # split the subversion off *before* overwriting gtdb_version
+        gtdb_version, gtdb_subversion = gtdb_version.split(".", maxsplit=1)
+    else:
+        gtdb_subversion = "0"
+    gtdb_taxdmp_path = Path(single_dir, f"gtdb-taxdump-version-{gtdb_version}.tar.gz")
+    # have to rename because the GTDB file doesn't have the subversion in its name
+    aa_reps_path = Path(
+        single_dir,
+        f"gtdb_proteins_aa_reps-version-{gtdb_version}.{gtdb_subversion}.tar.gz",
+    )
+    gtdb_taxdmp_path = download_gtdb_taxdump(
+        gtdb_version=gtdb_version, outpath=gtdb_taxdmp_path, force=force
+    )
+    taxdmp_dir = unpack_gtdb_taxdump(
+        tar_file=gtdb_taxdmp_path, gtdb_version=gtdb_version, force=force
+    )
+    aa_reps_path = download_proteins_aa_reps(
+        host=gtdb_host,
+        version=gtdb_version,
+        subversion=gtdb_subversion,
+        outpath=aa_reps_path,
+        force=force,
+    )
+    combined_gtdb_fasta = create_combined_gtdb_fasta(
+        tar_file=aa_reps_path,
+        outpath=Path(
+            single_dir,
+            f"autometa_formatted_gtdb-version-{gtdb_version}.{gtdb_subversion}.faa.gz",
+        ),
+        force=force,
+    )
+    return {
+        "gtdb_taxdmp_path": gtdb_taxdmp_path,
+        "taxdmp_dir": taxdmp_dir,
+        "aa_reps_path": aa_reps_path,
+        "combined_gtdb_fasta": combined_gtdb_fasta,
+    }
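+
+
+if __name__ == "__main__":
+    # Minimal manual-run sketch (an illustration only; this module is not a
+    # supported CLI): fetch and format release 220 into a local ./gtdb directory.
+    logging.basicConfig(level=logging.INFO)
+    Path("gtdb").mkdir(exist_ok=True)
+    paths = download_and_format(
+        gtdb_host="data.ace.uq.edu.au/public/gtdb/data",
+        gtdb_version="220",
+        single_dir="gtdb",
+    )
+    logger.info(paths)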
diff --git a/autometa/taxonomy/gtdb.py b/autometa/taxonomy/gtdb.py
index 36590b9b0..6af8fd92e 100644
--- a/autometa/taxonomy/gtdb.py
+++ b/autometa/taxonomy/gtdb.py
@@ -9,14 +9,12 @@
 import gzip
 import logging
 import os
-import re
-import tarfile
-import glob

 from typing import Dict, Set, Tuple
 from itertools import chain
 from tqdm import tqdm
 from typing import Dict
+from configparser import ConfigParser

 import pandas as pd
 import multiprocessing as mp
@@ -24,91 +22,47 @@
 from autometa.common.utilities import file_length, is_gz_file
 from autometa.common.external import diamond
 from autometa.taxonomy.database import TaxonomyDatabase
-
+from autometa.taxonomy.download_gtdb_files import (
+    create_combined_gtdb_fasta,
+    get_latest_gtdb_version,
+)
+from autometa.config.utilities import DEFAULT_CONFIG

 logger = logging.getLogger(__name__)


-def create_gtdb_db(reps_faa: str, dbdir: str) -> str:
-    """
-    Generate a combined faa file to create the GTDB-t database.
-
-    Parameters
-    ----------
-    reps_faa : str
-        Directory having faa file of all representative genomes. Can be tarballed.
-    dbdir : str
-        Path to output directory.
-
-    Returns
-    -------
-    str
-        Path to combined faa file. This can be used to make a diamond database.
-    """
-
-    if reps_faa.endswith(".tar.gz"):
-        logger.debug(
-            f"Extracting tarball containing GTDB ref genome animo acid data sequences to: {dbdir}/protein_faa_reps"
-        )
-        tar = tarfile.open(reps_faa)
-        tar.extractall(path=dbdir)
-        tar.close()
-        logger.debug("Extraction done.")
-        reps_faa = dbdir
-
-    genome_protein_faa_filepaths = glob.glob(
-        os.path.join(reps_faa, "**", "*_protein.faa*"),
-        recursive=True
-        # To find *_protein.faa and *_protein.faa.gz files
-    )
-
-    faa_index: Dict[str, str] = {}
-    for genome_protein_faa_filepath in genome_protein_faa_filepaths:
-        # Regex to get the genome accession from the file path
-        genome_acc_search = re.search(
-            r"GCA_\d+.\d?|GCF_\d+.\d?", genome_protein_faa_filepath
-        )
-        if genome_acc_search:
-            faa_index[genome_protein_faa_filepath] = genome_acc_search.group()
-
-    # Create dbdir if it doesn't exist
-    if not os.path.isdir(dbdir):
-        os.makedirs(dbdir)
-
-    logger.debug(f"Merging {len(faa_index):,} faa files.")
-    combined_faa = os.path.join(dbdir, "gtdb.faa")
-    with open(combined_faa, "w") as f_out:
-        for faa_file, acc in faa_index.items():
-            with gzip.open(faa_file, "rb") as f_in:
-                for line in f_in:
-                    line = line.decode("utf-8")
-                    if line.startswith(">"):
-                        seqheader = line.lstrip(">").strip()
-                        outline = f">{acc} {seqheader}\n"
-                    else:
-                        outline = line
-                    f_out.write(outline)
-    logger.debug(f"Combined GTDB faa file written to {combined_faa}")
-    return combined_faa
-
-
 class GTDB(TaxonomyDatabase):
     """Taxonomy utilities for GTDB databases."""

-    def __init__(self, dbdir: str, verbose: bool = True):
+    def __init__(self, dbdir: str, verbose: bool = True, config=DEFAULT_CONFIG):
         """
         Instantiates the GTDB class
         """
+        if not isinstance(config, ConfigParser):
+            raise TypeError(f"config is not ConfigParser : {type(config)}")
+        self.config = config
+        # before instantiating the class, check if the GTDB database is present
+        gtdb_version = self.config.get("gtdb", "release")
+        if gtdb_version == "latest":
+            gtdb_version = get_latest_gtdb_version(self.config.get("gtdb", "host"))
+            logger.info(f"Using 'latest' GTDB version: {gtdb_version}")
+        if "." in gtdb_version:
+            # split the subversion off *before* overwriting gtdb_version
+            gtdb_version, gtdb_subversion = gtdb_version.split(".", maxsplit=1)
+        else:
+            gtdb_subversion = "0"
         self.dbdir = dbdir
         self.verbose = verbose
         self.disable = not self.verbose
-        self.dmnd_db = os.path.join(self.dbdir, "gtdb.dmnd")
-        self.accession2taxid = os.path.join(self.dbdir, "taxid.map")
-        self.nodes_fpath = os.path.join(self.dbdir, "nodes.dmp")
-        self.names_fpath = os.path.join(self.dbdir, "names.dmp")
-        self.merged_fpath = os.path.join(self.dbdir, "merged.dmp")
-        self.delnodes_fpath = os.path.join(self.dbdir, "delnodes.dmp")
+        self.dmnd_db = os.path.join(
+            self.config.get("databases", "gtdb"),
+            f"autometa_formatted_gtdb-version-{gtdb_version}.{gtdb_subversion}.dmnd",
+        )
+        self.accession2taxid = os.path.join(dbdir, "taxid.map")
+        self.nodes_fpath = os.path.join(dbdir, "nodes.dmp")
+        self.names_fpath = os.path.join(dbdir, "names.dmp")
+        self.merged_fpath = os.path.join(dbdir, "merged.dmp")
+        self.delnodes_fpath = os.path.join(dbdir, "delnodes.dmp")
         self.verify_databases()
         self.names = self.parse_names()
         self.nodes = self.parse_nodes()
@@ -341,7 +295,7 @@ def main():
     parser.add_argument(
         "--reps-faa",
-        help="Path to directory containing GTDB ref genome animo acid data sequences. Can be tarballed.",
+        help="Path to the tarballed GTDB reference genome amino acid data sequences (gtdb_proteins_aa_reps_*.tar.gz).",
         required=True,
     )
     parser.add_argument(
@@ -355,7 +309,8 @@ def main():

     args = parser.parse_args()

-    gtdb_combined = create_gtdb_db(reps_faa=args.reps_faa, dbdir=args.dbdir)
+    gtdb_combined = create_combined_gtdb_fasta(
+        tar_file=args.reps_faa,
+        outpath=os.path.join(args.dbdir, "gtdb.faa.gz"),
+    )
+
     diamond.makedatabase(
         fasta=gtdb_combined,
-        database=gtdb_combined.replace(".faa", ".dmnd"),
+        database=gtdb_combined.replace(".faa.gz", ".dmnd"),
         cpus=args.nproc,
diff --git a/docs/source/databases.rst b/docs/source/databases.rst
index 6850d7550..68a0aee16 100644
--- a/docs/source/databases.rst
+++ b/docs/source/databases.rst
@@ -13,17 +13,17 @@ Markers
 #######

 .. code-block:: bash
-    
+
     # Point Autometa to where you would like your markers database directory
     autometa-config \
         --section databases --option markers \
         --value <path/to/markers/database/directory>
-    
+
     # Update your markers database directory
     autometa-update-databases --update-markers

 .. alert::
-    
+
     Do NOT use a trailing slash, e.g. NO ``/`` for the database directory paths!

 Links to these markers files and their associated cutoff values are below:
@@ -68,7 +68,7 @@ Genome Taxonomy Database (GTDB)
 ###############################

 If you would like to incorporate the benefits of using the Genome Taxonomy Database,
-you can either run the following script or manually download the respective databases.
+you can either run the following script or manually download the respective databases. GTDB version 220 or later is required.

 .. code-block:: bash
@@ -81,7 +81,7 @@ you can either run the following script or manually download the respective data
     autometa-config \
         --section gtdb --option release \
         --value latest
-    # Or --value r207 or --value r202, etc.
+    # Or a version number like ``--value 220``, or ``--value 220.0``, etc.

     # Download and format the configured GTDB databases release
     autometa-update-databases --update-gtdb
@@ -93,28 +93,21 @@
 See ``autometa-update-databases -h`` and ``autometa-config -h`` for full list of options.

-The previous command will download the following GTDB databases and format the `gtdb_proteins_aa_reps.tar.gz` to generate `gtdb.dmnd` to be used by Autometa:
+The previous command will download the following GTDB databases and format them for use by Autometa. The filenames will be modified to include the release version number for reproducibility.
+
+The original files:

 - Amino acid sequences of representative genome
-  - `gtdb_proteins_aa_reps.tar.gz <https://data.gtdb.ecogenomic.org/releases/latest/genomic_files_reps/gtdb_proteins_aa_reps.tar.gz>`_
+  - `gtdb_proteins_aa_reps.tar.gz <https://data.ace.uq.edu.au/public/gtdb/data/releases/latest/genomic_files_reps/gtdb_proteins_aa_reps.tar.gz>`_
 - gtdb-taxdump.tar.gz from `shenwei356/gtdb-taxdump <https://github.com/shenwei356/gtdb-taxdump>`_
   - `gtdb-taxdump.tar.gz <https://github.com/shenwei356/gtdb-taxdump/releases/latest/download/gtdb-taxdump.tar.gz>`_

-Once unzipped `gtdb-taxdump.tar.gz` will have the taxdump files of all the respective GTDB releases.
-Make sure that the release you use is in line with the `gtdb_proteins_aa_reps.tar.gz` release version.
-It's better to always use the latest version.
-
-All the taxonomy files for a specific taxonomy database should be in a single directory.
-You can now copy the taxdump files of the desired release version in the sample directory as `gtdb.dmnd`
+The initial download and formatting of the GTDB databases can take some time. The GTDB databases are large, and downloading/formatting requires ~283 GB of hard disk space.

-Alternatively if you have manually downloaded `gtdb_proteins_aa_reps.tar.gz` and `gtdb-taxdump.tar.gz` you can run the
-following command to format the `gtdb_proteins_aa_reps.tar.gz` to generate `gtdb.dmnd` and make it ready for Autometa.
-
-.. code-block:: bash
-
-   autometa-setup-gtdb --reps-faa <reps_faa> --dbdir <dbdir> --cpus 20
-
-.. note::
-
-    Again Make sure that the formatted `gtdb_proteins_aa_reps.tar.gz` database and gtdb taxdump files are in the same directory.
+For version 220, the file sizes are approximately:
+
+- 77 MB gtdb-taxdump-version-220.tar.gz
+- 67 GB gtdb_proteins_aa_reps-version-220.tar.gz
+- 149 GB autometa_formatted_gtdb-version-220.0.dmnd
+- 103 MB gtdb_taxdump-version-220/
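+
+After the downloads finish, ``autometa-update-databases --update-gtdb`` also builds the
+DIAMOND database from the combined FASTA, skipping the build if the ``.dmnd`` file
+already exists. As a rough sketch, a formatted database directory for release 220 is
+expected to contain files along these lines (names depend on the configured release):
+
+.. code-block:: bash
+
+    ls <path/to/gtdb/database/directory>
+    # gtdb-taxdump-version-220.tar.gz
+    # gtdb_proteins_aa_reps-version-220.0.tar.gz
+    # gtdb_taxdump-version-220/
+    # autometa_formatted_gtdb-version-220.0.faa.gz
+    # autometa_formatted_gtdb-version-220.0.dmnd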
diff --git a/tests/unit_tests/test_gtdb_download.py b/tests/unit_tests/test_gtdb_download.py
new file mode 100644
index 000000000..9f9a9c08d
--- /dev/null
+++ b/tests/unit_tests/test_gtdb_download.py
@@ -0,0 +1,173 @@
+import pytest
+from unittest import mock
+import requests
+import logging
+
+from autometa.taxonomy.download_gtdb_files import (
+    get_latest_gtdb_version,
+    get_gtdb_taxdump_release_url,
+    download_gtdb_taxdump,
+    download_proteins_aa_reps,
+)
+
+# Module-level logger for these tests
+logger = logging.getLogger(__name__)
+
+
+@pytest.fixture
+def mock_requests_get():
+    with mock.patch("requests.get") as mock_get:
+        yield mock_get
+
+
+def test_get_latest_gtdb_version_success(mock_requests_get):
+    # Mock a successful request response
+    mock_response = mock.Mock()
+    mock_response.status_code = 200
+    mock_response.text = "v220\n"
+    mock_requests_get.return_value = mock_response
+
+    host = "data.ace.uq.edu.au/public/gtdb/data"
+    version = get_latest_gtdb_version(host)
+
+    assert version == "220"
+    mock_requests_get.assert_called_once_with(
+        f"https://{host}/releases/latest/VERSION.txt"
+    )
+
+
+def test_get_latest_gtdb_version_fail(mock_requests_get):
+    # Mock a failed request
+    mock_requests_get.side_effect = requests.exceptions.RequestException(
+        "Error occurred"
+    )
+
+    host = "data.ace.uq.edu.au/public/gtdb/data"
+    with pytest.raises(
+        RuntimeError, match="Failed to fetch GTDB version: Error occurred"
+    ):
+        get_latest_gtdb_version(host)
+
+
+def test_get_gtdb_taxdump_release_url_success(mock_requests_get):
+    # Mock a response from GitHub API with a valid release
+    mock_response = mock.Mock()
+    mock_response.status_code = 200
+    mock_response.json.return_value = [
+        {
+            "name": "r220",
+            "assets": [
+                {
+                    "name": "gtdb-taxdump.tar.gz",
+                    "browser_download_url": "https://example.com/download",
+                }
+            ],
+        }
+    ]
+    mock_requests_get.return_value = mock_response
+
+    gtdb_version = "220"
+    url = get_gtdb_taxdump_release_url(gtdb_version)
+
+    assert url == "https://example.com/download"
+    mock_requests_get.assert_called_once_with(
+        "https://api.github.com/repos/shenwei356/gtdb-taxdump/releases"
+    )
+
+
+def test_get_gtdb_taxdump_release_url_not_found(mock_requests_get):
+    # Mock a response with no matching version
+    mock_response = mock.Mock()
+    mock_response.status_code = 200
+    mock_response.json.return_value = [
+        {"name": "r219", "assets": [{"name": "gtdb-taxdump.tar.gz"}]}
+    ]
+    mock_requests_get.return_value = mock_response
+
+    gtdb_version = "220"
+    url = get_gtdb_taxdump_release_url(gtdb_version)
+
+    assert url is None
+    mock_requests_get.assert_called_once_with(
+        "https://api.github.com/repos/shenwei356/gtdb-taxdump/releases"
+    )
+
+
+def test_download_gtdb_taxdump_file_exists():
+    # Mock the file already existing
+    with mock.patch("pathlib.Path.exists", return_value=True):
+        with mock.patch("requests.get") as mock_get:
+            download_gtdb_taxdump(
+                "220", "/some/dir/gtdb-taxdump-R220.tar.gz", force=False
+            )
+            mock_get.assert_not_called()
+
+
+def test_download_gtdb_taxdump_success(mock_requests_get):
+    # Mock a successful file download
+    mock_response = mock.Mock()
+    mock_response.status_code = 200
+    mock_response.headers = {"content-length": "1024"}
+    mock_response.iter_content = mock.Mock(return_value=[b"chunk1", b"chunk2"])
+    # Mock the JSON response for the releases
+    mock_response.json.return_value = [
+        {
+            "name": "r220",
+            "assets": [
+                {
+                    "name": "gtdb-taxdump.tar.gz",
+                    "browser_download_url": "https://example.com/download",
+                }
+            ],
+        }
+    ]
+    mock_requests_get.return_value = mock_response
+    with mock.patch("pathlib.Path.exists", return_value=False):
+        with mock.patch("builtins.open", mock.mock_open()) as mock_file:
+            with mock.patch("autometa.taxonomy.download_gtdb_files.tqdm") as mock_tqdm:
+                download_gtdb_taxdump(
+                    "220", "/some/dir/gtdb-taxdump-R220.tar.gz", force=False
+                )
+                mock_requests_get.assert_called()
+                mock_file.assert_called_once_with(
+                    "/some/dir/gtdb-taxdump-R220.tar.gz", "wb"
+                )
+                mock_tqdm.assert_called_once()
+
+
+def test_download_proteins_aa_reps_success(mock_requests_get):
+    # Mock successful file download for proteins_aa_reps
+    mock_response = mock.Mock()
+    mock_response.status_code = 200
+    mock_response.headers = {"content-length": "1024"}
+    mock_response.iter_content = mock.Mock(return_value=[b"chunk1", b"chunk2"])
+    # Configure the mock to support the context manager protocol
+    mock_response.__enter__ = mock.Mock(return_value=mock_response)
+    mock_response.__exit__ = mock.Mock(return_value=None)
+
+    # Mock MD5SUM response with expected checksum
+    md5sum_response = mock.Mock()
+    md5sum_response.status_code = 200
+    md5sum_response.text = "d41d8cd98f00b204e9800998ecf8427e genomic_files_reps/gtdb_proteins_aa_reps_r220.tar.gz"
+    with mock.patch("pathlib.Path.exists", return_value=False):
+        with mock.patch(
+            "requests.get", side_effect=[md5sum_response, mock_response]
+        ) as mock_get:
+            with mock.patch("builtins.open", mock.mock_open()) as mock_file:
+                with mock.patch(
+                    "autometa.taxonomy.download_gtdb_files.tqdm"
+                ) as mock_tqdm:
+                    download_proteins_aa_reps(
+                        "data.ace.uq.edu.au/public/gtdb/data",
+                        "220",
+                        "1",
+                        "/some/dir/gtdb_proteins_aa_reps-R220.tar.gz",
+                        force=False,
+                    )
+                    assert (
+                        mock_get.call_count == 2
+                    )  # One for MD5, one for the actual download
+                    mock_file.assert_called_once_with(
+                        "/some/dir/gtdb_proteins_aa_reps-R220.tar.gz", "wb"
+                    )
+                    mock_tqdm.assert_called_once()
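+
+
+# The test below is an illustrative addition (a sketch, not part of the original
+# suite): it exercises create_combined_gtdb_fasta end-to-end on a tiny, locally
+# built tarball and checks that each FASTA header gets its genome accession prefix.
+def test_create_combined_gtdb_fasta_prefixes_accessions(tmp_path):
+    import gzip
+    import tarfile
+
+    from autometa.taxonomy.download_gtdb_files import create_combined_gtdb_fasta
+
+    # Build a gzipped member FASTA named like a GTDB protein file
+    member = tmp_path / "GCA_000123456.1_protein.faa.gz"
+    with gzip.open(member, "wt") as fh:
+        fh.write(">seq1 hypothetical protein\nMKT\n")
+
+    # Wrap it in a tar.gz laid out like gtdb_proteins_aa_reps
+    tar_path = tmp_path / "gtdb_proteins_aa_reps.tar.gz"
+    with tarfile.open(tar_path, "w:gz") as tar:
+        tar.add(
+            member, arcname="protein_faa_reps/bacteria/GCA_000123456.1_protein.faa.gz"
+        )
+
+    outpath = tmp_path / "combined.faa.gz"
+    create_combined_gtdb_fasta(tar_file=str(tar_path), outpath=str(outpath))
+
+    with gzip.open(outpath, "rt") as fh:
+        lines = fh.read().splitlines()
+    assert lines[0] == ">GCA_000123456.1 seq1 hypothetical protein"
+    assert lines[1] == "MKT"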