Skip to content

Commit 7ebb18a

Browse files
committed
Make the NCBI query optional in bactopia-search, fix an issue related to a missing tax ID in bactopia-search, and remove the executor dependency
1 parent d74cc19 commit 7ebb18a

File tree

12 files changed

+559
-543
lines changed

12 files changed

+559
-543
lines changed

.gitignore

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,13 @@
1+
temp/
12
bactopia-exclude.tsv
23
bactopia-report.tsv
34
bactopia-summary.txt
5+
accessions.txt
6+
bactopia-accessions.txt
7+
bactopia-filtered.txt
8+
bactopia-metadata.txt
9+
bactopia-search.txt
10+
species_genome_size.txt.gz
411
.vscode/settings.json
512

613
# Byte-compiled / optimized / DLL files

bactopia/cli/atb/atb_downloader.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
import bactopia
1212
from bactopia.atb import parse_atb_file_list
13-
from bactopia.ncbi import is_biosample, taxid2name
13+
from bactopia.databases.ncbi import is_biosample, taxid2name
1414
from bactopia.utils import (
1515
download_url,
1616
execute,

bactopia/cli/datasets.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from rich.logging import RichHandler
1212

1313
import bactopia
14-
from bactopia.utils import execute, validate_file
14+
from bactopia.utils import execute
1515

1616
BACTOPIA_CACHEDIR = os.getenv("BACTOPIA_CACHEDIR", f"{Path.home()}/.bactopia")
1717

bactopia/cli/download.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import os
33
import sys
44
import time
5+
from datetime import datetime
56
from pathlib import Path
67

78
import rich
@@ -11,7 +12,7 @@
1112
from rich.logging import RichHandler
1213

1314
import bactopia
14-
from bactopia.utils import execute, validate_file
15+
from bactopia.utils import execute
1516

1617
BACTOPIA_CACHEDIR = os.getenv("BACTOPIA_CACHEDIR", f"{Path.home()}/.bactopia")
1718
CONDA_CACHEDIR = os.getenv("NXF_CONDA_CACHEDIR", f"{BACTOPIA_CACHEDIR}/conda")
@@ -262,7 +263,8 @@ def build_env(
262263
build_conda_env(
263264
conda_method, envinfo["conda"], conda_prefix, max_retry=max_retry
264265
)
265-
execute(f"date > {conda_complete}")
266+
with open(conda_complete, "w") as f:
267+
f.write(f"{datetime.now().isoformat()}\n")
266268
BUILT_ALREADY["conda"][
267269
conda_prefix
268270
] = f"Already built {envname} ({conda_prefix}) this run, skipping rebuild"
@@ -363,11 +365,17 @@ def build_conda_env(
363365
allow_fail = False
364366
success = False
365367
while not success:
368+
# Cleanup existing directory if they exist
369+
execute(
370+
f"rm -rf {conda_path}",
371+
allow_fail=allow_fail,
372+
)
366373
result = execute(
367-
f"rm -rf {conda_path} && {program} create -y -p {conda_path} -c conda-forge -c bioconda {conda_env}",
374+
f"{program} create -y -p {conda_path} -c conda-forge -c bioconda {conda_env}",
368375
allow_fail=allow_fail,
369376
)
370-
if not result:
377+
if result:
378+
# Non-zero exit code
371379
if retry > max_retry:
372380
allow_fail = True
373381
retry += 1

bactopia/cli/search.py

Lines changed: 35 additions & 104 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,9 @@
1313
from rich.logging import RichHandler
1414

1515
import bactopia
16-
from bactopia.ncbi import is_biosample
17-
from bactopia.utils import get_ncbi_genome_size
16+
from bactopia.databases.ena import get_run_info
17+
from bactopia.databases.ncbi import get_ncbi_genome_size, is_biosample
18+
from bactopia.utils import chunk_list
1819

1920
# Set up Rich
2021
stderr = rich.console.Console(stderr=True)
@@ -45,6 +46,7 @@
4546
"name": "Additional Options",
4647
"options": [
4748
"--genome-size",
49+
"--use-ncbi-genome-size",
4850
"--outdir",
4951
"--prefix",
5052
"--force",
@@ -56,87 +58,6 @@
5658
},
5759
]
5860
}
59-
ENA_URL = "https://www.ebi.ac.uk/ena/portal/api/search"
60-
61-
62-
def get_ena_metadata(query: str, is_accession: bool, limit: int):
63-
"""Fetch metadata from ENA.
64-
https://docs.google.com/document/d/1CwoY84MuZ3SdKYocqssumghBF88PWxUZ/edit#heading=h.ag0eqy2wfin5
65-
66-
Args:
67-
query (str): The query to search for.
68-
is_accession (bool): If the query is an accession or not.
69-
limit (int): The maximum number of records to return.
70-
71-
Returns:
72-
list: Records associated with the accession.
73-
"""
74-
data = {
75-
"dataPortal": "ena",
76-
"dccDataOnly": "false",
77-
"download": "false",
78-
"result": "read_run",
79-
"format": "tsv",
80-
"limit": limit,
81-
"fields": "all",
82-
}
83-
84-
if is_accession:
85-
data["includeAccessions"] = query
86-
else:
87-
data["query"] = (
88-
f'"{query} AND library_source=GENOMIC AND '
89-
"(library_strategy=OTHER OR library_strategy=WGS OR "
90-
"library_strategy=WGA) AND (library_selection=MNase OR "
91-
"library_selection=RANDOM OR library_selection=unspecified OR "
92-
'library_selection="size fractionation")"'
93-
)
94-
95-
headers = {"accept": "*/*", "Content-type": "application/x-www-form-urlencoded"}
96-
97-
r = requests.post(ENA_URL, headers=headers, data=data)
98-
if r.status_code == requests.codes.ok:
99-
data = []
100-
col_names = None
101-
for line in r.text.split("\n"):
102-
cols = line.split("\t")
103-
if line:
104-
if col_names:
105-
data.append(dict(zip(col_names, cols)))
106-
else:
107-
col_names = cols
108-
return [True, data]
109-
else:
110-
return [False, [r.status_code, r.text]]
111-
112-
113-
def get_run_info(
114-
sra_query: str, ena_query: str, is_accession: bool, limit: int = 1000000
115-
) -> tuple:
116-
"""Retrieve a list of samples available from ENA.
117-
118-
The first attempt will be against ENA, and if that fails, SRA will be queried. This should
119-
capture those samples not yet synced between ENA and SRA.
120-
121-
Args:
122-
sra_query (str): A formatted query for SRA searches.
123-
ena_query (str): A formatted query for ENA searches.
124-
is_accession (bool): If the query is an accession or not.
125-
limit (int): The maximum number of records to return.
126-
127-
Returns:
128-
tuple: Records associated with the accession.
129-
"""
130-
131-
logging.debug("Querying ENA for metadata...")
132-
success, ena_data = get_ena_metadata(ena_query, is_accession, limit=limit)
133-
if success:
134-
return success, ena_data
135-
else:
136-
logging.error("There was an issue querying ENA, exiting...")
137-
logging.error(f"STATUS: {ena_data[0]}")
138-
logging.error(f"TEXT: {ena_data[1]}")
139-
sys.exit(1)
14061

14162

14263
def parse_accessions(
@@ -209,13 +130,26 @@ def parse_accessions(
209130

210131
# Genome size
211132
gsize = genome_size
212-
if not gsize:
213-
if result["tax_id"] in genome_sizes:
214-
gsize = genome_sizes[result["tax_id"]]["expected_ungapped_length"]
133+
if not gsize and genome_sizes:
134+
if result["tax_id"]:
135+
if result["tax_id"] in genome_sizes:
136+
if "expected_ungapped_length" in genome_sizes[result["tax_id"]]:
137+
gsize = genome_sizes[result["tax_id"]][
138+
"expected_ungapped_length"
139+
]
140+
else:
141+
logging.warning(
142+
f"Could not find genome size for {result['scientific_name']} (Tax ID {result['tax_id']})"
143+
)
144+
else:
145+
logging.warning(
146+
f"Could not find genome size for {result['scientific_name']} (Tax ID {result['tax_id']})"
147+
)
215148
else:
216149
logging.warning(
217-
f"Could not find genome size for {result['scientific_name']} (Tax ID {result['tax_id']})"
150+
f"Accession ({result['experiment_accession']}) does not have a tax_id associated with it."
218151
)
152+
result["scientific_name"] = "UNKNOWN_SPECIES"
219153

220154
if passes:
221155
accessions.append(
@@ -234,15 +168,6 @@ def parse_accessions(
234168
return [list(set(accessions)), filtered]
235169

236170

237-
def chunks(chunk: list, total: int) -> list:
238-
"""
239-
Yield successive n-sized chunks from l.
240-
https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks?page=1&tab=votes#tab-top
241-
"""
242-
for i in range(0, len(chunk), total):
243-
yield chunk[i : i + total]
244-
245-
246171
def parse_query(q, accession_limit, exact_taxon=False):
247172
"""Return the query based on if Taxon ID or BioProject/Study accession."""
248173
import re
@@ -296,13 +221,13 @@ def parse_query(q, accession_limit, exact_taxon=False):
296221
results.append(["taxon_name", f'tax_name("{query}")', f"'{query}'"])
297222

298223
# Split the accessions into set number
299-
for chunk in chunks(bioproject_accessions, accession_limit):
224+
for chunk in chunk_list(bioproject_accessions, accession_limit):
300225
results.append(["bioproject_accession", ",".join(chunk), " OR ".join(chunk)])
301-
for chunk in chunks(biosample_accessions, accession_limit):
226+
for chunk in chunk_list(biosample_accessions, accession_limit):
302227
results.append(["biosample_accession", ",".join(chunk), " OR ".join(chunk)])
303-
for chunk in chunks(experiment_accessions, accession_limit):
228+
for chunk in chunk_list(experiment_accessions, accession_limit):
304229
results.append(["experiment_accession", ",".join(chunk), " OR ".join(chunk)])
305-
for chunk in chunks(run_accessions, accession_limit):
230+
for chunk in chunk_list(run_accessions, accession_limit):
306231
results.append(["run_accession", ",".join(chunk), " OR ".join(chunk)])
307232

308233
return results
@@ -375,6 +300,11 @@ def parse_query(q, accession_limit, exact_taxon=False):
375300
show_default=True,
376301
help="Genome size to be used for all samples, and for calculating min coverage",
377302
)
303+
@click.option(
304+
"--use-ncbi-genome-size",
305+
is_flag=True,
306+
help="If available, use NCBI genome size for species",
307+
)
378308
@click.option(
379309
"--include-empty",
380310
is_flag=True,
@@ -395,6 +325,7 @@ def search(
395325
min_read_length,
396326
min_coverage,
397327
genome_size,
328+
use_ncbi_genome_size,
398329
include_empty,
399330
force,
400331
verbose,
@@ -419,23 +350,23 @@ def search(
419350
if min_coverage and genome_size:
420351
if min_base_count:
421352
logging.error(
422-
"--min_base_count cannot be used with --coverage/--genome_size. Exiting...",
353+
"--min-base-count cannot be used with --min-coverage/--genome-size. Exiting...",
423354
file=sys.stderr,
424355
)
425356
sys.exit(1)
426357
else:
427358
min_base_count = min_coverage * genome_size
428359
elif min_coverage or genome_size:
429360
logging.error(
430-
"--coverage and --genome_size must be used together. Exiting...",
361+
"--min-coverage and --genome-size must be used together. Exiting...",
431362
file=sys.stderr,
432363
)
433364
sys.exit(1)
434365

435366
if biosample_subset > 0:
436367
if not is_biosample(query):
437368
logging.error(
438-
"--biosample_subset requires a single BioSample. Input query: {query} is not a BioSample. Exiting...",
369+
"--biosample-subset requires a single BioSample. Input query: {query} is not a BioSample. Exiting...",
439370
file=sys.stderr,
440371
)
441372
sys.exit(1)
@@ -458,7 +389,7 @@ def search(
458389
accessions_file = f"{outdir}/{prefix}-accessions.txt".replace("//", "/")
459390
filtered_file = f"{outdir}/{prefix}-filtered.txt".replace("//", "/")
460391
summary_file = f"{outdir}/{prefix}-search.txt".replace("//", "/")
461-
genome_sizes = get_ncbi_genome_size()
392+
genome_sizes = get_ncbi_genome_size() if use_ncbi_genome_size else None
462393
for query_type, ena_query, sra_query in queries:
463394
logging.info(f"Submitting query (type - {query_type})")
464395
is_accession = True if query_type.endswith("accession") else False

bactopia/cli/update.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import logging
2-
import os
32
import sys
43
import time
54
from pathlib import Path
@@ -12,7 +11,6 @@
1211
from rich.logging import RichHandler
1312

1413
import bactopia
15-
from bactopia.utils import execute, validate_file
1614

1715
# Set up Rich
1816
stderr = rich.console.Console(stderr=True)

0 commit comments

Comments (0)