Skip to content

Commit 7ebb18a

Browse files
committed
Make the NCBI query optional in bactopia-search, fix an issue related to a missing tax ID in bactopia-search, and remove the executor dependency
1 parent d74cc19 commit 7ebb18a

File tree

12 files changed

+559
-543
lines changed

12 files changed

+559
-543
lines changed

.gitignore

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,13 @@
1+
temp/
12
bactopia-exclude.tsv
23
bactopia-report.tsv
34
bactopia-summary.txt
5+
accessions.txt
6+
bactopia-accessions.txt
7+
bactopia-filtered.txt
8+
bactopia-metadata.txt
9+
bactopia-search.txt
10+
species_genome_size.txt.gz
411
.vscode/settings.json
512

613
# Byte-compiled / optimized / DLL files

bactopia/cli/atb/atb_downloader.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
import bactopia
1212
from bactopia.atb import parse_atb_file_list
13-
from bactopia.ncbi import is_biosample, taxid2name
13+
from bactopia.databases.ncbi import is_biosample, taxid2name
1414
from bactopia.utils import (
1515
download_url,
1616
execute,

bactopia/cli/datasets.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from rich.logging import RichHandler
1212

1313
import bactopia
14-
from bactopia.utils import execute, validate_file
14+
from bactopia.utils import execute
1515

1616
BACTOPIA_CACHEDIR = os.getenv("BACTOPIA_CACHEDIR", f"{Path.home()}/.bactopia")
1717

bactopia/cli/download.py

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import os
33
import sys
44
import time
5+
from datetime import datetime
56
from pathlib import Path
67

78
import rich
@@ -11,7 +12,7 @@
1112
from rich.logging import RichHandler
1213

1314
import bactopia
14-
from bactopia.utils import execute, validate_file
15+
from bactopia.utils import execute
1516

1617
BACTOPIA_CACHEDIR = os.getenv("BACTOPIA_CACHEDIR", f"{Path.home()}/.bactopia")
1718
CONDA_CACHEDIR = os.getenv("NXF_CONDA_CACHEDIR", f"{BACTOPIA_CACHEDIR}/conda")
@@ -262,7 +263,8 @@ def build_env(
262263
build_conda_env(
263264
conda_method, envinfo["conda"], conda_prefix, max_retry=max_retry
264265
)
265-
execute(f"date > {conda_complete}")
266+
with open(conda_complete, "w") as f:
267+
f.write(f"{datetime.now().isoformat()}\n")
266268
BUILT_ALREADY["conda"][
267269
conda_prefix
268270
] = f"Already built {envname} ({conda_prefix}) this run, skipping rebuild"
@@ -363,11 +365,17 @@ def build_conda_env(
363365
allow_fail = False
364366
success = False
365367
while not success:
368+
# Cleanup existing directory if they exist
369+
execute(
370+
f"rm -rf {conda_path}",
371+
allow_fail=allow_fail,
372+
)
366373
result = execute(
367-
f"rm -rf {conda_path} && {program} create -y -p {conda_path} -c conda-forge -c bioconda {conda_env}",
374+
f"{program} create -y -p {conda_path} -c conda-forge -c bioconda {conda_env}",
368375
allow_fail=allow_fail,
369376
)
370-
if not result:
377+
if result:
378+
# Non-zero exit code
371379
if retry > max_retry:
372380
allow_fail = True
373381
retry += 1

bactopia/cli/search.py

Lines changed: 35 additions & 104 deletions
Original file line numberDiff line numberDiff line change
@@ -13,8 +13,9 @@
1313
from rich.logging import RichHandler
1414

1515
import bactopia
16-
from bactopia.ncbi import is_biosample
17-
from bactopia.utils import get_ncbi_genome_size
16+
from bactopia.databases.ena import get_run_info
17+
from bactopia.databases.ncbi import get_ncbi_genome_size, is_biosample
18+
from bactopia.utils import chunk_list
1819

1920
# Set up Rich
2021
stderr = rich.console.Console(stderr=True)
@@ -45,6 +46,7 @@
4546
"name": "Additional Options",
4647
"options": [
4748
"--genome-size",
49+
"--use-ncbi-genome-size",
4850
"--outdir",
4951
"--prefix",
5052
"--force",
@@ -56,87 +58,6 @@
5658
},
5759
]
5860
}
59-
ENA_URL = "https://www.ebi.ac.uk/ena/portal/api/search"
60-
61-
62-
def get_ena_metadata(query: str, is_accession: bool, limit: int):
63-
"""Fetch metadata from ENA.
64-
https://docs.google.com/document/d/1CwoY84MuZ3SdKYocqssumghBF88PWxUZ/edit#heading=h.ag0eqy2wfin5
65-
66-
Args:
67-
query (str): The query to search for.
68-
is_accession (bool): If the query is an accession or not.
69-
limit (int): The maximum number of records to return.
70-
71-
Returns:
72-
list: Records associated with the accession.
73-
"""
74-
data = {
75-
"dataPortal": "ena",
76-
"dccDataOnly": "false",
77-
"download": "false",
78-
"result": "read_run",
79-
"format": "tsv",
80-
"limit": limit,
81-
"fields": "all",
82-
}
83-
84-
if is_accession:
85-
data["includeAccessions"] = query
86-
else:
87-
data["query"] = (
88-
f'"{query} AND library_source=GENOMIC AND '
89-
"(library_strategy=OTHER OR library_strategy=WGS OR "
90-
"library_strategy=WGA) AND (library_selection=MNase OR "
91-
"library_selection=RANDOM OR library_selection=unspecified OR "
92-
'library_selection="size fractionation")"'
93-
)
94-
95-
headers = {"accept": "*/*", "Content-type": "application/x-www-form-urlencoded"}
96-
97-
r = requests.post(ENA_URL, headers=headers, data=data)
98-
if r.status_code == requests.codes.ok:
99-
data = []
100-
col_names = None
101-
for line in r.text.split("\n"):
102-
cols = line.split("\t")
103-
if line:
104-
if col_names:
105-
data.append(dict(zip(col_names, cols)))
106-
else:
107-
col_names = cols
108-
return [True, data]
109-
else:
110-
return [False, [r.status_code, r.text]]
111-
112-
113-
def get_run_info(
114-
sra_query: str, ena_query: str, is_accession: bool, limit: int = 1000000
115-
) -> tuple:
116-
"""Retrieve a list of samples available from ENA.
117-
118-
The first attempt will be against ENA, and if that fails, SRA will be queried. This should
119-
capture those samples not yet synced between ENA and SRA.
120-
121-
Args:
122-
sra_query (str): A formatted query for SRA searches.
123-
ena_query (str): A formatted query for ENA searches.
124-
is_accession (bool): If the query is an accession or not.
125-
limit (int): The maximum number of records to return.
126-
127-
Returns:
128-
tuple: Records associated with the accession.
129-
"""
130-
131-
logging.debug("Querying ENA for metadata...")
132-
success, ena_data = get_ena_metadata(ena_query, is_accession, limit=limit)
133-
if success:
134-
return success, ena_data
135-
else:
136-
logging.error("There was an issue querying ENA, exiting...")
137-
logging.error(f"STATUS: {ena_data[0]}")
138-
logging.error(f"TEXT: {ena_data[1]}")
139-
sys.exit(1)
14061

14162

14263
def parse_accessions(
@@ -209,13 +130,26 @@ def parse_accessions(
209130

210131
# Genome size
211132
gsize = genome_size
212-
if not gsize:
213-
if result["tax_id"] in genome_sizes:
214-
gsize = genome_sizes[result["tax_id"]]["expected_ungapped_length"]
133+
if not gsize and genome_sizes:
134+
if result["tax_id"]:
135+
if result["tax_id"] in genome_sizes:
136+
if "expected_ungapped_length" in genome_sizes[result["tax_id"]]:
137+
gsize = genome_sizes[result["tax_id"]][
138+
"expected_ungapped_length"
139+
]
140+
else:
141+
logging.warning(
142+
f"Could not find genome size for {result['scientific_name']} (Tax ID {result['tax_id']})"
143+
)
144+
else:
145+
logging.warning(
146+
f"Could not find genome size for {result['scientific_name']} (Tax ID {result['tax_id']})"
147+
)
215148
else:
216149
logging.warning(
217-
f"Could not find genome size for {result['scientific_name']} (Tax ID {result['tax_id']})"
150+
f"Accession ({result['experiment_accession']}) does not have a tax_id associated with it."
218151
)
152+
result["scientific_name"] = "UNKNOWN_SPECIES"
219153

220154
if passes:
221155
accessions.append(
@@ -234,15 +168,6 @@ def parse_accessions(
234168
return [list(set(accessions)), filtered]
235169

236170

237-
def chunks(chunk: list, total: int) -> list:
238-
"""
239-
Yield successive n-sized chunks from l.
240-
https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks?page=1&tab=votes#tab-top
241-
"""
242-
for i in range(0, len(chunk), total):
243-
yield chunk[i : i + total]
244-
245-
246171
def parse_query(q, accession_limit, exact_taxon=False):
247172
"""Return the query based on if Taxon ID or BioProject/Study accession."""
248173
import re
@@ -296,13 +221,13 @@ def parse_query(q, accession_limit, exact_taxon=False):
296221
results.append(["taxon_name", f'tax_name("{query}")', f"'{query}'"])
297222

298223
# Split the accessions into set number
299-
for chunk in chunks(bioproject_accessions, accession_limit):
224+
for chunk in chunk_list(bioproject_accessions, accession_limit):
300225
results.append(["bioproject_accession", ",".join(chunk), " OR ".join(chunk)])
301-
for chunk in chunks(biosample_accessions, accession_limit):
226+
for chunk in chunk_list(biosample_accessions, accession_limit):
302227
results.append(["biosample_accession", ",".join(chunk), " OR ".join(chunk)])
303-
for chunk in chunks(experiment_accessions, accession_limit):
228+
for chunk in chunk_list(experiment_accessions, accession_limit):
304229
results.append(["experiment_accession", ",".join(chunk), " OR ".join(chunk)])
305-
for chunk in chunks(run_accessions, accession_limit):
230+
for chunk in chunk_list(run_accessions, accession_limit):
306231
results.append(["run_accession", ",".join(chunk), " OR ".join(chunk)])
307232

308233
return results
@@ -375,6 +300,11 @@ def parse_query(q, accession_limit, exact_taxon=False):
375300
show_default=True,
376301
help="Genome size to be used for all samples, and for calculating min coverage",
377302
)
303+
@click.option(
304+
"--use-ncbi-genome-size",
305+
is_flag=True,
306+
help="If available, use NCBI genome size for species",
307+
)
378308
@click.option(
379309
"--include-empty",
380310
is_flag=True,
@@ -395,6 +325,7 @@ def search(
395325
min_read_length,
396326
min_coverage,
397327
genome_size,
328+
use_ncbi_genome_size,
398329
include_empty,
399330
force,
400331
verbose,
@@ -419,23 +350,23 @@ def search(
419350
if min_coverage and genome_size:
420351
if min_base_count:
421352
logging.error(
422-
"--min_base_count cannot be used with --coverage/--genome_size. Exiting...",
353+
"--min-base-count cannot be used with --min-coverage/--genome-size. Exiting...",
423354
file=sys.stderr,
424355
)
425356
sys.exit(1)
426357
else:
427358
min_base_count = min_coverage * genome_size
428359
elif min_coverage or genome_size:
429360
logging.error(
430-
"--coverage and --genome_size must be used together. Exiting...",
361+
"--min-coverage and --genome-size must be used together. Exiting...",
431362
file=sys.stderr,
432363
)
433364
sys.exit(1)
434365

435366
if biosample_subset > 0:
436367
if not is_biosample(query):
437368
logging.error(
438-
"--biosample_subset requires a single BioSample. Input query: {query} is not a BioSample. Exiting...",
369+
"--biosample-subset requires a single BioSample. Input query: {query} is not a BioSample. Exiting...",
439370
file=sys.stderr,
440371
)
441372
sys.exit(1)
@@ -458,7 +389,7 @@ def search(
458389
accessions_file = f"{outdir}/{prefix}-accessions.txt".replace("//", "/")
459390
filtered_file = f"{outdir}/{prefix}-filtered.txt".replace("//", "/")
460391
summary_file = f"{outdir}/{prefix}-search.txt".replace("//", "/")
461-
genome_sizes = get_ncbi_genome_size()
392+
genome_sizes = get_ncbi_genome_size() if use_ncbi_genome_size else None
462393
for query_type, ena_query, sra_query in queries:
463394
logging.info(f"Submitting query (type - {query_type})")
464395
is_accession = True if query_type.endswith("accession") else False

bactopia/cli/update.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
11
import logging
2-
import os
32
import sys
43
import time
54
from pathlib import Path
@@ -12,7 +11,6 @@
1211
from rich.logging import RichHandler
1312

1413
import bactopia
15-
from bactopia.utils import execute, validate_file
1614

1715
# Set up Rich
1816
stderr = rich.console.Console(stderr=True)

0 commit comments

Comments (0)