 from rich.logging import RichHandler

 import bactopia
-from bactopia.ncbi import is_biosample
-from bactopia.utils import get_ncbi_genome_size
+from bactopia.databases.ena import get_run_info
+from bactopia.databases.ncbi import get_ncbi_genome_size, is_biosample
+from bactopia.utils import chunk_list

 # Set up Rich
 stderr = rich.console.Console(stderr=True)
4546 "name" : "Additional Options" ,
4647 "options" : [
4748 "--genome-size" ,
49+ "--use-ncbi-genome-size" ,
4850 "--outdir" ,
4951 "--prefix" ,
5052 "--force" ,
5658 },
5759 ]
5860}
-ENA_URL = "https://www.ebi.ac.uk/ena/portal/api/search"
-
-
-def get_ena_metadata(query: str, is_accession: bool, limit: int):
-    """Fetch metadata from ENA.
-    https://docs.google.com/document/d/1CwoY84MuZ3SdKYocqssumghBF88PWxUZ/edit#heading=h.ag0eqy2wfin5
-
-    Args:
-        query (str): The query to search for.
-        is_accession (bool): If the query is an accession or not.
-        limit (int): The maximum number of records to return.
-
-    Returns:
-        list: Records associated with the accession.
-    """
-    data = {
-        "dataPortal": "ena",
-        "dccDataOnly": "false",
-        "download": "false",
-        "result": "read_run",
-        "format": "tsv",
-        "limit": limit,
-        "fields": "all",
-    }
-
-    if is_accession:
-        data["includeAccessions"] = query
-    else:
-        data["query"] = (
-            f'"{query} AND library_source=GENOMIC AND '
-            "(library_strategy=OTHER OR library_strategy=WGS OR "
-            "library_strategy=WGA) AND (library_selection=MNase OR "
-            "library_selection=RANDOM OR library_selection=unspecified OR "
-            'library_selection="size fractionation")"'
-        )
-
-    headers = {"accept": "*/*", "Content-type": "application/x-www-form-urlencoded"}
-
-    r = requests.post(ENA_URL, headers=headers, data=data)
-    if r.status_code == requests.codes.ok:
-        data = []
-        col_names = None
-        for line in r.text.split("\n"):
-            cols = line.split("\t")
-            if line:
-                if col_names:
-                    data.append(dict(zip(col_names, cols)))
-                else:
-                    col_names = cols
-        return [True, data]
-    else:
-        return [False, [r.status_code, r.text]]
-
-
-def get_run_info(
-    sra_query: str, ena_query: str, is_accession: bool, limit: int = 1000000
-) -> tuple:
-    """Retrieve a list of samples available from ENA.
-
-    The first attempt will be against ENA, and if that fails, SRA will be queried. This should
-    capture those samples not yet synced between ENA and SRA.
-
-    Args:
-        sra_query (str): A formatted query for SRA searches.
-        ena_query (str): A formatted query for ENA searches.
-        is_accession (bool): If the query is an accession or not.
-        limit (int): The maximum number of records to return.
-
-    Returns:
-        tuple: Records associated with the accession.
-    """
-
-    logging.debug("Querying ENA for metadata...")
-    success, ena_data = get_ena_metadata(ena_query, is_accession, limit=limit)
-    if success:
-        return success, ena_data
-    else:
-        logging.error("There was an issue querying ENA, exiting...")
-        logging.error(f"STATUS: {ena_data[0]}")
-        logging.error(f"TEXT: {ena_data[1]}")
-        sys.exit(1)


 def parse_accessions(
@@ -209,13 +130,26 @@ def parse_accessions(

         # Genome size
         gsize = genome_size
-        if not gsize:
-            if result["tax_id"] in genome_sizes:
-                gsize = genome_sizes[result["tax_id"]]["expected_ungapped_length"]
+        if not gsize and genome_sizes:
+            if result["tax_id"]:
+                if result["tax_id"] in genome_sizes:
+                    if "expected_ungapped_length" in genome_sizes[result["tax_id"]]:
+                        gsize = genome_sizes[result["tax_id"]][
+                            "expected_ungapped_length"
+                        ]
+                    else:
+                        logging.warning(
+                            f"Could not find genome size for {result['scientific_name']} (Tax ID {result['tax_id']})"
+                        )
+                else:
+                    logging.warning(
+                        f"Could not find genome size for {result['scientific_name']} (Tax ID {result['tax_id']})"
+                    )
             else:
                 logging.warning(
-                    f"Could not find genome size for {result['scientific_name']} (Tax ID {result['tax_id']})"
+                    f"Accession ({result['experiment_accession']}) does not have a tax_id associated with it."
                 )
+                result["scientific_name"] = "UNKNOWN_SPECIES"

         if passes:
             accessions.append(
@@ -234,15 +168,6 @@ def parse_accessions(
     return [list(set(accessions)), filtered]


-def chunks(chunk: list, total: int) -> list:
-    """
-    Yield successive n-sized chunks from l.
-    https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks?page=1&tab=votes#tab-top
-    """
-    for i in range(0, len(chunk), total):
-        yield chunk[i : i + total]
-
-
 def parse_query(q, accession_limit, exact_taxon=False):
     """Return the query based on if Taxon ID or BioProject/Study accession."""
     import re
@@ -296,13 +221,13 @@ def parse_query(q, accession_limit, exact_taxon=False):
            results.append(["taxon_name", f'tax_name("{query}")', f"'{query}'"])

     # Split the accessions into set number
-    for chunk in chunks(bioproject_accessions, accession_limit):
+    for chunk in chunk_list(bioproject_accessions, accession_limit):
         results.append(["bioproject_accession", ",".join(chunk), " OR ".join(chunk)])
-    for chunk in chunks(biosample_accessions, accession_limit):
+    for chunk in chunk_list(biosample_accessions, accession_limit):
         results.append(["biosample_accession", ",".join(chunk), " OR ".join(chunk)])
-    for chunk in chunks(experiment_accessions, accession_limit):
+    for chunk in chunk_list(experiment_accessions, accession_limit):
         results.append(["experiment_accession", ",".join(chunk), " OR ".join(chunk)])
-    for chunk in chunks(run_accessions, accession_limit):
+    for chunk in chunk_list(run_accessions, accession_limit):
         results.append(["run_accession", ",".join(chunk), " OR ".join(chunk)])

     return results
@@ -375,6 +300,11 @@ def parse_query(q, accession_limit, exact_taxon=False):
     show_default=True,
     help="Genome size to be used for all samples, and for calculating min coverage",
 )
+@click.option(
+    "--use-ncbi-genome-size",
+    is_flag=True,
+    help="If available, use NCBI genome size for species",
+)
 @click.option(
     "--include-empty",
     is_flag=True,
@@ -395,6 +325,7 @@ def search(
     min_read_length,
     min_coverage,
     genome_size,
+    use_ncbi_genome_size,
     include_empty,
     force,
     verbose,
@@ -419,23 +350,23 @@ def search(
     if min_coverage and genome_size:
         if min_base_count:
             logging.error(
-                "--min_base_count cannot be used with --coverage/--genome_size. Exiting...",
+                "--min-base-count cannot be used with --min-coverage/--genome-size. Exiting...",
                 file=sys.stderr,
             )
             sys.exit(1)
         else:
             min_base_count = min_coverage * genome_size
     elif min_coverage or genome_size:
         logging.error(
-            "--coverage and --genome_size must be used together. Exiting...",
+            "--min-coverage and --genome-size must be used together. Exiting...",
             file=sys.stderr,
         )
         sys.exit(1)

     if biosample_subset > 0:
         if not is_biosample(query):
             logging.error(
-                "--biosample_subset requires a single BioSample. Input query: {query} is not a BioSample. Exiting...",
+                "--biosample-subset requires a single BioSample. Input query: {query} is not a BioSample. Exiting...",
                 file=sys.stderr,
             )
             sys.exit(1)
@@ -458,7 +389,7 @@ def search(
     accessions_file = f"{outdir}/{prefix}-accessions.txt".replace("//", "/")
     filtered_file = f"{outdir}/{prefix}-filtered.txt".replace("//", "/")
     summary_file = f"{outdir}/{prefix}-search.txt".replace("//", "/")
-    genome_sizes = get_ncbi_genome_size()
+    genome_sizes = get_ncbi_genome_size() if use_ncbi_genome_size else None
     for query_type, ena_query, sra_query in queries:
         logging.info(f"Submitting query (type - {query_type})")
         is_accession = True if query_type.endswith("accession") else False
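
For context, a minimal usage sketch of the relocated helpers imported at the top of the diff. It assumes the versions in bactopia.databases.ena and bactopia.utils keep the signatures of the removed local functions (get_run_info(sra_query, ena_query, is_accession, limit) returning a (success, records) pair, and chunk_list behaving like the removed chunks() generator); the commit itself does not show those modules, so treat this as an unverified sketch rather than the library's documented API.

# Usage sketch, not part of the commit above. Assumes the relocated helpers
# keep the signatures of the removed local versions shown in the diff.
from bactopia.databases.ena import get_run_info
from bactopia.utils import chunk_list

run_accessions = ["SRR000001", "SRR000002", "SRR000003"]
for chunk in chunk_list(run_accessions, 2):
    # ENA takes a comma-separated accession list; SRA joins terms with " OR ",
    # mirroring how parse_query() builds its queries in the diff
    ena_query = ",".join(chunk)
    sra_query = " OR ".join(chunk)
    success, records = get_run_info(sra_query, ena_query, True, limit=1000000)
    if success:
        print(f"{ena_query}: {len(records)} run record(s)")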