11from Bio import SeqIO
2- from Bio .SeqUtils import GC
2+ from Bio .SeqUtils import gc_fraction
33from mob_suite .blast import BlastRunner
44from mob_suite .blast import BlastReader
55import os , re , time
1111from ete3 import NCBITaxa
1212from mob_suite .constants import \
1313 MOB_TYPER_REPORT_HEADER , \
14- MGE_INFO_HEADER , \
15- ETE3DBTAXAFILE
14+ MGE_INFO_HEADER
1615
1716
1817def getAssocValues (query_list_values , look_up_key , value_key , data ):
@@ -65,30 +64,6 @@ def parseMash(mash_results):
6564 return hits
6665
6766
68- def getHeirarchy (taxid ):
69- if not isETE3DBTAXAFILEexists (ETE3DBTAXAFILE ):
70- logging .info ("Did not find taxa.sqlite in {}. Initializaing ete3 taxonomy database" .format (ETE3DBTAXAFILE ))
71- initETE3Database ()
72-
73- ncbi = NCBITaxa (dbfile = ETE3DBTAXAFILE )
74- if not isETE3DBTAXAFILEexists (ETE3DBTAXAFILE ):
75- logging .error (
76- "Tried ete3 init, but still was not able to find taxa.sqlite file for ete3 lib in {}. Aborting" .format (
77- ETE3DBTAXAFILE ))
78- return ['-' , '-' ]
79-
80- if not isinstance (taxid , int ):
81- return {'names' : [], 'ranks' : []}
82-
83- lineage = ncbi .get_lineage (taxid )
84- names = ncbi .get_taxid_translator (lineage )
85- ranks = []
86- for id in lineage :
87- ranks .append (ncbi .get_rank (id ))
88-
89- return {'names' : names , 'ranks' : names }
90-
91-
9267def filter_invalid_taxids (taxids ):
9368 filtered = []
9469 for i in taxids :
@@ -104,7 +79,7 @@ def filter_invalid_taxids(taxids):
10479 return filtered
10580
10681
107- def getHeirarchy (taxid ):
82+ def getHeirarchy (taxid , ETE3DBTAXAFILE ):
10883 if not isETE3DBTAXAFILEexists (ETE3DBTAXAFILE ):
10984 logging .info ("Did not find taxa.sqlite in {}. Initializaing ete3 taxonomy database" .format (ETE3DBTAXAFILE ))
11085 initETE3Database ()
@@ -128,7 +103,7 @@ def getHeirarchy(taxid):
128103 return {'names' : names , 'ranks' : names }
129104
130105
131- def getTaxid (taxon ):
106+ def getTaxid (taxon , ETE3DBTAXAFILE ):
132107 if not isETE3DBTAXAFILEexists (ETE3DBTAXAFILE ):
133108 logging .info ("Did not find taxa.sqlite in {}. Initializaing ete3 taxonomy database" .format (ETE3DBTAXAFILE ))
134109 initETE3Database ()
@@ -146,7 +121,7 @@ def getTaxid(taxon):
146121
147122
148123
149- def NamesToTaxIDs (names ):
124+ def NamesToTaxIDs (names , ETE3DBTAXAFILE ):
150125 if not isETE3DBTAXAFILEexists (ETE3DBTAXAFILE ):
151126 logging .info ("Did not find taxa.sqlite in {}. Initializaing ete3 taxonomy database" .format (ETE3DBTAXAFILE ))
152127 initETE3Database (ETE3DBTAXAFILE )
@@ -163,7 +138,7 @@ def NamesToTaxIDs(names):
163138
164139
165140
166- def getTaxonConvergence (taxids ):
141+ def getTaxonConvergence (taxids , ETE3DBTAXAFILE ):
167142 if not isETE3DBTAXAFILEexists (ETE3DBTAXAFILE ):
168143 logging .info ("Did not find taxa.sqlite in {}. Initializaing ete3 taxonomy database" .format (ETE3DBTAXAFILE ))
169144 initETE3Database (ETE3DBTAXAFILE )
@@ -231,7 +206,7 @@ def getTaxonConvergence(taxids):
231206 return (['-' , '-' ])
232207
233208
234- def hostrange (replion_types , relaxase_types , mob_cluster_id , ncbi , lit ):
209+ def hostrange (replion_types , relaxase_types , mob_cluster_id , ncbi , lit , ETE3DBTAXAFILE ):
235210 host_range_predictions = {
236211 'observed_host_range_ncbi_name' : '' ,
237212 'observed_host_range_ncbi_rank' : '' ,
@@ -276,25 +251,25 @@ def hostrange(replion_types, relaxase_types, mob_cluster_id, ncbi, lit):
276251 ncbi_unique_taxids = filter_invalid_taxids (
277252 list (set (ncbi_replicon_taxids + ncbi_cluster_taxids + ncbi_relaxase_taxids )))
278253 host_range_predictions ['observed_host_range_ncbi_rank' ], host_range_predictions [
279- 'observed_host_range_ncbi_name' ] = getTaxonConvergence (ncbi_unique_taxids )
254+ 'observed_host_range_ncbi_name' ] = getTaxonConvergence (ncbi_unique_taxids , ETE3DBTAXAFILE )
280255
281256 # Determine taxids associated with literature
282257
283258 lit_unique_taxids = filter_invalid_taxids (list (set (lit_replicon_taxids )))
284259
285260 host_range_predictions ['reported_host_range_lit_rank' ], host_range_predictions [
286- 'reported_host_range_lit_name' ] = getTaxonConvergence (lit_unique_taxids )
261+ 'reported_host_range_lit_name' ] = getTaxonConvergence (lit_unique_taxids , ETE3DBTAXAFILE )
287262
288263 # determine overall host range
289264 overall_taxids = filter_invalid_taxids (list (set (ncbi_unique_taxids + lit_unique_taxids )))
290265 host_range_predictions ['predicted_host_range_overall_rank' ], host_range_predictions [
291- 'predicted_host_range_overall_name' ] = getTaxonConvergence (overall_taxids )
266+ 'predicted_host_range_overall_name' ] = getTaxonConvergence (overall_taxids , ETE3DBTAXAFILE )
292267
293268 # move host-range prediction up to family when it is at genus or species level
294269 if host_range_predictions ['predicted_host_range_overall_rank' ] == 'genus' or host_range_predictions [
295270 'predicted_host_range_overall_rank' ] == 'species' :
296- taxid = getTaxid (host_range_predictions ['predicted_host_range_overall_name' ])
297- heir = getHeirarchy (taxid )
271+ taxid = getTaxid (host_range_predictions ['predicted_host_range_overall_name' ], ETE3DBTAXAFILE )
272+ heir = getHeirarchy (taxid , ETE3DBTAXAFILE )
298273 names = heir ['names' ]
299274 ranks = heir ['ranks' ]
300275
@@ -869,7 +844,7 @@ def calcFastaStatsIndividual(fasta):
869844 id = record .id
870845 seq = record .seq
871846 genome_size = len (seq )
872- gc = GC (seq )
847+ gc = gc_fraction (seq )
873848 md5 = calc_md5 (seq )
874849 stats [id ] = {
875850 'total_length' : genome_size ,
@@ -892,7 +867,7 @@ def calcFastaStats(fasta):
892867 num_seqs += 1
893868 seq = seq + record .seq
894869 genome_size = len (seq )
895- gc = GC (seq )
870+ gc = gc_fraction (seq )
896871 md5 = calc_md5 (seq )
897872
898873 return {
@@ -974,7 +949,7 @@ def determine_mpf_type(hits):
974949 return max (types , key = lambda i : types [i ])
975950
976951
977- def build_mobtyper_report (plasmid_contig_info , out_dir , outfile , seq_dict , ncbi , lit ):
952+ def build_mobtyper_report (plasmid_contig_info , out_dir , outfile , seq_dict , ncbi , lit , ETE3DBTAXAFILE ):
978953 mob_typer_results = {}
979954 for clust_id in plasmid_contig_info :
980955
@@ -1008,7 +983,7 @@ def build_mobtyper_report(plasmid_contig_info, out_dir, outfile, seq_dict, ncbi,
1008983 cluster_seq = sorted (cluster_seq ,key = len )
1009984 seq = "" .join (cluster_seq )
1010985 mob_typer_results [clust_id ]['md5' ] = [calc_md5 (seq )]
1011- mob_typer_results [clust_id ]['gc' ] = [GC (seq )]
986+ mob_typer_results [clust_id ]['gc' ] = [gc_fraction (seq )]
1012987 mob_typer_results [clust_id ]['size' ] = [len (seq )]
1013988 mob_typer_results [clust_id ]['num_contigs' ] = len (cluster_seq )
1014989
@@ -1089,7 +1064,7 @@ def build_mobtyper_report(plasmid_contig_info, out_dir, outfile, seq_dict, ncbi,
10891064 else :
10901065 mob_cluster_id = '-'
10911066
1092- host_range = hostrange (rep_types , relaxase_types , mob_cluster_id , ncbi , lit )
1067+ host_range = hostrange (rep_types , relaxase_types , mob_cluster_id , ncbi , lit , ETE3DBTAXAFILE )
10931068
10941069 for field in host_range :
10951070 mob_typer_results [clust_id ][field ] = host_range [field ]
0 commit comments