1717
1818from Bio import SeqIO
1919
20+ from autometa .taxonomy .database import TaxonomyDatabase
2021from autometa .taxonomy .ncbi import NCBI
22+ from autometa .taxonomy .gtdb import GTDB
2123from autometa .taxonomy import majority_vote
2224from autometa .common .markers import load as load_markers
2325
@@ -226,16 +228,16 @@ def get_metabin_stats(
226228
227229
228230def get_metabin_taxonomies (
229- bin_df : pd .DataFrame , ncbi : NCBI , cluster_col : str = "cluster"
231+ bin_df : pd .DataFrame , taxa_db : TaxonomyDatabase , cluster_col : str = "cluster"
230232) -> pd .DataFrame :
231233 """Retrieve taxonomies of all clusters recovered from Autometa binning.
232234
233235 Parameters
234236 ----------
235237 bin_df : pd.DataFrame
236238 Autometa binning table. index=contig, cols=['cluster','length','taxid', *canonical_ranks]
237- ncbi : autometa.taxonomy.ncbi.NCBI instance
238- Autometa NCBI class instance
239+ taxa_db : autometa.taxonomy.database.TaxonomyDatabase instance
240+ Autometa NCBI or GTDB class instance
239241 cluster_col : str, optional
240242 Clustering column by which to group metabins
241243
@@ -246,7 +248,9 @@ def get_metabin_taxonomies(
246248 Indexed by cluster
247249 """
248250 logger .info (f"Retrieving metabin taxonomies for { cluster_col } " )
249- canonical_ranks = [rank for rank in NCBI .CANONICAL_RANKS if rank != "root" ]
251+ canonical_ranks = [
252+ rank for rank in TaxonomyDatabase .CANONICAL_RANKS if rank != "root"
253+ ]
250254 is_clustered = bin_df [cluster_col ].notnull ()
251255 bin_df = bin_df [is_clustered ]
252256 outcols = [cluster_col , "length" , "taxid" , * canonical_ranks ]
@@ -277,11 +281,13 @@ def get_metabin_taxonomies(
277281 taxonomies [cluster ][canonical_rank ].update ({taxid : length })
278282 else :
279283 taxonomies [cluster ][canonical_rank ][taxid ] += length
280- cluster_taxonomies = majority_vote .rank_taxids (taxonomies , ncbi )
284+ cluster_taxonomies = majority_vote .rank_taxids (taxonomies , taxa_db = taxa_db )
281285 # With our cluster taxonomies, let's place these into a dataframe for easy data accession
282286 cluster_taxa_df = pd .Series (data = cluster_taxonomies , name = "taxid" ).to_frame ()
283287 # With the list of taxids, we'll retrieve their complete canonical-rank information
284- lineage_df = ncbi .get_lineage_dataframe (cluster_taxa_df .taxid .tolist (), fillna = True )
288+ lineage_df = taxa_db .get_lineage_dataframe (
289+ cluster_taxa_df .taxid .tolist (), fillna = True
290+ )
285291 # Now put it all together
286292 cluster_taxa_df = pd .merge (
287293 cluster_taxa_df , lineage_df , how = "left" , left_on = "taxid" , right_index = True
@@ -323,11 +329,18 @@ def main():
323329 required = True ,
324330 )
325331 parser .add_argument (
326- "--ncbi " ,
327- help = "Path to user NCBI databases directory (Required for retrieving metabin taxonomies)" ,
332+ "--dbdir " ,
333+ help = "Path to user taxonomy database directory (Required for retrieving metabin taxonomies)" ,
328334 metavar = "dirpath" ,
329335 required = False ,
330336 )
337+ parser .add_argument (
338+ "--dbtype" ,
339+ help = "Taxonomy database type to use (NOTE: must correspond to the same database type used during contig taxon assignment.)" ,
340+ choices = ["ncbi" , "gtdb" ],
341+ required = False ,
342+ default = "ncbi" ,
343+ )
331344 parser .add_argument (
332345 "--binning-column" ,
333346 help = "Binning column to use for grouping metabins" ,
@@ -377,14 +390,17 @@ def main():
377390 logger .info (f"Wrote metabin stats to { args .output_stats } " )
378391 # Finally, if taxonomy information is available, write out each metabin's taxonomy using the modified majority voting method.
379392 if "taxid" in bin_df .columns :
380- if not args .ncbi :
393+ if not args .dbdir :
381394 logger .warn (
382- "taxid found in dataframe. --ncbi argument is required to retrieve metabin taxonomies. Skipping..."
395+ "taxid found in dataframe. --dbdir argument is required to retrieve metabin taxonomies. Skipping..."
383396 )
384397 else :
385- ncbi = NCBI (dirpath = args .ncbi )
398+ if args .dbtype == "ncbi" :
399+ taxa_db = NCBI (dbdir = args .dbdir )
400+ elif args .dbtype == "gtdb" :
401+ taxa_db = GTDB (dbdir = args .dbdir )
386402 taxa_df = get_metabin_taxonomies (
387- bin_df = bin_df , ncbi = ncbi , cluster_col = args .binning_column
403+ bin_df = bin_df , taxa_db = taxa_db , cluster_col = args .binning_column
388404 )
389405 taxa_df .to_csv (args .output_taxonomy , sep = "\t " , index = True , header = True )
390406
0 commit comments