11#!/usr/bin/env python3
22
3- ## Last update: 11/6 /2018
4- ## Author: T.F. Jesus
5- ## This script runs MASH in plasmid databases making a pairwise diagonal matrix
6- ## for each pairwise comparison between libraries
7- ## Note: each header in fasta is considered a reference
3+ # Last update: 14/8 /2018
4+ # Author: T.F. Jesus
5+ # This script runs MASH in plasmid databases making a pairwise diagonal matrix
6+ # for each pairwise comparison between libraries
7+ # Note: each header in fasta is considered a reference
88
99import argparse
1010import sys
2121try :
2222 from utils .hist_util import plot_histogram
2323 from utils .taxa_fetch import executor
24+ from utils .crowd_curation import black_list
2425 from db_manager .db_app import db , models
2526except ImportError :
2627 from patlas .utils .hist_util import plot_histogram
2728 from patlas .utils .taxa_fetch import executor
29+ from patlas .utils .crowd_curation import black_list
2830 from patlas .db_manager .db_app import db , models
2931
30- # This is a rather sketchy solution TODO remove this with a refactor of node_crawler
32+ # TODO This is a rather sketchy solution, remove this with a refactor of node_crawler
3133sys .setrecursionlimit (10000 )
3234
33-
3435class Record :
3536
3637 def __init__ (self , accession , size , distance , percentage_hashes ):
@@ -103,7 +104,6 @@ def output_tree(infile, tag):
103104
104105 """
105106
106-
107107 mother_directory = os .path .join (os .path .dirname (os .path .abspath (infile )),
108108 tag )
109109 dirs = ["" , "tmp" , "results" , "reference_sketch" , "genome_sketchs" ,
@@ -233,6 +233,10 @@ def master_fasta(fastas, output_tag, mother_directory):
233233
234234 species_output = open (species_out , "w" )
235235
236+ # creates a list of accession numbers, listing all accessions in
237+ # input seuqences
238+ all_accessions = []
239+
236240 # sets first length instance
237241 length = 0
238242 accession = False
@@ -306,18 +310,21 @@ def master_fasta(fastas, output_tag, mother_directory):
306310 plasmid_name = search_substing (line )
307311 # species related functions
308312 all_species .append (" " .join (species .split ("_" )))
313+ # append accession that will be outputed to file
314+ all_accessions .append (accession )
309315
310316 # added this if statement to check whether CDS is present in
311317 # fasta header, since database contain them with CDS in string
312318 if "cds" in line .lower () and line .lower ().count ("cds" ) <= 1 \
313319 and "plasmid" not in line .lower ():
314320 truePlasmid = False
315321 reason = "cds"
316- #continue
317322 elif "origin" in line .lower ():
318323 truePlasmid = False
319324 reason = "origin"
320- #continue
325+ elif accession in black_list :
326+ truePlasmid = False
327+ reason = black_list [accession ]
321328 else :
322329 truePlasmid = True
323330
@@ -362,6 +369,14 @@ def master_fasta(fastas, output_tag, mother_directory):
362369 # writes a species list to output file
363370 species_output .write ("\n " .join (str (i ) for i in list (set (all_species ))))
364371 species_output .close ()
372+
373+ # write accessions to a file
374+ accession_out = os .path .join (mother_directory ,
375+ "accessions_list_{}.lst" .format (output_tag ))
376+ with open (accession_out , "w" ) as fh :
377+ fh .write ("version: 1.5.2\n " )
378+ fh .write ("\n " .join (all_accessions ))
379+
365380 return out_file , sequence_info , all_species
366381
367382
@@ -873,7 +888,8 @@ def main():
873888
874889 mash_options = parser .add_argument_group ("MASH related options" )
875890 mash_options .add_argument ("-k" , "--kmers" , dest = "kmer_size" , default = "21" ,
876- help = "Provide the number of k-mers to be provided to mash "
891+ help = "Provide the number of k-mers to be provided"
892+ " to mash "
877893 "sketch. Default: 21." )
878894 mash_options .add_argument ("-p" , "--pvalue" , dest = "pvalue" ,
879895 default = "0.05" , help = "Provide the p-value to "
@@ -888,8 +904,8 @@ def main():
888904 other_options = parser .add_argument_group ("Other options" )
889905 other_options .add_argument ("-rm" , "--remove" , dest = "remove" ,
890906 action = "store_true" , help = "Remove any temporary "
891- "files and folders not "
892- "needed (not present "
907+ "files and folders not"
908+ " needed (not present "
893909 "in results "
894910 "subdirectory)." )
895911 other_options .add_argument ("-hist" , "--histograms" , dest = "histograms" ,
@@ -911,8 +927,8 @@ def main():
911927 help = "this option allows to only run the part "
912928 "of the script that is required to "
913929 "generate the filtered fasta. Allowing for "
914- "instance to debug sequences that shoudn't "
915- "be removed using 'cds' and 'origin' "
930+ "instance to debug sequences that shouldn't "
931+ " be removed using 'cds' and 'origin' "
916932 "keywords" )
917933
918934 args = parser .parse_args ()
@@ -929,23 +945,22 @@ def main():
929945 names_file = args .names_file
930946 nodes_file = args .nodes_file
931947
932- ## lists all fastas given to argparser
948+ # lists all fastas given to argparser
933949 fastas = [f for f in args .inputfile if f .endswith ((".fas" , ".fasta" ,
934950 ".fna" , ".fsa" , ".fa" ))]
935951
936- ## creates output directory tree
937- output_tag = args .output_tag .replace ("/" , "" ) ## if the user gives and
952+ # creates output directory tree
953+ output_tag = args .output_tag .replace ("/" , "" ) # if the user gives and
938954 # input tag that is already a folder
939955 mother_directory = output_tree (fastas [0 ], output_tag )
940956
941- ## checks if multiple fastas are provided or not avoiding master_fasta
957+ # checks if multiple fastas are provided or not avoiding master_fasta
942958 # function
943959 print ("***********************************" )
944960 print ("Creating main database...\n " )
945961 main_fasta , sequence_info , all_species = master_fasta (fastas , output_tag ,
946962 mother_directory )
947963
948-
949964 # if the parameter sequences_to_remove is provided the script will only
950965 # generate the fasta files and a list of the sequences that were removed
951966 # from ncbi refseq original fasta.
@@ -954,11 +969,11 @@ def main():
954969 "Leaving script..." )
955970 sys .exit (0 )
956971
957- #########################
958- ### genera block here ## #
959- #########################
972+ #####################
973+ # genera block here #
974+ #####################
960975
961- ## runs mash related functions
976+ # runs mash related functions
962977 print ("***********************************" )
963978 print ("Sketching reference...\n " )
964979 ref_sketch = sketch_references (main_fasta , output_tag , threads , kmer_size ,
@@ -969,7 +984,7 @@ def main():
969984 print ("Making temporary files for each genome in fasta...\n " )
970985 genomes = genomes_parser (main_fasta , mother_directory )
971986
972- ## This must be multiprocessed since it is extremely fast to do mash
987+ # This must be multiprocessed since it is extremely fast to do mash
973988 # against one plasmid sequence
974989 print ("***********************************" )
975990 print ("Sketching genomes and running mash distances...\n " )
@@ -979,7 +994,7 @@ def main():
979994 output_tag , kmer_size , mother_directory ),
980995 genomes ) # process genomes iterable with pool
981996
982- ## loop to print a nice progress bar
997+ # loop to print a nice progress bar
983998 try :
984999 for _ in tqdm .tqdm (mp , total = len (genomes )):
9851000 pass
@@ -991,7 +1006,7 @@ def main():
9911006 # remaining options are triggered
9921007 print ("\n Finished MASH... uf uf uf!" )
9931008
994- ## Makes distances matrix csv file
1009+ # Makes distances matrix csv file
9951010 print ("\n ***********************************" )
9961011 print ("Creating distance matrix...\n " )
9971012 lists_traces = mash_distance_matrix (mother_directory , sequence_info ,
0 commit comments