44GREEN=' \033[0;32m'
55RED=' \033[0;31m'
66NC=' \033[0m'
7- VERSION=" v1.1.13 "
7+ VERSION=" v1.2.1 "
88
99
1010if [ " $1 " == " --version" ] || [ " $1 " == " -v" ]; then
@@ -63,6 +63,9 @@ if [ "$#" == 0 ] || [ $1 == "-h" ] || [ $1 == "help" ]; then
6363 printf " the labels if using TaxonKit (-t flag specified). E.g., all would be\n"
6464 printf " \" -L Domain,Phylum,Class,Order,Family,Genus,Species,Strain\" \n\n"
6565
66+ printf " - [-N] default: false\n"
67+ printf " No tree. Generate alignment only.\n\n"
68+
6669 printf " - [-T <str>] default: FastTree\n"
6770 printf " Which program to use for tree generation. Currently supported are\n"
6871 printf " \" FastTree\" and \" IQ-TREE\" . As of now, these run with default settings\n"
@@ -149,9 +152,10 @@ taxonkit_id_swap='false'
149152debug_flag=' false'
150153lineage_spec=" Domain,Phylum,Class,Species,Strain"
151154best_hit_mode=' false'
155+ align_only=' false'
152156tree_program=' FastTree'
153157
154- while getopts :a:g:f:A:H:o:m:tL:T :c:G:Bdn:j: args
158+ while getopts :a:g:f:A:H:o:m:tL:NT :c:G:Bdn:j: args
155159do
156160 case " ${args} "
157161 in
164168 m) file_to_genome_id_map=${OPTARG} ;;
165169 t) taxonkit_id_swap=' true' ;;
166170 L) lineage_spec=${OPTARG} ;;
171+ N) align_only=' true' ;;
167172 T) tree_program=${OPTARG} ;;
168173 c) len_cutoff=${OPTARG} ;;
169174 G) gen_cutoff=${OPTARG} ;;
488493# ############################################################################
489494# ############# EXPLICITLY STATING IF DEFAULT BEHAVIOR CHANGED #############
490495# ############################################################################
491- if [ $output_dir != " GToTree_output" ] || [ " $file_to_genome_id_map " != " " ] || [ $taxonkit_id_swap != " false" ] || [ $len_cutoff != " 0.2" ] || [ $gen_cutoff != " 0.75" ] || [ $debug_flag == " true" ] || [ $best_hit_mode == " true" ] || [ $num_jobs != " 1" ] || [ $num_cpus != 2 ] || [ $lineage_spec != " Domain,Phylum,Class,Species,Strain" ] || [ $tree_program != " FastTree" ]; then
496+ if [ $output_dir != " GToTree_output" ] || [ " $file_to_genome_id_map " != " " ] || [ $taxonkit_id_swap != " false" ] || [ $len_cutoff != " 0.2" ] || [ $gen_cutoff != " 0.75" ] || [ $debug_flag == " true" ] || [ $best_hit_mode == " true" ] || [ $num_jobs != " 1" ] || [ $num_cpus != 2 ] || [ $lineage_spec != " Domain,Phylum,Class,Species,Strain" ] || [ $tree_program != " FastTree" ] || [ $align_only == ' true ' ] ; then
492497
493498 if [ " $file_to_genome_id_map " != " " ]; then
494499 if [ ! -s $file_to_genome_id_map ]; then
@@ -521,8 +526,16 @@ if [ $output_dir != "GToTree_output" ] || [ "$file_to_genome_id_map" != "" ] ||
521526 printf " - Gene-length filtering cutoff threshold (\" -c\" ) has been set to $len_cutoff .\n" | tee -a $gtotree_log
522527 fi
523528
524- if [ $tree_program != " FastTree" ]; then
525- printf " - Tree generation program (\" -T\" ) has been set to $tree_program .\n" | tee -a $gtotree_log
529+ if [ $align_only == ' true' ]; then
530+ printf " - Only generating alignment, no tree, as \" -N\" option has been provided.\n" | tee -a $gtotree_log
531+ fi
532+
533+ if [ $align_only == ' false' ]; then
534+
535+ if [ $tree_program != " FastTree" ]; then
536+ printf " - Tree generation program (\" -T\" ) has been set to $tree_program .\n" | tee -a $gtotree_log
537+ fi
538+
526539 fi
527540
528541 if [ $gen_cutoff != " 0.5" ]; then
@@ -1579,6 +1592,42 @@ grep ">" ${tmp_dir}/cat.tmp | cut -f1 > ${tmp_dir}/headers.tmp
15791592grep -v " >" ${tmp_dir} /cat.tmp | sed -e $' s/\t /XXXXX/g' > ${tmp_dir} /seqs.tmp
15801593paste -d " \n" ${tmp_dir} /headers.tmp ${tmp_dir} /seqs.tmp > ${output_dir} /Aligned_SCGs.faa
15811594
# Creating a partitions file of per-gene coordinates within the concatenated
# alignment, for those who might want it for mixed-model treeing with something
# like IQ-TREE.
#
# Arguments:
#   $1 - directory holding the per-gene "<gene>_all_aligned.faa" files
#   $2 - path of the partitions file to write (overwritten)
# Output file format, one line per gene: "AA, <gene> = <start>-<stop>"
#
# Each gene's width is the length of the first sequence in its alignment file
# (line 2, so this relies on each sequence being written on a single line).
# The concatenated alignment is written with a 5-residue "XXXXX" spacer in
# place of each tab, so every gene after the first starts 6 columns past the
# previous gene's last column.
# NOTE(review): genes are visited in glob (sorted) order, the same order the
# previous `ls`-based loop used; this presumably matches the order in which the
# genes were concatenated -- confirm against the code that builds cat.tmp.
write_partitions_file() {

    local aln_dir=$1
    local out_file=$2
    local spacer_len=5
    local curr_start=1
    local curr_stop gene_file gene_base gene first_seq len_tmp

    for gene_file in "${aln_dir}"/*_all_aligned.faa; do

        # an unmatched glob is left as the literal pattern; skip it
        [ -e "$gene_file" ] || continue

        gene_base=${gene_file##*/}
        gene=${gene_base%%_all_aligned.faa}

        # length of the first aligned sequence (newline not counted)
        first_seq=$(head -n 2 "$gene_file" | grep -v ">")
        len_tmp=${#first_seq}

        curr_stop=$(( curr_start + len_tmp - 1 ))

        printf "AA, %s = %s-%s\n" "$gene" "$curr_start" "$curr_stop"

        # step over the spacer to the first column of the next gene
        curr_start=$(( curr_stop + spacer_len + 1 ))

    done > "$out_file"
}

write_partitions_file "${tmp_dir}" "${output_dir}/Partitions.txt"
1630+
15821631# storing genomes that made it through workflow to report at end
15831632genomes_retained=$( wc -l ${tmp_dir} /final_genomes_from_all_sources.tmp | sed " s/^ *//" | cut -d " " -f 1)
15841633
@@ -1938,47 +1987,51 @@ fi
19381987# ############################# MAKING TREE ###############################
19391988# ############################################################################
19401989
# Builds the tree from the concatenated alignment, unless alignment-only mode
# ("-N", align_only='true') was requested.
#
# Reads the script-level variables align_only, tree_program, output_dir,
# gtotree_log, start_time and num_jobs; sets curr_time and duration.
#
# The alignment with modified headers (Aligned_SCGs_mod_names.faa) is used when
# it exists and is non-empty, otherwise Aligned_SCGs.faa. The tree is always
# written next to the alignment it was built from, with the same base name and
# a ".tre" extension, so that it matches the tree file reported at the end of
# the run for both FastTree and IQ-TREE.
build_tree_from_alignment() {

    # alignment-only run: no tree requested
    if [ "$align_only" != 'false' ]; then
        return 0
    fi

    local aln_base tree_label

    if [ -s "${output_dir}/Aligned_SCGs_mod_names.faa" ]; then
        aln_base="Aligned_SCGs_mod_names"
    else
        aln_base="Aligned_SCGs"
    fi

    # any "-T" value other than FastTree is run as IQ-TREE
    if [ "$tree_program" == 'FastTree' ]; then
        tree_label="FastTree"
    else
        tree_label="IQ-TREE"
    fi

    printf "\n ############################################################################## \n" | tee -a "$gtotree_log"
    printf " #### Running %s ####\n" "$tree_label" | tee -a "$gtotree_log"
    printf " ############################################################################## \n\n" | tee -a "$gtotree_log"

    curr_time=$(date +"%I:%M %p")
    duration=$SECONDS

    printf "It is currently %s; the process started at %s.\n" "$curr_time" "$start_time" | tee -a "$gtotree_log"
    printf "Current process runtime: %s hours and %s minutes.\n\n" "$(( duration / 60 / 60 ))" "$(( (duration / 60) % 60 ))" | tee -a "$gtotree_log"

    if [ "$tree_label" == 'FastTree' ]; then

        # stdout is captured as the tree file, so there is nothing left on
        # stdout to pipe into the log
        FastTree "${output_dir}/${aln_base}.faa" > "${output_dir}/${aln_base}.tre"

    else

        iqtree -s "${output_dir}/${aln_base}.faa" -nt "$num_jobs" -mset WAG,LG -bb 1000 -pre iqtree_out

        # the "iqtree_out"-prefixed files land in the current directory;
        # gather them under the output directory (-p so a rerun does not fail)
        mkdir -p "${output_dir}/iqtree_out/"
        mv iqtree_out* "${output_dir}/iqtree_out/"
        cp "${output_dir}/iqtree_out/iqtree_out.treefile" "${output_dir}/${aln_base}.tre"

    fi
}

build_tree_from_alignment
19842037
@@ -2000,22 +2053,34 @@ fi
20002053
20012054# reporting primary output files
20022055
2003- printf " Full alignment written to file:\n" | tee -a $gtotree_log
2056+ if [ $align_only == ' false' ]; then
2057+
2058+ if [ -s ${output_dir} /Aligned_SCGs_mod_names.faa ]; then
2059+
2060+ printf " Tree written to:\n" | tee -a $gtotree_log
2061+ printf " ${GREEN}${output_dir} /Aligned_SCGs_mod_names.tre${NC} \n\n" | tee -a $gtotree_log
2062+ else
2063+ printf " Tree written to:\n" | tee -a $gtotree_log
2064+ printf " ${GREEN}${output_dir} /Aligned_SCGs.tre${NC} \n\n" | tee -a $gtotree_log
2065+ fi
2066+
2067+ fi
2068+
2069+ printf " Full alignment written to:\n" | tee -a $gtotree_log
20042070printf " ${GREEN}${output_dir} /Aligned_SCGs.faa${NC} \n\n" | tee -a $gtotree_log
2071+
20052072if [ -s ${output_dir} /Aligned_SCGs_mod_names.faa ]; then
2006- printf " Alignment with altered headers written to file :\n" | tee -a $gtotree_log
2073+ printf " Alignment with altered headers written to:\n" | tee -a $gtotree_log
20072074 printf " ${GREEN}${output_dir} /Aligned_SCGs_mod_names.faa${NC} \n\n" | tee -a $gtotree_log
2008- printf " Tree written to file:\n" | tee -a $gtotree_log
2009- printf " ${GREEN}${output_dir} /Aligned_SCGs_mod_names.tre${NC} \n\n" | tee -a $gtotree_log
2010- else
2011- printf " Tree written to file:\n" | tee -a $gtotree_log
2012- printf " ${GREEN}${output_dir} /Aligned_SCGs.tre${NC} \n\n" | tee -a $gtotree_log
20132075fi
20142076
2015- printf " Summary file with comp./redund. estimates written to file:\n" | tee -a $gtotree_log
2077+ printf " Partitions (for downstream use with mixed-model treeing) written to:\n" | tee -a $gtotree_log
2078+ printf " ${GREEN}${output_dir} /Partitions.txt${NC} \n\n" | tee -a $gtotree_log
2079+
2080+ printf " Summary file with comp./redund. estimates written to:\n" | tee -a $gtotree_log
20162081printf " ${GREEN}${output_dir} /All_genomes_summary_info.tsv${NC} \n\n" | tee -a $gtotree_log
20172082
2018- printf " Summary table with hits per target gene per genome written to file :\n" | tee -a $gtotree_log
2083+ printf " Summary table with hits per target gene per genome written to:\n" | tee -a $gtotree_log
20192084printf " ${GREEN}${output_dir} /All_genomes_SCG_hit_counts.tsv${NC} \n\n" | tee -a $gtotree_log
20202085
20212086# reporting any problem files/accessions
0 commit comments