Skip to content

Commit 701aa9f

Browse files
committed
added GToTree-pfam-search; added citation report file based on what was used
1 parent 8e6cd4a commit 701aa9f

11 files changed

+1213
-11
lines changed

bin/GToTree

Lines changed: 99 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
GREEN='\033[0;32m'
55
RED='\033[0;31m'
66
NC='\033[0m'
7-
VERSION="v1.2.1"
7+
VERSION="v1.3.1"
88

99

1010
if [ "$1" == "--version" ] || [ "$1" == "-v" ]; then
@@ -69,7 +69,7 @@ if [ "$#" == 0 ] || [ $1 == "-h" ] || [ $1 == "help" ]; then
6969
printf " - [-T <str>] default: FastTree\n"
7070
printf " Which program to use for tree generation. Currently supported are\n"
7171
printf " \"FastTree\" and \"IQ-TREE\". As of now, these run with default settings\n"
72-
printf " only (and QT-TREE includes \"-mset WAG,LG\". To run either with more\n"
72+
# Help text for -T: notes that IQ-TREE is run with "-mset WAG,LG"; the
# parenthetical is closed before the next sentence starts.
printf " only (and IQ-TREE includes \"-mset WAG,LG\"). To run either with more\n"
7373
printf " specific options (and there is a lot of room for variation here), you\n"
7474
printf " can use the output alignment file from GToTree as input.\n\n"
7575

@@ -142,6 +142,19 @@ if ! command -v FastTree > /dev/null; then
142142
exit
143143
fi
144144

145+
#############################################################################
146+
########## SETTING VARIABLES TO REPORT WHAT SHOULD BE CITED AT END #########
147+
#############################################################################
148+
parallel_used="false"
149+
prodigal_used="false"
150+
hmmer_used="false"
151+
muscle_used="false"
152+
trimal_used="false"
153+
taxonkit_used="false"
154+
fasttree_used="false"
155+
iqtree_used="false"
156+
universal_SCGs_used="false"
157+
145158

146159
#############################################################################
147160
############################ PARSING ARGUMENTS ############################
@@ -190,6 +203,26 @@ if [ $taxonkit_id_swap != "false" ]; then
190203
fi
191204
fi
192205

206+
# checking that iqtree is available if it was specified, and that the tree program was properly specified
207+
if [ $tree_program != "FastTree" ]; then
208+
if [ $tree_program == "IQ-TREE" ]; then
209+
if ! command -v iqtree > /dev/null; then
210+
printf "\n ${RED}You specified to use IQ-TREE, but 'iqtree' not found in your PATH :(${NC}\n"
211+
printf "\nExiting for now.\n\n"
212+
exit
213+
fi
214+
else
215+
printf "\n ${RED}You specified to use $tree_program, but that is not one of the options :(${NC}\n"
216+
printf "\n Currently available options are 'FastTree' (the default), or 'IQ-TREE'.\n\n"
217+
218+
printf " You can also run GToTree in alignment-only mode by adding the \"-N\" flag,\n"
219+
printf " and then take your concatenated alignment to another tree program :)\n"
220+
printf "\nExiting for now.\n\n"
221+
exit
222+
fi
223+
fi
224+
225+
193226
# checking no duplicates in NCBI accession file
194227
if [ -f "$NCBI_acc_file" ]; then
195228
num_dupes=$(uniq -d "$NCBI_acc_file" | wc -l | sed "s/^ *//" | cut -d " " -f 1)
@@ -263,6 +296,7 @@ else
263296
printf "\nExiting for now.\n\n"
264297
exit
265298
fi
299+
266300
fi
267301

268302

@@ -468,6 +502,11 @@ printf "\n ${GREEN}Total input genomes: $total_input
468502
#### checking and reporting specified hmm source ####
469503
printf "\n HMM source to be used:\n" | tee -a $gtotree_log
470504

505+
if [ $hmm_file == "Universal" ]; then
506+
hmm_file="Universal_Hug_et_al"
507+
universal_SCGs_used="true"
508+
fi
509+
471510
if [ -f "$hmm_file" ]; then
472511
grep "^NAME" $hmm_file | tr -s " " | cut -f2 -d " " > uniq_hmm_names.tmp
473512
hmm_target_genes_total=$(wc -l uniq_hmm_names.tmp | sed "s/^ *//" | cut -d " " -f 1)
@@ -489,7 +528,6 @@ else
489528
fi
490529

491530

492-
493531
#############################################################################
494532
############## EXPLICITLY STATING IF DEFAULT BEHAVIOR CHANGED #############
495533
#############################################################################
@@ -515,6 +553,7 @@ if [ $output_dir != "GToTree_output" ] || [ "$file_to_genome_id_map" != "" ] ||
515553

516554
if [ $taxonkit_id_swap != "false" ]; then
517555
printf " - Taxonkit will be used to add lineage info to labels.\n" | tee -a $gtotree_log
556+
taxonkit_used="true"
518557
fi
519558

520559
if [ $lineage_spec != "Domain,Phylum,Class,Species,Strain" ]; then
@@ -548,6 +587,7 @@ if [ $output_dir != "GToTree_output" ] || [ "$file_to_genome_id_map" != "" ] ||
548587

549588
if [ $num_jobs != "1" ]; then
550589
printf " - Number of jobs to run during parallelizable steps has been set to $num_jobs.\n" | tee -a $gtotree_log
590+
parallel_used="true"
551591
fi
552592

553593
if [ $best_hit_mode == "true" ]; then
@@ -581,14 +621,17 @@ if [ $total_input_genomes -le 20 ]; then
581621

582622
printf " ${RED}****************************************************************************${NC} \n\n" | tee -a $gtotree_log
583623

584-
## if i want this as manual input can use the following...
585-
## for now just warning and moving on so this stays fully automated once started
624+
### if i want this as manual input can use the following... ###
625+
586626
# read -n 1 -s -p " Press any key to continue with \"-c\" set to $len_cutoff for this run, or
587627
# press \"CTRL+C\" to cancel and exit.
588628
# "
589629

590630
# printf "\n\n\t Moving forward with \"-c\" set to $len_cutoff this run.\n\n"
591631

632+
633+
### for now just warning and moving on so this stays fully automated once started ###
634+
592635
sleep 3
593636
fi
594637

@@ -788,6 +831,11 @@ if [ -n "$NCBI_acc_file" ]; then
788831
cat ${tmp_dir}/ncbi_accessions_info.tmp | parallel -j $num_jobs gtt-ncbi-parallel.sh {} $tmp_dir $hmm_file $num_cpus $hmm_target_genes_total $output_dir $best_hit_mode
789832
fi
790833

834+
## checking if prodigal was used to add to citations list being reported at end
835+
if [ -s ${tmp_dir}/prodigal_used ]; then
836+
prodigal_used="true"
837+
fi
838+
791839
printf "________________________________________________________________________________\n\n" | tee -a $gtotree_log
792840

793841

@@ -953,6 +1001,11 @@ if [ -n "$genbank_list_file" ]; then
9531001
# adding retained genomes to genomes from all sources file
9541002
cat ${tmp_dir}/final_included_genbank_genomes.tmp >> ${tmp_dir}/genomes_from_all_sources.tmp
9551003

1004+
## checking if prodigal was used to add to citations list being reported at end
1005+
if [ -s ${tmp_dir}/prodigal_used ]; then
1006+
prodigal_used="true"
1007+
fi
1008+
9561009
printf "________________________________________________________________________________\n\n" | tee -a $gtotree_log
9571010

9581011
fi
@@ -1061,12 +1114,13 @@ if [ -n "$fasta_files" ]; then
10611114

10621115
fi
10631116

1064-
1065-
1066-
10671117
# adding retained genomes to genomes from all sources file
10681118
cat ${tmp_dir}/fasta_genomes_list.tmp >> ${tmp_dir}/genomes_from_all_sources.tmp
10691119

1120+
## prodigal must have been used with input fastas, setting to add to citations list being reported at end
1121+
prodigal_used="true"
1122+
1123+
10701124
printf "_______________________________________________________________________________\n\n" | tee -a $gtotree_log
10711125

10721126
fi
@@ -2008,6 +2062,8 @@ if [ $align_only == 'false' ]; then
20082062
FastTree ${output_dir}/Aligned_SCGs.faa > ${output_dir}/Aligned_SCGs.tre | tee -a $gtotree_log
20092063
fi
20102064

2065+
fasttree_used="true" # setting to report citations at end
2066+
20112067
else
20122068

20132069
printf "\n ############################################################################## \n" | tee -a $gtotree_log
@@ -2031,6 +2087,8 @@ if [ $align_only == 'false' ]; then
20312087
mv iqtree_out* ${output_dir}/iqtree_out/
20322088
cp ${output_dir}/iqtree_out/iqtree_out.treefile ${output_dir}/Aligned_SCGs_mod_names.tre
20332089

2090+
iqtree_used="true" # setting to report citations at end
2091+
20342092
fi
20352093

20362094
fi
@@ -2142,6 +2200,39 @@ printf "________________________________________________________________________
21422200
printf " Log file written to:\n" | tee -a $gtotree_log
21432201
printf " ${GREEN}${output_dir}/gtotree-runlog.txt${NC}\n\n" | tee -a $gtotree_log
21442202

2203+
### checking programs used and reporting a citations file ###
2204+
printf " Programs used and their citations have been written to:\n" | tee -a $gtotree_log
2205+
printf " ${GREEN}${output_dir}/citations.txt${NC}\n\n" | tee -a $gtotree_log
2206+
2207+
printf "GToTree\nLee MD. GToTree: a user-friendly workflow for phylogenomics. Bioinformatics. 2019; (March):1-3. doi:10.1093/bioinformatics/btz188\n\n" >> ${output_dir}/citations.txt
2208+
# HMMER3 citation, appended to the citations report; volume 7, issue 10 is written as 7(10).
printf "HMMER3\nEddy SR. Accelerated profile HMM searches. PLoS Comput. Biol. 2011; 7(10). doi:10.1371/journal.pcbi.1002195\n\n" >> "${output_dir}/citations.txt"
2209+
# MUSCLE citation, appended to the citations report. Title, journal, volume and DOI
# all refer to the same paper (BMC Bioinformatics 5:113).
printf "Muscle\nEdgar RC. MUSCLE: a multiple sequence alignment method with reduced time and space complexity. BMC Bioinformatics. 2004; 5, 113. doi:10.1186/1471-2105-5-113\n\n" >> "${output_dir}/citations.txt"
2210+
# trimAl citation, appended to the citations report; first author is Capella-Gutierrez
# and the title is given as published.
printf "TrimAl\nCapella-Gutierrez S. et al. trimAl: a tool for automated alignment trimming in large-scale phylogenetic analyses. Bioinformatics. 2009; 25, 1972–1973. doi:10.1093/bioinformatics/btp348\n\n" >> "${output_dir}/citations.txt"
2211+
2212+
if [ $prodigal_used == "true" ]; then
2213+
# Prodigal citation, appended to the citations report. Title, journal, volume and DOI
# all refer to the 2010 Prodigal paper (BMC Bioinformatics 11:119).
printf "Prodigal\nHyatt D. et al. Prodigal: prokaryotic gene recognition and translation initiation site identification. BMC Bioinformatics. 2010; 11, 119. doi:10.1186/1471-2105-11-119\n\n" >> "${output_dir}/citations.txt"
2214+
fi
2215+
2216+
if [ $taxonkit_used == "true" ]; then
2217+
printf "TaxonKit\nShen W. and Xiong J. TaxonKit: a cross-platform and efficient NCBI taxonomy toolkit. bioRxiv. 2019. doi:10.1101/513523\n\n" >> ${output_dir}/citations.txt
2218+
fi
2219+
2220+
if [ $fasttree_used == "true" ]; then
2221+
printf "FastTree 2\nPrice MN. et al. FastTree 2 - approximately maximum-likelihood trees for large alignments. PLoS One. 2010; 5. doi:10.1371/journal.pone.0009490\n\n" >> ${output_dir}/citations.txt
2222+
fi
2223+
2224+
if [ $iqtree_used == "true" ]; then
2225+
printf "IQ-TREE\nNguyen L.-T. et al. IQ-TREE: a fast and effective stochastic algorithm for estimating maximum likelihood phylogenies. Mol. Biol. Evol. 2015; 32, 268–274. doi:10.1093/molbev/msu300\n\n" >> ${output_dir}/citations.txt
2226+
fi
2227+
2228+
if [ $parallel_used == "true" ]; then
2229+
printf "GNU Parallel\nTange O. GNU Parallel 2018. doi:10.5281/zenodo.1146014\n\n" >> ${output_dir}/citations.txt
2230+
fi
2231+
2232+
if [ $universal_SCGs_used == "true" ]; then
2233+
printf "Universal SCG-set\nHug LA. et al. A new view of the tree of life. Nat. Microbiol. 2016; 1, 1–6. doi:10.1038/NMICROBIOL.2016.48\n\n" >> ${output_dir}/citations.txt
2234+
fi
2235+
21452236
duration=$SECONDS
21462237

21472238
printf " Total process runtime: $(($duration / 60 / 60)) hours and $((($duration / 60) % 60)) minutes.\n" | tee -a $gtotree_log

0 commit comments

Comments
 (0)