Skip to content

Commit b6fb21d

Browse files
author
AstrobioMike
committed
- enabled gzipped fasta files or genbank files to be provided as inputs
- added specific versions to the conda-installed packages to the script
1 parent 1fd94a8 commit b6fb21d

13 files changed

+286
-103726
lines changed

bin/GToTree

Lines changed: 133 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
GREEN='\033[0;32m'
55
RED='\033[0;31m'
66
NC='\033[0m'
7-
VERSION="v1.1.8"
7+
VERSION="v1.1.9"
88

99

1010
if [ "$1" == "--version" ] || [ "$1" == "-v" ]; then
@@ -807,8 +807,71 @@ if [ -n "$genbank_list_file" ]; then
807807
### running in parallel if set, otherwise running in serial ###
808808
if [ $num_jobs == "1" ]; then
809809
gtt-genbank-serial.sh $genbank_list_file $tmp_dir $hmm_file $genbank_genomes_total $num_cpus $hmm_target_genes_total $output_dir $best_hit_mode
810+
811+
### kill backstop ###
812+
# if there was a problem with the serial genbank genome processing, killing main program here and reporting
813+
if [ -s ${tmp_dir}/kill_genbank_serial.prodigal ]; then
814+
815+
problem_assembly=$(head -n 1 ${tmp_dir}/kill_genbank_serial.prodigal)
816+
817+
printf "\n\n ${RED}############################################################################## \n" | tee -a $gtotree_log
818+
printf " ${RED}############################################################################## \n" | tee -a $gtotree_log
819+
printf " ####${NC} GToTree is exiting without completing :( ${RED}####\n" | tee -a $gtotree_log
820+
printf " ##############################################################################${NC} \n" | tee -a $gtotree_log
821+
printf " ${RED}############################################################################## \n\n" | tee -a $gtotree_log
822+
823+
printf " ${RED}************************** ${NC}REASON FOR TERMINATION ${RED}**************************${NC} \n" | tee -a $gtotree_log
824+
printf " Something went wrong with prodigal trying to call genes on the fasta\n" | tee -a $gtotree_log
825+
printf " file generated from assembly ${GREEN}${problem_assembly}${NC}.\n" | tee -a $gtotree_log
826+
printf " GToTree is not sure it should move forward when something odd is going\n" | tee -a $gtotree_log
827+
printf " on like this :(\n" | tee -a $gtotree_log
828+
printf " ${RED}**************************************************************************** ${NC}\n\n" | tee -a $gtotree_log
829+
830+
printf "\nExiting for now.\n\n" | tee -a $gtotree_log
831+
832+
# removing tmp directory unless debug set
833+
if [ $debug_flag == 'false' ]; then
834+
rm -rf $tmp_dir
835+
fi
836+
837+
mv $gtotree_log ${output_dir}/gtotree-runlog.txt
838+
exit
839+
fi
840+
810841
else
811842
cat $genbank_list_file | parallel -j $num_jobs gtt-genbank-parallel.sh {} $tmp_dir $hmm_file $num_cpus $hmm_target_genes_total $output_dir $best_hit_mode
843+
844+
845+
### kill backstop ###
846+
# if there was a problem with the parallel genbank genome processing, killing main program here and reporting
847+
if [ -s ${tmp_dir}/kill_genbank_parallel.prodigal ]; then
848+
849+
problem_assembly=$(head -n 1 ${tmp_dir}/kill_genbank_parallel.prodigal)
850+
851+
printf "\n\n ${RED}############################################################################## \n" | tee -a $gtotree_log
852+
printf " ${RED}############################################################################## \n" | tee -a $gtotree_log
853+
printf " ####${NC} GToTree is exiting without completing :( ${RED}####\n" | tee -a $gtotree_log
854+
printf " ##############################################################################${NC} \n" | tee -a $gtotree_log
855+
printf " ${RED}############################################################################## \n\n" | tee -a $gtotree_log
856+
857+
printf " ${RED}************************** ${NC}REASON FOR TERMINATION ${RED}**************************${NC} \n" | tee -a $gtotree_log
858+
printf " Something went wrong with prodigal trying to call genes on the fasta\n" | tee -a $gtotree_log
859+
printf " file generated from assembly ${GREEN}${problem_assembly}${NC}.\n" | tee -a $gtotree_log
860+
printf " GToTree is not sure it should move forward when something odd is going\n" | tee -a $gtotree_log
861+
printf " on like this :(\n" | tee -a $gtotree_log
862+
printf " ${RED}**************************************************************************** ${NC}\n\n" | tee -a $gtotree_log
863+
864+
printf "\nExiting for now.\n\n" | tee -a $gtotree_log
865+
866+
# removing tmp directory unless debug set
867+
if [ $debug_flag == 'false' ]; then
868+
rm -rf $tmp_dir
869+
fi
870+
871+
mv $gtotree_log ${output_dir}/gtotree-runlog.txt
872+
exit
873+
fi
874+
812875
fi
813876

814877
mv ${tmp_dir}/genbank_genomes_list.tmp ${tmp_dir}/final_included_genbank_genomes.tmp
@@ -821,8 +884,6 @@ if [ -n "$genbank_list_file" ]; then
821884
fi
822885

823886

824-
825-
826887
#############################################################################
827888
##################### FASTA-DERIVED GENOME PROCESSING #####################
828889
#############################################################################
@@ -848,10 +909,73 @@ if [ -n "$fasta_files" ]; then
848909
### running in parallel if set, otherwise running in serial ###
849910
if [ $num_jobs == "1" ]; then
850911
gtt-fasta-serial.sh $fasta_files $tmp_dir $hmm_file $fasta_genomes_total $num_cpus $hmm_target_genes_total $output_dir $best_hit_mode
912+
913+
### kill backstop ###
914+
# if there was a problem with the serial fasta genome processing, killing main program here and reporting
915+
if [ -s ${tmp_dir}/kill_fasta_serial.prodigal ]; then
916+
917+
problem_assembly=$(head -n 1 ${tmp_dir}/kill_fasta_serial.prodigal)
918+
919+
printf "\n\n ${RED}############################################################################## \n" | tee -a $gtotree_log
920+
printf " ${RED}############################################################################## \n" | tee -a $gtotree_log
921+
printf " ####${NC} GToTree is exiting without completing :( ${RED}####\n" | tee -a $gtotree_log
922+
printf " ##############################################################################${NC} \n" | tee -a $gtotree_log
923+
printf " ${RED}############################################################################## \n\n" | tee -a $gtotree_log
924+
925+
printf " ${RED}************************** ${NC}REASON FOR TERMINATION ${RED}**************************${NC} \n" | tee -a $gtotree_log
926+
printf " Something went wrong with prodigal trying to call genes on assembly\n" | tee -a $gtotree_log
927+
printf " ${GREEN}${problem_assembly}${NC}. GToTree is not sure it should\n" | tee -a $gtotree_log
928+
printf " move forward when something odd is going on like this :(\n" | tee -a $gtotree_log
929+
printf " ${RED}**************************************************************************** ${NC}\n\n" | tee -a $gtotree_log
930+
931+
printf "\nExiting for now.\n\n" | tee -a $gtotree_log
932+
933+
# removing tmp directory unless debug set
934+
if [ $debug_flag == 'false' ]; then
935+
rm -rf $tmp_dir
936+
fi
937+
938+
mv $gtotree_log ${output_dir}/gtotree-runlog.txt
939+
exit
940+
fi
941+
851942
else
852943
cat $fasta_files | parallel -j $num_jobs gtt-fasta-parallel.sh {} $tmp_dir $hmm_file $num_cpus $hmm_target_genes_total $output_dir $best_hit_mode
944+
945+
### kill backstop ###
946+
# if there was a problem with the parallel fasta genome processing, killing main program here and reporting
947+
if [ -s ${tmp_dir}/kill_fasta_parallel.prodigal ]; then
948+
949+
problem_assembly=$(head -n 1 ${tmp_dir}/kill_fasta_parallel.prodigal)
950+
951+
printf "\n\n ${RED}############################################################################## \n" | tee -a $gtotree_log
952+
printf " ${RED}############################################################################## \n" | tee -a $gtotree_log
953+
printf " ####${NC} GToTree is exiting without completing :( ${RED}####\n" | tee -a $gtotree_log
954+
printf " ##############################################################################${NC} \n" | tee -a $gtotree_log
955+
printf " ${RED}############################################################################## \n\n" | tee -a $gtotree_log
956+
957+
printf " ${RED}************************** ${NC}REASON FOR TERMINATION ${RED}**************************${NC} \n" | tee -a $gtotree_log
958+
printf " Something went wrong with prodigal trying to call genes on assembly\n" | tee -a $gtotree_log
959+
printf " ${GREEN}${problem_assembly}${NC}. GToTree is not sure it should\n" | tee -a $gtotree_log
960+
printf " move forward when something odd is going on like this :(\n" | tee -a $gtotree_log
961+
printf " ${RED}**************************************************************************** ${NC}\n\n" | tee -a $gtotree_log
962+
963+
printf "\nExiting for now.\n\n" | tee -a $gtotree_log
964+
965+
# removing tmp directory unless debug set
966+
if [ $debug_flag == 'false' ]; then
967+
rm -rf $tmp_dir
968+
fi
969+
970+
mv $gtotree_log ${output_dir}/gtotree-runlog.txt
971+
exit
972+
fi
973+
853974
fi
854975

976+
977+
978+
855979
# adding retained genomes to genomes from all sources file
856980
cat ${tmp_dir}/fasta_genomes_list.tmp >> ${tmp_dir}/genomes_from_all_sources.tmp
857981

@@ -1658,6 +1782,7 @@ else
16581782
fi
16591783

16601784
# reporting primary output files
1785+
16611786
printf " Full alignment written to file:\n" | tee -a $gtotree_log
16621787
printf " ${GREEN}${output_dir}/Aligned_SCGs.faa${NC}\n\n" | tee -a $gtotree_log
16631788
if [ -n "$file_to_genome_id_map" ] || [ $taxonkit_id_swap != "false" ]; then
@@ -1734,5 +1859,9 @@ duration=$SECONDS
17341859

17351860
printf " Total process runtime: $(($duration / 60 / 60)) hours and $((($duration / 60) % 60)) minutes.\n\n" | tee -a $gtotree_log
17361861

1862+
# reporting log file output
1863+
printf " Log file written to:\n" | tee -a $gtotree_log
1864+
printf " ${GREEN}${output_dir}/gtotree-runlog.txt${NC}\n\n" | tee -a $gtotree_log
1865+
17371866
mv $gtotree_log ${output_dir}/gtotree-runlog.txt
1738-
rm -rf *.tmp
1867+
rm -rf *.tmp

bin/gtt-fasta-parallel.sh

Lines changed: 36 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,25 @@ hmm_target_genes_total=$5
1212
output_dir=$6
1313
best_hit_mode=$7
1414

15-
# setting assembly name as filename with no extension
16-
assembly="$(basename ${1%.*})"
15+
16+
### kill backstop
17+
# if there is a problem, all child processes launched (by this script) will exit immediately,
18+
# upon returning to main script, will check and terminate parent process
19+
if [ -s ${tmp_dir}/kill_fasta_parallel.prodigal ]; then
20+
exit
21+
fi
22+
23+
## checking if gzipped, gunzipping if so, and setting assembly name and file location variable either way
24+
if $(file $1 | grep -q "gzip"); then
25+
was_gzipped=TRUE # setting variable to be able to check and remove gunzipped file afterwards
26+
file_location=${1%.*}
27+
gunzip -c $1 > $file_location
28+
assembly="$(basename ${file_location%.*})"
29+
else
30+
file_location=$1
31+
assembly="$(basename ${1%.*})"
32+
was_gzipped=FALSE
33+
fi
1734

1835
printf " -------------------------------------------------------------------------- \n\n"
1936
printf " Genome: ${GREEN}$assembly${NC}\n"
@@ -23,10 +40,25 @@ echo $assembly >> ${tmp_dir}/fasta_genomes_list.tmp
2340

2441
num=$((num+1)) # to track progress
2542

26-
## running prodigal to get coding sequences
27-
prodigal -c -q -i $1 -a ${tmp_dir}/${assembly}_genes1.tmp > /dev/null
43+
44+
prodigal -c -q -i $file_location -a ${tmp_dir}/${assembly}_genes1.tmp > /dev/null 2> ${file_location}_prodigal.stderr
45+
46+
if [ -s ${file_location}_prodigal.stderr ]; then
47+
printf "$assembly" >> ${tmp_dir}/kill_fasta_parallel.prodigal
48+
rm -rf ${file_location}_prodigal.stderr
49+
exit
50+
else
51+
rm -rf ${file_location}_prodigal.stderr
52+
fi
53+
2854
tr -d '*' < ${tmp_dir}/${assembly}_genes1.tmp > ${tmp_dir}/${assembly}_genes2.tmp
2955

56+
57+
## removing gunzipped genome file if it was gunzipped
58+
if [ $was_gzipped == "TRUE" ]; then
59+
rm -rf $file_location
60+
fi
61+
3062
## renaming seqs to have assembly name
3163
gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes2.tmp -w $assembly -o ${tmp_dir}/${assembly}_genes.tmp
3264

bin/gtt-fasta-serial.sh

Lines changed: 34 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,24 @@ best_hit_mode=$8
1717
while IFS=$'\t' read -r -a file
1818
do
1919

20-
# setting assembly name as filename with no extension
21-
assembly="$(basename ${file%.*})"
20+
### kill backstop
21+
# if there is a problem on any iteration, exiting this subprocess and then exiting main script with report of problem assembly
22+
if [ -s ${tmp_dir}_kill_fasta_serial.prodigal ]; then
23+
exit
24+
fi
25+
26+
27+
## checking if gzipped, gunzipping if so, and setting assembly name and file location variable either way
28+
if $(file $file | grep -q "gzip"); then
29+
was_gzipped=TRUE # setting variable to be able to check and remove gunzipped file afterwards
30+
file_location=${file%.*}
31+
gunzip -c $file > $file_location
32+
assembly="$(basename ${file_location%.*})"
33+
else
34+
file_location=$file
35+
assembly="$(basename ${file%.*})"
36+
was_gzipped=FALSE
37+
fi
2238

2339
# adding assembly to ongoing genomes list
2440
echo $assembly >> ${tmp_dir}/fasta_genomes_list.tmp
@@ -32,9 +48,24 @@ do
3248
printf " Getting coding seqs...\n\n"
3349

3450
## running prodigal to get coding sequences
35-
prodigal -c -q -i $file -a ${tmp_dir}/${assembly}_genes1.tmp > /dev/null
51+
prodigal -c -q -i $file_location -a ${tmp_dir}/${assembly}_genes1.tmp > /dev/null 2> ${file_location}_prodigal.stderr
52+
53+
if [ -s ${file_location}_prodigal.stderr ]; then
54+
printf "$assembly" >> ${tmp_dir}/kill_fasta_serial.prodigal
55+
rm -rf ${file_location}_prodigal.stderr
56+
57+
exit
58+
else
59+
rm -rf ${file_location}_prodigal.stderr
60+
fi
61+
3662
tr -d '*' < ${tmp_dir}/${assembly}_genes1.tmp > ${tmp_dir}/${assembly}_genes2.tmp
3763

64+
## removing gunzipped genome file if it was gunzipped
65+
if [ $was_gzipped == "TRUE" ]; then
66+
rm -rf $file_location
67+
fi
68+
3869
## renaming seqs to have assembly name
3970
gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes2.tmp -w $assembly -o ${tmp_dir}/${assembly}_genes.tmp
4071

bin/gtt-genbank-parallel.sh

Lines changed: 43 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,38 +12,55 @@ hmm_target_genes_total=$5
1212
output_dir=$6
1313
best_hit_mode=$7
1414

15-
assembly="$(basename ${1%.*})"
15+
16+
### kill backstop
17+
# if there is a problem, all child processes launched (by this script) will exit immediately,
18+
# upon returning to main script, will check and terminate parent process
19+
if [ -s ${tmp_dir}/kill_genbank_parallel.prodigal ]; then
20+
exit
21+
fi
22+
23+
## checking if gzipped, gunzipping if so, and setting assembly name and file location variable either way
24+
if $(file $1 | grep -q "gzip"); then
25+
was_gzipped=TRUE # setting variable to be able to check and remove gunzipped file afterwards
26+
file_location=${1%.*}
27+
gunzip -c $1 > $file_location
28+
assembly="$(basename ${file_location%.*})"
29+
else
30+
file_location=$1
31+
assembly="$(basename ${1%.*})"
32+
was_gzipped=FALSE
33+
fi
34+
1635

1736
printf " -------------------------------------------------------------------------- \n\n"
1837
printf " Genome: ${GREEN}$assembly${NC}\n"
1938

20-
rm -rf ${output_dir}/Genbank_files_with_no_CDSs.txt # deleting if file exists
21-
2239
# adding assembly to ongoing genomes list
2340
echo $assembly >> ${tmp_dir}/genbank_genomes_list.tmp
2441

2542
# storing more info about the assembly if it's present in the genbank file:
2643
# checking for organism:
27-
if grep -q "ORGANISM" $1; then
28-
org_name=$(grep -m1 "ORGANISM" $1 | tr -s " " | cut -f3- -d " " | tr "[ ./\\]" "_" | tr -s "_")
44+
if grep -q "ORGANISM" $file_location; then
45+
org_name=$(grep -m1 "ORGANISM" $file_location | tr -s " " | cut -f3- -d " " | tr "[ ./\\]" "_" | tr -s "_")
2946
else
3047
org_name="NA"
3148
fi
3249

33-
if grep -q "strain=" $1; then
34-
strain=$(grep -m1 "strain=" $1 | tr -s " " | cut -f 2 -d '"')
50+
if grep -q "strain=" $file_location; then
51+
strain=$(grep -m1 "strain=" $file_location | tr -s " " | cut -f 2 -d '"')
3552
else
3653
strain="NA"
3754
fi
3855

39-
if grep -q "taxon" $1; then
40-
taxid=$(grep -m1 "taxon" $1 | cut -f2 -d ":" | tr -d '"')
56+
if grep -q "taxon" $file_location; then
57+
taxid=$(grep -m1 "taxon" $file_location | cut -f2 -d ":" | tr -d '"')
4158
else
4259
taxid="NA"
4360
fi
4461

4562
# extracting AA coding sequences from genbank file
46-
gtt-genbank-to-AA-seqs -i $1 -o ${tmp_dir}/${assembly}_genes2.tmp 2> /dev/null
63+
gtt-genbank-to-AA-seqs -i $file_location -o ${tmp_dir}/${assembly}_genes2.tmp 2> /dev/null
4764

4865
# checking that the file had CDS annotations
4966
if [ ! -s ${tmp_dir}/${assembly}_genes2.tmp ]; then
@@ -59,14 +76,28 @@ if [ ! -s ${tmp_dir}/${assembly}_genes2.tmp ]; then
5976
rm -rf ${tmp_dir}/${assembly}_genes2.tmp
6077

6178
# pulling out full nucleotide fasta from genbank file
62-
gtt-genbank-to-fasta -i $1 -o ${tmp_dir}/${assembly}_fasta.tmp 2> /dev/null
79+
gtt-genbank-to-fasta -i $file_location -o ${tmp_dir}/${assembly}_fasta.tmp 2> /dev/null
6380

6481
# running prodigal
65-
prodigal -c -q -i ${tmp_dir}/${assembly}_fasta.tmp -a ${tmp_dir}/${assembly}_genes1.tmp > /dev/null
82+
prodigal -c -q -i ${tmp_dir}/${assembly}_fasta.tmp -a ${tmp_dir}/${assembly}_genes1.tmp > /dev/null 2> ${file_location}_prodigal.stderr
83+
84+
if [ -s ${file_location}_prodigal.stderr ]; then
85+
printf "$assembly" >> ${tmp_dir}/kill_genbank_parallel.prodigal
86+
rm -rf ${file_location}_prodigal.stderr
87+
exit
88+
else
89+
rm -rf ${file_location}_prodigal.stderr
90+
fi
91+
6692
tr -d '*' < ${tmp_dir}/${assembly}_genes1.tmp > ${tmp_dir}/${assembly}_genes2.tmp
6793

6894
fi
6995

96+
## removing gunzipped genome file if it was gunzipped
97+
if [ $was_gzipped == "TRUE" ]; then
98+
rm -rf $file_location
99+
fi
100+
70101
## renaming seqs to have assembly name
71102
gtt-rename-fasta-headers -i ${tmp_dir}/${assembly}_genes2.tmp -w $assembly -o ${tmp_dir}/${assembly}_genes.tmp
72103

0 commit comments

Comments
 (0)