Skip to content

Commit 2ad2d27

Browse files
Mike Lee
authored and committed
adding some logic to catch if muscle doesn't produce an alignment (#101)
1 parent: ca1d9fe — commit: 2ad2d27

File tree

2 files changed: 79 additions and 8 deletions

bin/GToTree

Lines changed: 70 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@ GREEN='\033[0;32m'
55
RED='\033[0;31m'
66
YELLOW='\033[0;33m'
77
NC='\033[0m'
8-
VERSION="v1.8.8"
8+
VERSION="v1.8.9"
99

1010
if [ "$1" == "--version" ] || [ "$1" == "-v" ]; then
1111
printf "GToTree ${VERSION}\n"
@@ -1992,7 +1992,7 @@ if [ -n "$amino_acid_files" ]; then
19921992
cat $amino_acid_files | parallel -j $num_jobs gtt-amino-acid-parallel.sh {} $tmp_dir $hmm_file $num_cpus $hmm_target_genes_total $output_dir $best_hit_mode $additional_pfam_targets ${ko_targets} ${target_KOs}
19931993

19941994
### kill backstop ###
1995-
# if there was a problem with the parallel fasta genome processing, killing main program here and reporting
1995+
# if there was a problem with the parallel amino acid genome processing, killing main program here and reporting
19961996
if [ -s ${tmp_dir}/kill_amino_acid_parallel.problem ]; then
19971997

19981998
problem_assembly=$(head -n 1 ${tmp_dir}/kill_amino_acid_parallel.problem)
@@ -2404,13 +2404,42 @@ if [ $num_jobs == "1" ]; then
24042404

24052405
# aligning
24062406
if [ $total_input_genomes -ge 1000 ] && [ $override_faster_alignment == 'false' ]; then
2407-
muscle -super5 ${tmp_dir}/${SCG}_hits_filtered${target_gene_suffix} -output ${tmp_dir}/aligned.tmp -threads ${num_muscle_threads}
2407+
muscle -super5 ${tmp_dir}/${SCG}_hits_filtered${target_gene_suffix} -output ${tmp_dir}/${SCG}-aligned.tmp -threads ${num_muscle_threads} | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
24082408
else
2409-
muscle -align ${tmp_dir}/${SCG}_hits_filtered${target_gene_suffix} -output ${tmp_dir}/aligned.tmp -threads ${num_muscle_threads}
2409+
muscle -align ${tmp_dir}/${SCG}_hits_filtered${target_gene_suffix} -output ${tmp_dir}/${SCG}-aligned.tmp -threads ${num_muscle_threads} | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
2410+
fi
2411+
2412+
# checking if alignment was successful (really this is a sloppy way of checking, but it's better than nothing and the muscle logs will be in the stdout and log)
2413+
if [ ! -s ${tmp_dir}/${SCG}-aligned.tmp ]; then
2414+
2415+
printf "\n\n ${RED}############################################################################## \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
2416+
printf " ${RED}############################################################################## \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
2417+
printf " ####${NC} GToTree is exiting without completing :( ${RED}####\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
2418+
printf " ##############################################################################${NC} \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
2419+
printf " ${RED}############################################################################## \n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
2420+
2421+
printf " ${RED}************************** ${NC}REASON FOR TERMINATION ${RED}**************************${NC} \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
2422+
printf " There was a problem with muscle generating an alignment, so GToTree is exiting. This\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
2423+
printf " is most often due to running out of memory, leading to a 'core dumped' message from muscle.\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
2424+
printf " You can check the muscle log output printed above, specifically look for ${SCG}.\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
2425+
printf " If you can't access more memory, it would help to reduce the number of included\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
2426+
printf " genomes if possible." | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
2427+
printf " ${RED}**************************************************************************** ${NC}\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
2428+
2429+
printf "\nExiting for now.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
2430+
2431+
# removing tmp directory unless debug set
2432+
if [ $debug_flag == 'false' ]; then
2433+
rm -rf $tmp_dir
2434+
fi
2435+
2436+
mv $gtotree_log ${output_dir}/gtotree-runlog.txt
2437+
exit
2438+
24102439
fi
24112440

24122441
# trimming
2413-
trimal -in ${tmp_dir}/aligned.tmp -out ${tmp_dir}/trimmed${target_gene_suffix}.tmp -automated1
2442+
trimal -in ${tmp_dir}/${SCG}-aligned.tmp -out ${tmp_dir}/trimmed${target_gene_suffix}.tmp -automated1
24142443

24152444
# removing linewraps:
24162445
sed 's/ .*$//' ${tmp_dir}/trimmed${target_gene_suffix}.tmp | awk '!/^>/ { printf "%s", $0; n="\n" } /^>/ { print n $0; n = "" } END { printf "%s", n }' > ${tmp_dir}/formatted${target_gene_suffix}.tmp
@@ -2465,6 +2494,42 @@ else
24652494

24662495
cat ${tmp_dir}/final_genes_list.tmp | parallel -j $num_jobs gtt-align-and-trim-parallel.sh {} $tmp_dir $faster_alignment $num_muscle_threads $target_gene_suffix
24672496

2497+
### kill backstop ###
2498+
# if there was a problem with the alignments, killing main program here and reporting
2499+
2500+
if [ -f ${tmp_dir}/kill_align_and_trim_parallel.problem ]; then
2501+
2502+
problem_alignment=$(head -n 1 ${tmp_dir}/kill_align_and_trim_parallel.problem)
2503+
2504+
printf "\n\n ${RED}############################################################################## \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
2505+
printf " ${RED}############################################################################## \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
2506+
printf " ####${NC} GToTree is exiting without completing :( ${RED}####\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
2507+
printf " ##############################################################################${NC} \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
2508+
printf " ${RED}############################################################################## \n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
2509+
2510+
printf " ${RED}************************** ${NC}REASON FOR TERMINATION ${RED}**************************${NC} \n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
2511+
printf " There was a problem with muscle generating an alignment, so GToTree is exiting. This\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
2512+
printf " is most often due to running out of memory, leading to a 'core dumped' message from muscle.\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
2513+
printf " You can check the muscle log output in one of the problem sets by looking at:\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
2514+
printf " ${problem_alignment}-muscle.log. If you can't access more memory, it would help to\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
2515+
printf " reduce the number of included genomes if possible." | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
2516+
printf " ${RED}**************************************************************************** ${NC}\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
2517+
2518+
printf "\nExiting for now.\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )
2519+
2520+
# copying muscle log file to primary working directory
2521+
cp ${tmp_dir}/${problem_alignment}-muscle.log .
2522+
2523+
# removing tmp directory unless debug set
2524+
if [ $debug_flag == 'false' ]; then
2525+
rm -rf $tmp_dir
2526+
fi
2527+
2528+
mv $gtotree_log ${output_dir}/gtotree-runlog.txt
2529+
exit
2530+
2531+
fi
2532+
24682533
fi
24692534

24702535
printf "\n\n\n________________________________________________________________________________\n\n\n" | tee >( sed 's/\x1b\[[0-9;]*m//g' >> ${gtotree_log} )

bin/gtt-align-and-trim-parallel.sh

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,9 +16,15 @@ gtt-parse-fasta-by-headers -i ${tmp_dir}/${1}_hits_filtered.tmp -w ${tmp_dir}/so
1616

1717
# aligning
1818
if [ $faster_alignment == 'true' ]; then
19-
muscle -super5 ${tmp_dir}/${1}_hits_filtered${target_gene_suffix} -output ${tmp_dir}/${1}_aligned.tmp -threads ${num_muscle_threads} &> /dev/null
19+
muscle -super5 ${tmp_dir}/${1}_hits_filtered${target_gene_suffix} -output ${tmp_dir}/${1}_aligned.tmp -threads ${num_muscle_threads} > ${tmp_dir}/${1}-muscle.log 2>&1
2020
else
21-
muscle -align ${tmp_dir}/${1}_hits_filtered${target_gene_suffix} -output ${tmp_dir}/${1}_aligned.tmp -threads ${num_muscle_threads} &> /dev/null
21+
muscle -align ${tmp_dir}/${1}_hits_filtered${target_gene_suffix} -output ${tmp_dir}/${1}_aligned.tmp -threads ${num_muscle_threads} > ${tmp_dir}/${1}-muscle.log 2>&1
22+
fi
23+
24+
# checking if alignment was successful (really this is a sloppy way of checking, but it's better than nothing and the muscle log file will be available)
25+
if [ ! -s ${tmp_dir}/${1}_aligned.tmp ]; then
26+
printf "${1}\n" >> ${tmp_dir}/kill_align_and_trim_parallel.problem
27+
exit
2228
fi
2329

2430
# trimming
@@ -42,7 +48,7 @@ if [ -s ${tmp_dir}/${1}_needed_gappers.tmp ]; then
4248

4349
# getting length of the alignment for the current gene:
4450
aln_length_tmp=$(sed -n '2p' ${tmp_dir}/${1}_formatted${target_gene_suffix}.tmp | wc -c | tr -s " " | cut -f2 -d " ")
45-
# subtracting 1 for newline characters
51+
# subtracting 1 for newline characters
4652
aln_length_tmp=$(echo "$aln_length_tmp"-1 | bc)
4753
# making a string of gaps the length of the alignment for those missing it:
4854
gap_seq=$(printf "%0.s-" $(seq 1 1 $aln_length_tmp))

0 commit comments

Comments
 (0)