Skip to content

Commit 8e6cd4a

Browse files
committed
generate and report partitions file now, update to v1.2.1
1 parent bc3cc9a commit 8e6cd4a

File tree

8 files changed

+381
-141
lines changed

8 files changed

+381
-141
lines changed

bin/GToTree

Lines changed: 107 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
GREEN='\033[0;32m'
55
RED='\033[0;31m'
66
NC='\033[0m'
7-
VERSION="v1.1.13"
7+
VERSION="v1.2.1"
88

99

1010
if [ "$1" == "--version" ] || [ "$1" == "-v" ]; then
@@ -63,6 +63,9 @@ if [ "$#" == 0 ] || [ $1 == "-h" ] || [ $1 == "help" ]; then
6363
printf " the labels if using TaxonKit (-t flag specified). E.g., all would be\n"
6464
printf " \"-L Domain,Phylum,Class,Order,Family,Genus,Species,Strain\"\n\n"
6565

66+
printf " - [-N] default: false\n"
67+
printf " No tree. Generate alignment only.\n\n"
68+
6669
printf " - [-T <str>] default: FastTree\n"
6770
printf " Which program to use for tree generation. Currently supported are\n"
6871
printf " \"FastTree\" and \"IQ-TREE\". As of now, these run with default settings\n"
@@ -149,9 +152,10 @@ taxonkit_id_swap='false'
149152
debug_flag='false'
150153
lineage_spec="Domain,Phylum,Class,Species,Strain"
151154
best_hit_mode='false'
155+
align_only='false'
152156
tree_program='FastTree'
153157

154-
while getopts :a:g:f:A:H:o:m:tL:T:c:G:Bdn:j: args
158+
while getopts :a:g:f:A:H:o:m:tL:NT:c:G:Bdn:j: args
155159
do
156160
case "${args}"
157161
in
@@ -164,6 +168,7 @@ do
164168
m) file_to_genome_id_map=${OPTARG};;
165169
t) taxonkit_id_swap='true';;
166170
L) lineage_spec=${OPTARG};;
171+
N) align_only='true' ;;
167172
T) tree_program=${OPTARG};;
168173
c) len_cutoff=${OPTARG};;
169174
G) gen_cutoff=${OPTARG};;
@@ -488,7 +493,7 @@ fi
488493
#############################################################################
489494
############## EXPLICITLY STATING IF DEFAULT BEHAVIOR CHANGED #############
490495
#############################################################################
491-
if [ $output_dir != "GToTree_output" ] || [ "$file_to_genome_id_map" != "" ] || [ $taxonkit_id_swap != "false" ] || [ $len_cutoff != "0.2" ] || [ $gen_cutoff != "0.75" ] || [ $debug_flag == "true" ] || [ $best_hit_mode == "true" ] || [ $num_jobs != "1" ] || [ $num_cpus != 2 ] || [ $lineage_spec != "Domain,Phylum,Class,Species,Strain" ] || [ $tree_program != "FastTree" ]; then
496+
if [ $output_dir != "GToTree_output" ] || [ "$file_to_genome_id_map" != "" ] || [ $taxonkit_id_swap != "false" ] || [ $len_cutoff != "0.2" ] || [ $gen_cutoff != "0.75" ] || [ $debug_flag == "true" ] || [ $best_hit_mode == "true" ] || [ $num_jobs != "1" ] || [ $num_cpus != 2 ] || [ $lineage_spec != "Domain,Phylum,Class,Species,Strain" ] || [ $tree_program != "FastTree" ] || [ $align_only == 'true' ]; then
492497

493498
if [ "$file_to_genome_id_map" != "" ]; then
494499
if [ ! -s $file_to_genome_id_map ]; then
@@ -521,8 +526,16 @@ if [ $output_dir != "GToTree_output" ] || [ "$file_to_genome_id_map" != "" ] ||
521526
printf " - Gene-length filtering cutoff threshold (\"-c\") has been set to $len_cutoff.\n" | tee -a $gtotree_log
522527
fi
523528

524-
if [ $tree_program != "FastTree" ]; then
525-
printf " - Tree generation program (\"-T\") has been set to $tree_program.\n" | tee -a $gtotree_log
529+
if [ $align_only == 'true' ]; then
530+
printf " - Only generating alignment, no tree, as \"-N\" option has been provided.\n" | tee -a $gtotree_log
531+
fi
532+
533+
if [ $align_only == 'false' ]; then
534+
535+
if [ $tree_program != "FastTree" ]; then
536+
printf " - Tree generation program (\"-T\") has been set to $tree_program.\n" | tee -a $gtotree_log
537+
fi
538+
526539
fi
527540

528541
if [ $gen_cutoff != "0.5" ]; then
@@ -1579,6 +1592,42 @@ grep ">" ${tmp_dir}/cat.tmp | cut -f1 > ${tmp_dir}/headers.tmp
15791592
grep -v ">" ${tmp_dir}/cat.tmp | sed -e $'s/\t/XXXXX/g' > ${tmp_dir}/seqs.tmp
15801593
paste -d "\n" ${tmp_dir}/headers.tmp ${tmp_dir}/seqs.tmp > ${output_dir}/Aligned_SCGs.faa
15811594

1595+
# creating a partitions file of gene coordinates for those who might want it for mixed models with something like iqtree
1596+
curr_start=1
1597+
curr_stop=0
1598+
n=1
1599+
num_to_do=$(ls ${tmp_dir}/*_all_aligned.faa | wc -l)
1600+
1601+
for gene_file in $(ls ${tmp_dir}/*_all_aligned.faa)
1602+
1603+
do
1604+
if [ $n == 1 ]; then
1605+
n=$(($n + 1))
1606+
gene_base=$(basename $gene_file)
1607+
gene=${gene_base%%_all_aligned.faa}
1608+
len_tmp=$(head -n 2 $gene_file | grep -v ">" | wc -c)
1609+
len_tmp=$(echo "$len_tmp - 1" | bc)
1610+
1611+
printf "AA, $gene = 1-${len_tmp}\n"
1612+
1613+
curr_start=$(echo "$len_tmp + 6" | bc)
1614+
1615+
else
1616+
n=$(($n + 1))
1617+
gene_base=$(basename $gene_file)
1618+
gene=${gene_base%%_all_aligned.faa}
1619+
len_tmp=$(head -n 2 $gene_file | grep -v ">" | wc -c)
1620+
len_tmp=$(echo "$len_tmp - 1" | bc)
1621+
1622+
curr_stop=$(echo "$curr_start + $len_tmp - 1" | bc)
1623+
1624+
printf "AA, $gene = ${curr_start}-${curr_stop}\n"
1625+
1626+
curr_start=$(echo "$curr_stop + 6" | bc)
1627+
fi
1628+
1629+
done > ${output_dir}/Partitions.txt
1630+
15821631
# storing genomes that made it through workflow to report at end
15831632
genomes_retained=$(wc -l ${tmp_dir}/final_genomes_from_all_sources.tmp | sed "s/^ *//" | cut -d " " -f 1)
15841633

@@ -1938,47 +1987,51 @@ fi
19381987
############################## MAKING TREE ###############################
19391988
#############################################################################
19401989

1941-
if [ $tree_program == 'FastTree' ]; then
1990+
if [ $align_only == 'false' ]; then
19421991

1943-
printf "\n ############################################################################## \n" | tee -a $gtotree_log
1944-
printf " #### Running FastTree ####\n" | tee -a $gtotree_log
1945-
printf " ############################################################################## \n\n" | tee -a $gtotree_log
1992+
if [ $tree_program == 'FastTree' ]; then
19461993

1947-
curr_time=$(date +"%I:%M %p")
1948-
duration=$SECONDS
1994+
printf "\n ############################################################################## \n" | tee -a $gtotree_log
1995+
printf " #### Running FastTree ####\n" | tee -a $gtotree_log
1996+
printf " ############################################################################## \n\n" | tee -a $gtotree_log
19491997

1950-
printf " It is currently $curr_time; the process started at $start_time.\n" | tee -a $gtotree_log
1951-
printf " Current process runtime: $(($duration / 60 / 60)) hours and $((($duration / 60) % 60)) minutes.\n\n" | tee -a $gtotree_log
1998+
curr_time=$(date +"%I:%M %p")
1999+
duration=$SECONDS
19522000

2001+
printf " It is currently $curr_time; the process started at $start_time.\n" | tee -a $gtotree_log
2002+
printf " Current process runtime: $(($duration / 60 / 60)) hours and $((($duration / 60) % 60)) minutes.\n\n" | tee -a $gtotree_log
2003+
2004+
2005+
if [ -s ${output_dir}/Aligned_SCGs_mod_names.faa ]; then
2006+
FastTree ${output_dir}/Aligned_SCGs_mod_names.faa > ${output_dir}/Aligned_SCGs_mod_names.tre | tee -a $gtotree_log
2007+
else
2008+
FastTree ${output_dir}/Aligned_SCGs.faa > ${output_dir}/Aligned_SCGs.tre | tee -a $gtotree_log
2009+
fi
19532010

1954-
if [ -s ${output_dir}/Aligned_SCGs_mod_names.faa ]; then
1955-
FastTree ${output_dir}/Aligned_SCGs_mod_names.faa > ${output_dir}/Aligned_SCGs_mod_names.tre | tee -a $gtotree_log
19562011
else
1957-
FastTree ${output_dir}/Aligned_SCGs.faa > ${output_dir}/Aligned_SCGs.tre | tee -a $gtotree_log
1958-
fi
19592012

1960-
else
2013+
printf "\n ############################################################################## \n" | tee -a $gtotree_log
2014+
printf " #### Running IQ-TREE ####\n" | tee -a $gtotree_log
2015+
printf " ############################################################################## \n\n" | tee -a $gtotree_log
19612016

1962-
printf "\n ############################################################################## \n" | tee -a $gtotree_log
1963-
printf " #### Running IQ-TREE ####\n" | tee -a $gtotree_log
1964-
printf " ############################################################################## \n\n" | tee -a $gtotree_log
2017+
curr_time=$(date +"%I:%M %p")
2018+
duration=$SECONDS
19652019

1966-
curr_time=$(date +"%I:%M %p")
1967-
duration=$SECONDS
2020+
printf " It is currently $curr_time; the process started at $start_time.\n" | tee -a $gtotree_log
2021+
printf " Current process runtime: $(($duration / 60 / 60)) hours and $((($duration / 60) % 60)) minutes.\n\n" | tee -a $gtotree_log
19682022

1969-
printf " It is currently $curr_time; the process started at $start_time.\n" | tee -a $gtotree_log
1970-
printf " Current process runtime: $(($duration / 60 / 60)) hours and $((($duration / 60) % 60)) minutes.\n\n" | tee -a $gtotree_log
19712023

2024+
if [ -s ${output_dir}/Aligned_SCGs_mod_names.faa ]; then
2025+
iqtree -s ${output_dir}/Aligned_SCGs_mod_names.faa -nt $num_jobs -mset WAG,LG -bb 1000 -pre iqtree_out
2026+
else
2027+
iqtree -s ${output_dir}/Aligned_SCGs.faa -nt $num_jobs -mset WAG,LG -bb 1000 -pre iqtree_out
2028+
fi
19722029

1973-
if [ -s ${output_dir}/Aligned_SCGs_mod_names.faa ]; then
1974-
iqtree -s ${output_dir}/Aligned_SCGs_mod_names.faa -nt $num_jobs -mset WAG,LG -bb 1000 -pre iqtree_out
1975-
else
1976-
iqtree -s ${output_dir}/Aligned_SCGs.faa -nt $num_jobs -mset WAG,LG -bb 1000 -pre iqtree_out
1977-
fi
2030+
mkdir ${output_dir}/iqtree_out/
2031+
mv iqtree_out* ${output_dir}/iqtree_out/
2032+
cp ${output_dir}/iqtree_out/iqtree_out.treefile ${output_dir}/Aligned_SCGs_mod_names.tre
19782033

1979-
mkdir ${output_dir}/iqtree_out/
1980-
mv iqtree_out* ${output_dir}/iqtree_out/
1981-
cp ${output_dir}/iqtree_out/iqtree_out.treefile ${output_dir}/Aligned_SCGs_mod_names.tre
2034+
fi
19822035

19832036
fi
19842037

@@ -2000,22 +2053,34 @@ fi
20002053

20012054
# reporting primary output files
20022055

2003-
printf " Full alignment written to file:\n" | tee -a $gtotree_log
2056+
if [ $align_only == 'false' ]; then
2057+
2058+
if [ -s ${output_dir}/Aligned_SCGs_mod_names.faa ]; then
2059+
2060+
printf " Tree written to:\n" | tee -a $gtotree_log
2061+
printf " ${GREEN}${output_dir}/Aligned_SCGs_mod_names.tre${NC}\n\n" | tee -a $gtotree_log
2062+
else
2063+
printf " Tree written to:\n" | tee -a $gtotree_log
2064+
printf " ${GREEN}${output_dir}/Aligned_SCGs.tre${NC}\n\n" | tee -a $gtotree_log
2065+
fi
2066+
2067+
fi
2068+
2069+
printf " Full alignment written to:\n" | tee -a $gtotree_log
20042070
printf " ${GREEN}${output_dir}/Aligned_SCGs.faa${NC}\n\n" | tee -a $gtotree_log
2071+
20052072
if [ -s ${output_dir}/Aligned_SCGs_mod_names.faa ]; then
2006-
printf " Alignment with altered headers written to file:\n" | tee -a $gtotree_log
2073+
printf " Alignment with altered headers written to:\n" | tee -a $gtotree_log
20072074
printf " ${GREEN}${output_dir}/Aligned_SCGs_mod_names.faa${NC}\n\n" | tee -a $gtotree_log
2008-
printf " Tree written to file:\n" | tee -a $gtotree_log
2009-
printf " ${GREEN}${output_dir}/Aligned_SCGs_mod_names.tre${NC}\n\n" | tee -a $gtotree_log
2010-
else
2011-
printf " Tree written to file:\n" | tee -a $gtotree_log
2012-
printf " ${GREEN}${output_dir}/Aligned_SCGs.tre${NC}\n\n" | tee -a $gtotree_log
20132075
fi
20142076

2015-
printf " Summary file with comp./redund. estimates written to file:\n" | tee -a $gtotree_log
2077+
printf " Partitions (for downstream use with mixed-model treeing) written to:\n" | tee -a $gtotree_log
2078+
printf " ${GREEN}${output_dir}/Partitions.txt${NC}\n\n" | tee -a $gtotree_log
2079+
2080+
printf " Summary file with comp./redund. estimates written to:\n" | tee -a $gtotree_log
20162081
printf " ${GREEN}${output_dir}/All_genomes_summary_info.tsv${NC}\n\n" | tee -a $gtotree_log
20172082

2018-
printf " Summary table with hits per target gene per genome written to file:\n" | tee -a $gtotree_log
2083+
printf " Summary table with hits per target gene per genome written to:\n" | tee -a $gtotree_log
20192084
printf " ${GREEN}${output_dir}/All_genomes_SCG_hit_counts.tsv${NC}\n\n" | tee -a $gtotree_log
20202085

20212086
# reporting any problem files/accessions

example_run/Alteromonas_example/Aligned_SCGs.faa

Lines changed: 33 additions & 33 deletions
Large diffs are not rendered by default.

example_run/Alteromonas_example/Aligned_SCGs_mod_names.faa

Lines changed: 33 additions & 33 deletions
Large diffs are not rendered by default.
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
((GCF_000020585.3_Alteromonas_mediterranea_DE:0.00202,(GCF_000439535.1_Alteromonas_mediterranea_U7:0.0,GCF_000439555.1_Alteromonas_mediterranea_U8:0.0):0.00153)0.900:0.00055,((GCF_001886395.1_Alteromonas_mediterranea_CP48:0.00138,(GCF_001886435.1_Alteromonas_mediterranea_RG65:0.00132,(GCF_001886415.1_Alteromonas_mediterranea_CP49:0.00055,GCF_001886455.1_Alteromonas_mediterranea_AR43:0.00055)1.000:0.00189)0.881:0.00050)1.000:0.00083,(((GCF_001886475.1_Alteromonas_sp_RW2A1_RW2A1:0.00124,GCF_002831605.1_Alteromonas_sp_MB-3u-76_MB-3u-76:0.00094)1.000:0.08194,((GCF_000730385.1_Alteromonas_australica_H_17:0.00122,GCF_000934525.1_Alteromonas_australica_DE170:0.00138)1.000:0.06762,(GCF_000011365.1_Alpha_Outgroup:1.15277,(GCF_000213655.1_Alteromonas_naphthalenivorans_SN2:0.00831,((GCF_001433715.1_Alteromonas_stellipolaris_LMG_21856:0.0,GCF_001562115.1_Alteromonas_stellipolaris_LMG_21861:0.0):0.00159,(GCF_001632825.1_Alteromonas_stellipolaris_PQQ-44:0.00080,(GCF_001632765.1_Alteromonas_stellipolaris_PQQ-42:0.00055,(GCF_001562125.1_Alteromonas_sp_Mac1_Mac1:0.00114,GCF_001562195.1_Alteromonas_addita_R10SW13:0.00109)0.872:0.00051)0.347:0.00055)0.999:0.00104)1.000:0.00810)1.000:0.05790)0.982:0.01987)1.000:0.03197)1.000:0.03452,(GCF_003443615.1_Alteromonas_sp_BL110_BL110:0.01952,(GCF_000299995.1_Alteromonas_macleodii_Black_Sea_11:0.01052,(((((Our_Alteromonas_MAG:0.00221,GCF_002849875.1_Alteromonas_macleodii_Te101:0.00182)0.643:0.00055,(GCF_000172635.2_Alteromonas_macleodii_ATCC_27126:0.00146,GCF_000299955.1_Alteromonas_macleodii_English_Channel_673:0.00215)0.882:0.00055)0.670:0.00055,GCF_001578515.1_Alteromonas_macleodii_HOT1A3:0.00198)0.999:0.00059,GCF_000300175.1_Alteromonas_macleodii_Balearic_Sea_AD45:0.00364)0.867:0.00052,GCF_001562235.1_Alteromonas_macleodii_D7:0.00364)1.000:0.00561)1.000:0.01088)1.000:0.01130)1.000:0.02062)1.000:0.00084,(GCF_000439495.1_Alteromonas_mediterranea_MED64:0.00126,((GCF_000310085.1_Alteromonas_mediterranea_AltDE1:0.0,GCF_000439575.1_Alteromonas_me
diterranea_UM7:0.0,GCF_001562315.1_Alteromonas_mediterranea_UM8:0.0):0.00055,GCF_000439595.1_Alteromonas_mediterranea_UM4b:0.00055)1.000:0.00166)0.059:0.00055);
1+
((GCF_000020585.3_Alteromonas_mediterranea_DE:0.00207,GCF_000439495.1_Alteromonas_mediterranea_MED64:0.00119)0.724:0.00055,((GCF_001886395.1_Alteromonas_mediterranea_CP48:0.00135,(GCF_001886435.1_Alteromonas_mediterranea_RG65:0.00126,(GCF_001886415.1_Alteromonas_mediterranea_CP49:0.00055,GCF_001886455.1_Alteromonas_mediterranea_AR43:0.00055)1.000:0.00185)0.916:0.00055)1.000:0.00081,(((GCF_001886475.1_Alteromonas_sp_RW2A1_RW2A1:0.00124,GCF_002831605.1_Alteromonas_sp_MB-3u-76_MB-3u-76:0.00094)1.000:0.08171,((GCF_000730385.1_Alteromonas_australica_H_17:0.00118,GCF_000934525.1_Alteromonas_australica_DE170:0.00140)1.000:0.06761,(GCF_000011365.1_Alpha_Outgroup:1.15183,(GCF_000213655.1_Alteromonas_naphthalenivorans_SN2:0.00828,((GCF_001433715.1_Alteromonas_stellipolaris_LMG_21856:0.0,GCF_001562115.1_Alteromonas_stellipolaris_LMG_21861:0.0):0.00150,(GCF_001632825.1_Alteromonas_stellipolaris_PQQ-44:0.00080,(GCF_001632765.1_Alteromonas_stellipolaris_PQQ-42:0.00055,(GCF_001562125.1_Alteromonas_sp_Mac1_Mac1:0.00114,GCF_001562195.1_Alteromonas_addita_R10SW13:0.00110)0.785:0.00050)0.317:0.00055)0.998:0.00104)1.000:0.00824)1.000:0.05757)0.986:0.02015)1.000:0.03187)1.000:0.03449,(GCF_003443615.1_Alteromonas_sp_BL110_BL110:0.01934,(GCF_000299995.1_Alteromonas_macleodii_Black_Sea_11:0.01022,(GCF_001562235.1_Alteromonas_macleodii_D7:0.00361,(GCF_000300175.1_Alteromonas_macleodii_Balearic_Sea_AD45:0.00363,(GCF_001578515.1_Alteromonas_macleodii_HOT1A3:0.00198,((GCF_000172635.2_Alteromonas_macleodii_ATCC_27126:0.00146,GCF_000299955.1_Alteromonas_macleodii_English_Channel_673:0.00215)0.875:0.00055,(GCF_002849875.1_Alteromonas_macleodii_Te101:0.00182,Our_Alteromonas_MAG:0.00214)0.601:0.00055)0.657:0.00055)1.000:0.00059)0.863:0.00050)1.000:0.00535)1.000:0.01076)1.000:0.01117)1.000:0.02013)1.000:0.00077,(((GCF_000310085.1_Alteromonas_mediterranea_AltDE1:0.0,GCF_000439575.1_Alteromonas_mediterranea_UM7:0.0,GCF_001562315.1_Alteromonas_mediterranea_UM8:0.0):0.00055,GCF_000439595.1_Alteromonas_m
editerranea_UM4b:0.00055)1.000:0.00152,(GCF_000439535.1_Alteromonas_mediterranea_U7:0.0,GCF_000439555.1_Alteromonas_mediterranea_U8:0.0):0.00167)0.924:0.00055);

0 commit comments

Comments
 (0)