Skip to content

Commit b4816f0

Browse files
committed
adding additional pfam search option to main program and creating iToL compatible files for additional targets; v1.4.1
1 parent 701aa9f commit b4816f0

12 files changed

+663
-100
lines changed

bin/GToTree

Lines changed: 268 additions & 74 deletions
Large diffs are not rendered by default.

bin/GToTree-gen-iToL-map

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
#!/usr/bin/env python
2+
3+
from __future__ import print_function
4+
5+
import sys
6+
import argparse
7+
8+
parser = argparse.ArgumentParser(description='This script is for creating a standard iToL color file give the genomes of interest and the "All_genomes_summary_info.tsv" output file from a normal GToTree run.')
9+
10+
required = parser.add_argument_group('required arguments')
11+
12+
required.add_argument("-s", "--all_genomes_summary", help="All_genomes_summary_info.tsv file from a typical GToTree run", action="store", dest="summary", required=True)
13+
14+
required.add_argument("-g", "--target_genomes", help="Single-column file with the genomes to color (should match their initial IDs when given to GToTree, e.g. input file with no extension, or NCBI accessions)", action="store", dest="target_genomes", required=True)
15+
parser.add_argument("-o", "--output_file", help='Output file for iToL (default: "iToL-colors.txt")', action="store", dest="output_file", default="iToL-colors.txt")
16+
17+
if len(sys.argv)==1:
18+
parser.print_help(sys.stderr)
19+
sys.exit(1)
20+
21+
args = parser.parse_args()
22+
23+
target_list = []
24+
25+
with open(args.target_genomes, "r") as target_genomes:
26+
for genome in target_genomes:
27+
target_list.append(genome.strip())
28+
29+
out_file = open(args.output_file, "w")
30+
31+
out_file.write("DATASET_STYLE\nSEPARATOR TAB\nDATASET_LABEL\tGToTree\nCOLOR\t#0000ff\nDATA\n")
32+
33+
with open(args.summary) as summary:
34+
for line in summary:
35+
line = line.split("\t")
36+
37+
if line[0] in target_list:
38+
out_file.write(str(line[1]) + "\tbranch\tnode\t#0000ff\t3\tnormal")

bin/gtt-amino-acid-parallel.sh

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ num_cpus=$4
1111
hmm_target_genes_total=$5
1212
output_dir=$6
1313
best_hit_mode=$7
14+
additional_pfam_targets=$8
1415

1516

1617
### kill backstop
@@ -71,7 +72,7 @@ awk -F "\t" ' $2 == 1 ' ${tmp_dir}/${assembly}_conservative_filtering_counts_tab
7172
uniq_SCG_hits=$(wc -l ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp | sed 's/^ *//' | cut -f 1 -d " ")
7273

7374
## adding SCG-hit counts to table
74-
paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t")") >> ${output_dir}/All_genomes_SCG_hit_counts.tsv
75+
paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t")") >> ${output_dir}/SCG_hit_counts.tsv
7576

7677
num_SCG_hits=$(awk ' $1 > 0 ' ${tmp_dir}/${assembly}_uniq_counts.tmp | wc -l | tr -s " " | cut -f2 -d " ")
7778

@@ -99,7 +100,7 @@ if [ ${mult_perc_redund_rnd} -ge 1000 ]; then
99100
printf " going over 10%% is getting into the questionable range. You may want to\n"
100101
printf " consider taking a closer look and/or removing it from the input genomes.\n\n"
101102

102-
printf " Reported in \"${output_dir}/Genomes_with_questionable_redund_estimates.tsv\".\n"
103+
printf " Reported in \"${output_dir}/run_files/Genomes_with_questionable_redund_estimates.tsv\".\n"
103104
printf " ${RED}****************************************************************************${NC} \n\n"
104105

105106
# writing to table of genomes with questionable redundancy estimates
@@ -137,4 +138,43 @@ else
137138

138139
fi
139140

141+
## searching for additional targets if provided
142+
if [ $additional_pfam_targets == "true" ]; then
143+
144+
### counting how many genes in this genome
145+
gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.tmp)
146+
147+
hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp ${tmp_dir}/all_targets.hmm ${tmp_dir}/${assembly}_genes.tmp > /dev/null
148+
149+
### getting counts of each target in this genome
150+
for target in $(cat ${tmp_dir}/actual_pfam_targets.tmp)
151+
do
152+
grep -w ${target} ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | wc -l | sed 's/^ *//' >> ${tmp_dir}/${assembly}_hit_counts.tmp
153+
done
154+
155+
### writing results to main output file
156+
paste <( printf "${assembly}\tNA\t${gene_count}" ) <(printf %s "$(cat ${tmp_dir}/${assembly}_hit_counts.tmp | tr "\n" "\t") " ) >> ${output_dir}/additional_pfam_search_results/Additional_Pfam_hit_counts.tsv
157+
158+
### Pulling out hits to additional pfam targets for this genome ###
159+
for target in $(cat ${tmp_dir}/actual_pfam_targets.tmp)
160+
do
161+
if grep -w -q "$target" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp; then
162+
163+
grep -w "$target" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | cut -f 1 -d " " >> ${tmp_dir}/${assembly}_${target}_genes_of_int.tmp
164+
165+
for gene in $(cat ${tmp_dir}/${assembly}_${target}_genes_of_int.tmp)
166+
do
167+
echo $gene | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp -
168+
done >> ${tmp_dir}/${assembly}_${target}_genes1.tmp
169+
170+
gtt-append-fasta-headers -i ${tmp_dir}/${assembly}_${target}_genes1.tmp -w ${assembly}_${target} -o ${tmp_dir}/${assembly}_${target}_genes.tmp
171+
172+
# adding to fasta of that target holding all genomes
173+
cat ${tmp_dir}/${assembly}_${target}_genes.tmp >> ${output_dir}/additional_pfam_search_results/${target}_hits.faa
174+
fi
175+
176+
done
177+
fi
178+
179+
140180
rm -rf ${tmp_dir}/${assembly}_*.tmp ${tmp_dir}/${assembly}_genes.tmp.ssi

bin/gtt-amino-acid-serial.sh

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ num_cpus=$5
1212
hmm_target_genes_total=$6
1313
output_dir=$7
1414
best_hit_mode=$8
15+
additional_pfam_targets=$9
1516

1617
# looping through the lines of the provided [-f] file (this loop operates on one genome at a time)
1718
while IFS=$'\t' read -r -a file
@@ -77,7 +78,7 @@ do
7778
uniq_SCG_hits=$(wc -l ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp | sed 's/^ *//' | cut -f 1 -d " ")
7879

7980
## adding SCG-hit counts to table
80-
paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t")") >> ${output_dir}/All_genomes_SCG_hit_counts.tsv
81+
paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t")") >> ${output_dir}/SCG_hit_counts.tsv
8182

8283
num_SCG_hits=$(awk ' $1 > 0 ' ${tmp_dir}/${assembly}_uniq_counts.tmp | wc -l | tr -s " " | cut -f2 -d " ")
8384
num_SCG_redund=$(awk '{ if ($1 == 0) { print $1 } else { print $1 - 1 } }' ${tmp_dir}/${assembly}_uniq_counts.tmp | awk '{ sum += $1 } END { print sum }')
@@ -104,7 +105,7 @@ do
104105
printf " going over 10%% is getting into the questionable range. You may want to\n"
105106
printf " consider taking a closer look and/or removing it from the input genomes.\n\n"
106107

107-
printf " Reported in \"${output_dir}/Genomes_with_questionable_redund_estimates.tsv\".\n"
108+
printf " Reported in \"${output_dir}/run_files/Genomes_with_questionable_redund_estimates.tsv\".\n"
108109
printf " ${RED}****************************************************************************${NC} \n\n"
109110

110111
# writing to table of genomes with questionable redundancy estimates
@@ -144,6 +145,44 @@ do
144145

145146
fi
146147

148+
## searching for additional targets if provided
149+
if [ $additional_pfam_targets == "true" ]; then
150+
151+
### counting how many genes in this genome
152+
gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.tmp)
153+
154+
hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp ${tmp_dir}/all_targets.hmm ${tmp_dir}/${assembly}_genes.tmp > /dev/null
155+
156+
### getting counts of each target in this genome
157+
for target in $(cat ${tmp_dir}/actual_pfam_targets.tmp)
158+
do
159+
grep -w ${target} ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | wc -l | sed 's/^ *//' >> ${tmp_dir}/${assembly}_hit_counts.tmp
160+
done
161+
162+
### writing results to main output file
163+
paste <( printf "${assembly}\tNA\t${gene_count}" ) <(printf %s "$(cat ${tmp_dir}/${assembly}_hit_counts.tmp | tr "\n" "\t") " ) >> ${output_dir}/additional_pfam_search_results/Additional_Pfam_hit_counts.tsv
164+
165+
### Pulling out hits to additional pfam targets for this genome ###
166+
for target in $(cat ${tmp_dir}/actual_pfam_targets.tmp)
167+
do
168+
if grep -w -q "$target" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp; then
169+
170+
grep -w "$target" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | cut -f 1 -d " " >> ${tmp_dir}/${assembly}_${target}_genes_of_int.tmp
171+
172+
for gene in $(cat ${tmp_dir}/${assembly}_${target}_genes_of_int.tmp)
173+
do
174+
echo $gene | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp -
175+
done >> ${tmp_dir}/${assembly}_${target}_genes1.tmp
176+
177+
gtt-append-fasta-headers -i ${tmp_dir}/${assembly}_${target}_genes1.tmp -w ${assembly}_${target} -o ${tmp_dir}/${assembly}_${target}_genes.tmp
178+
179+
# adding to fasta of that target holding all genomes
180+
cat ${tmp_dir}/${assembly}_${target}_genes.tmp >> ${output_dir}/additional_pfam_search_results/${target}_hits.faa
181+
fi
182+
183+
done
184+
fi
185+
147186
rm -rf ${tmp_dir}/${assembly}_*.tmp ${tmp_dir}/${assembly}_genes.tmp.ssi
148187

149188
done < $1

bin/gtt-fasta-parallel.sh

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ num_cpus=$4
1111
hmm_target_genes_total=$5
1212
output_dir=$6
1313
best_hit_mode=$7
14+
additional_pfam_targets=$8
1415

1516

1617
### kill backstop
@@ -77,7 +78,7 @@ awk -F "\t" ' $2 == 1 ' ${tmp_dir}/${assembly}_conservative_filtering_counts_tab
7778
uniq_SCG_hits=$(wc -l ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp | sed 's/^ *//' | cut -f 1 -d " ")
7879

7980
## adding SCG-hit counts to table
80-
paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t")") >> ${output_dir}/All_genomes_SCG_hit_counts.tsv
81+
paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t")") >> ${output_dir}/SCG_hit_counts.tsv
8182

8283
num_SCG_hits=$(awk ' $1 > 0 ' ${tmp_dir}/${assembly}_uniq_counts.tmp | wc -l | tr -s " " | cut -f2 -d " ")
8384

@@ -105,7 +106,7 @@ if [ ${mult_perc_redund_rnd} -ge 1000 ]; then
105106
printf " going over 10%% is getting into the questionable range. You may want to\n"
106107
printf " consider taking a closer look and/or removing it from the input genomes.\n\n"
107108

108-
printf " Reported in \"${output_dir}/Genomes_with_questionable_redund_estimates.tsv\".\n"
109+
printf " Reported in \"${output_dir}/run_files/Genomes_with_questionable_redund_estimates.tsv\".\n"
109110
printf " ${RED}****************************************************************************${NC} \n\n"
110111

111112
# writing to table of genomes with questionable redundancy estimates
@@ -143,4 +144,42 @@ else
143144

144145
fi
145146

147+
## searching for additional targets if provided
148+
if [ $additional_pfam_targets == "true" ]; then
149+
150+
### counting how many genes in this genome
151+
gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.tmp)
152+
153+
hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp ${tmp_dir}/all_targets.hmm ${tmp_dir}/${assembly}_genes.tmp > /dev/null
154+
155+
### getting counts of each target in this genome
156+
for target in $(cat ${tmp_dir}/actual_pfam_targets.tmp)
157+
do
158+
grep -w ${target} ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | wc -l | sed 's/^ *//' >> ${tmp_dir}/${assembly}_hit_counts.tmp
159+
done
160+
161+
### writing results to main output file
162+
paste <( printf "${assembly}\tNA\t${gene_count}" ) <(printf %s "$(cat ${tmp_dir}/${assembly}_hit_counts.tmp | tr "\n" "\t") " ) >> ${output_dir}/additional_pfam_search_results/Additional_Pfam_hit_counts.tsv
163+
164+
### Pulling out hits to additional pfam targets for this genome ###
165+
for target in $(cat ${tmp_dir}/actual_pfam_targets.tmp)
166+
do
167+
if grep -w -q "$target" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp; then
168+
169+
grep -w "$target" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | cut -f 1 -d " " >> ${tmp_dir}/${assembly}_${target}_genes_of_int.tmp
170+
171+
for gene in $(cat ${tmp_dir}/${assembly}_${target}_genes_of_int.tmp)
172+
do
173+
echo $gene | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp -
174+
done >> ${tmp_dir}/${assembly}_${target}_genes1.tmp
175+
176+
gtt-append-fasta-headers -i ${tmp_dir}/${assembly}_${target}_genes1.tmp -w ${assembly}_${target} -o ${tmp_dir}/${assembly}_${target}_genes.tmp
177+
178+
# adding to fasta of that target holding all genomes
179+
cat ${tmp_dir}/${assembly}_${target}_genes.tmp >> ${output_dir}/additional_pfam_search_results/${target}_hits.faa
180+
fi
181+
182+
done
183+
fi
184+
146185
rm -rf ${tmp_dir}/${assembly}_*.tmp ${tmp_dir}/${assembly}_genes.tmp.ssi

bin/gtt-fasta-serial.sh

Lines changed: 42 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ num_cpus=$5
1212
hmm_target_genes_total=$6
1313
output_dir=$7
1414
best_hit_mode=$8
15+
additional_pfam_targets=$9
1516

1617
# looping through the lines of the provided [-f] file (this loop operates on one genome at a time)
1718
while IFS=$'\t' read -r -a file
@@ -86,7 +87,7 @@ do
8687
uniq_SCG_hits=$(wc -l ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp | sed 's/^ *//' | cut -f 1 -d " ")
8788

8889
## adding SCG-hit counts to table
89-
paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t")") >> ${output_dir}/All_genomes_SCG_hit_counts.tsv
90+
paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t")") >> ${output_dir}/SCG_hit_counts.tsv
9091

9192
num_SCG_hits=$(awk ' $1 > 0 ' ${tmp_dir}/${assembly}_uniq_counts.tmp | wc -l | tr -s " " | cut -f2 -d " ")
9293
num_SCG_redund=$(awk '{ if ($1 == 0) { print $1 } else { print $1 - 1 } }' ${tmp_dir}/${assembly}_uniq_counts.tmp | awk '{ sum += $1 } END { print sum }')
@@ -113,7 +114,7 @@ do
113114
printf " going over 10%% is getting into the questionable range. You may want to\n"
114115
printf " consider taking a closer look and/or removing it from the input genomes.\n\n"
115116

116-
printf " Reported in \"${output_dir}/Genomes_with_questionable_redund_estimates.tsv\".\n"
117+
printf " Reported in \"${output_dir}/run_files/Genomes_with_questionable_redund_estimates.tsv\".\n"
117118
printf " ${RED}****************************************************************************${NC} \n\n"
118119

119120
# writing to table of genomes with questionable redundancy estimates
@@ -153,6 +154,45 @@ do
153154

154155
fi
155156

157+
158+
## searching for additional targets if provided
159+
if [ $additional_pfam_targets == "true" ]; then
160+
161+
### counting how many genes in this genome
162+
gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.tmp)
163+
164+
hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp ${tmp_dir}/all_targets.hmm ${tmp_dir}/${assembly}_genes.tmp > /dev/null
165+
166+
### getting counts of each target in this genome
167+
for target in $(cat ${tmp_dir}/actual_pfam_targets.tmp)
168+
do
169+
grep -w ${target} ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | wc -l | sed 's/^ *//' >> ${tmp_dir}/${assembly}_hit_counts.tmp
170+
done
171+
172+
### writing results to main output file
173+
paste <( printf "${assembly}\tNA\t${gene_count}" ) <(printf %s "$(cat ${tmp_dir}/${assembly}_hit_counts.tmp | tr "\n" "\t") " ) >> ${output_dir}/additional_pfam_search_results/Additional_Pfam_hit_counts.tsv
174+
175+
### Pulling out hits to additional pfam targets for this genome ###
176+
for target in $(cat ${tmp_dir}/actual_pfam_targets.tmp)
177+
do
178+
if grep -w -q "$target" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp; then
179+
180+
grep -w "$target" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | cut -f 1 -d " " >> ${tmp_dir}/${assembly}_${target}_genes_of_int.tmp
181+
182+
for gene in $(cat ${tmp_dir}/${assembly}_${target}_genes_of_int.tmp)
183+
do
184+
echo $gene | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp -
185+
done >> ${tmp_dir}/${assembly}_${target}_genes1.tmp
186+
187+
gtt-append-fasta-headers -i ${tmp_dir}/${assembly}_${target}_genes1.tmp -w ${assembly}_${target} -o ${tmp_dir}/${assembly}_${target}_genes.tmp
188+
189+
# adding to fasta of that target holding all genomes
190+
cat ${tmp_dir}/${assembly}_${target}_genes.tmp >> ${output_dir}/additional_pfam_search_results/${target}_hits.faa
191+
fi
192+
193+
done
194+
fi
195+
156196
rm -rf ${tmp_dir}/${assembly}_*.tmp ${tmp_dir}/${assembly}_genes.tmp.ssi
157197

158198
done < $1

bin/gtt-genbank-parallel.sh

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ num_cpus=$4
1111
hmm_target_genes_total=$5
1212
output_dir=$6
1313
best_hit_mode=$7
14+
additional_pfam_targets=$8
1415

1516

1617
### kill backstop
@@ -69,7 +70,7 @@ if [ ! -s ${tmp_dir}/${assembly}_genes2.tmp ]; then
6970
printf " This genbank file doesn't appear to have CDS annotations,\n"
7071
printf " so we are identifying coding sequences with prodigal.\n\n"
7172

72-
printf " Reported in \"${output_dir}/Genbank_files_with_no_CDSs.txt\".\n"
73+
printf " Reported in \"${output_dir}/run_files/Genbank_files_with_no_CDSs.txt\".\n"
7374
printf " ${RED}****************************************************************************${NC} \n\n"
7475

7576
echo "$1" >> ${output_dir}/Genbank_files_with_no_CDSs.txt
@@ -118,7 +119,7 @@ awk -F "\t" ' $2 == 1 ' ${tmp_dir}/${assembly}_conservative_filtering_counts_tab
118119
uniq_SCG_hits=$(wc -l ${tmp_dir}/${assembly}_conservative_target_unique_hmm_names.tmp | sed 's/^ *//' | cut -f 1 -d " ")
119120

120121
## adding SCG-hit counts to table
121-
paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t")") >> ${output_dir}/All_genomes_SCG_hit_counts.tsv
122+
paste <(printf $assembly) <(printf %s "$(cat ${tmp_dir}/${assembly}_uniq_counts.tmp | tr "\n" "\t")") >> ${output_dir}/SCG_hit_counts.tsv
122123

123124
num_SCG_hits=$(awk ' $1 > 0 ' ${tmp_dir}/${assembly}_uniq_counts.tmp | wc -l | tr -s " " | cut -f2 -d " ")
124125
num_SCG_redund=$(awk '{ if ($1 == 0) { print $1 } else { print $1 - 1 } }' ${tmp_dir}/${assembly}_uniq_counts.tmp | awk '{ sum += $1 } END { print sum }')
@@ -145,7 +146,7 @@ if [ ${mult_perc_redund_rnd} -ge 1000 ]; then
145146
printf " going over 10%% is getting into the questionable range. You may want to\n"
146147
printf " consider taking a closer look and/or removing it from the input genomes.\n\n"
147148

148-
printf " Reported in \"${output_dir}/Genomes_with_questionable_redund_estimates.tsv\".\n"
149+
printf " Reported in \"${output_dir}/run_files/Genomes_with_questionable_redund_estimates.tsv\".\n"
149150
printf " ${RED}****************************************************************************${NC} \n\n"
150151

151152
# writing to table of genomes with questionable redundancy estimates
@@ -180,4 +181,42 @@ else
180181

181182
fi
182183

184+
## searching for additional targets if provided
185+
if [ $additional_pfam_targets == "true" ]; then
186+
187+
### counting how many genes in this genome
188+
gene_count=$(grep -c ">" ${tmp_dir}/${assembly}_genes.tmp)
189+
190+
hmmsearch --cut_ga --cpu $num_cpus --tblout ${tmp_dir}/${assembly}_curr_hmm_hits.tmp ${tmp_dir}/all_targets.hmm ${tmp_dir}/${assembly}_genes.tmp > /dev/null
191+
192+
### getting counts of each target in this genome
193+
for target in $(cat ${tmp_dir}/actual_pfam_targets.tmp)
194+
do
195+
grep -w ${target} ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | wc -l | sed 's/^ *//' >> ${tmp_dir}/${assembly}_hit_counts.tmp
196+
done
197+
198+
### writing results to main output file
199+
paste <( printf "${assembly}\tNA\t${gene_count}" ) <(printf %s "$(cat ${tmp_dir}/${assembly}_hit_counts.tmp | tr "\n" "\t") " ) >> ${output_dir}/additional_pfam_search_results/Additional_Pfam_hit_counts.tsv
200+
201+
### Pulling out hits to additional pfam targets for this genome ###
202+
for target in $(cat ${tmp_dir}/actual_pfam_targets.tmp)
203+
do
204+
if grep -w -q "$target" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp; then
205+
206+
grep -w "$target" ${tmp_dir}/${assembly}_curr_hmm_hits.tmp | cut -f 1 -d " " >> ${tmp_dir}/${assembly}_${target}_genes_of_int.tmp
207+
208+
for gene in $(cat ${tmp_dir}/${assembly}_${target}_genes_of_int.tmp)
209+
do
210+
echo $gene | esl-sfetch -f ${tmp_dir}/${assembly}_genes.tmp -
211+
done >> ${tmp_dir}/${assembly}_${target}_genes1.tmp
212+
213+
gtt-append-fasta-headers -i ${tmp_dir}/${assembly}_${target}_genes1.tmp -w ${assembly}_${target} -o ${tmp_dir}/${assembly}_${target}_genes.tmp
214+
215+
# adding to fasta of that target holding all genomes
216+
cat ${tmp_dir}/${assembly}_${target}_genes.tmp >> ${output_dir}/additional_pfam_search_results/${target}_hits.faa
217+
fi
218+
219+
done
220+
fi
221+
183222
rm -rf ${tmp_dir}/${assembly}_*.tmp ${tmp_dir}/${assembly}_genes.tmp.ssi

0 commit comments

Comments
 (0)