1+ #! /bin/sh -e
# Report a fatal error and abort the workflow.
# Arguments: $1 - human-readable description of what went wrong
# Outputs:   "Error: <message>" on stderr (diagnostics do not belong on stdout)
# Exits:     always, with status 1
fail() {
    # printf instead of echo: the message is printed verbatim even if it
    # starts with '-' or contains backslashes.
    printf 'Error: %s\n' "$1" >&2
    exit 1
}
6+
# Succeed (status 0) when "$1" is NOT an existing regular file.
# The workflow uses this to skip steps whose output is already on disk.
# Arguments: $1 - path to check (passed through unmodified; padding the
#                 expansion with spaces would test a different file name)
# Returns:   0 if the file is missing, 1 if it exists
notExists() {
    [ ! -f "$1" ]
}
10+
11+
# First round: build the input DB, cluster proteins, then cluster proteomes.
# Each stage is skipped when its output file already exists.
# Reads: MMSEQS, TMP_PATH, RESULTS, CLUSTER_MODULE, WRITE_ALIGN_PROTEOME and
# the *_PAR option strings (deliberately unquoted so they split into words).

# Build the sequence DB from the positional arguments.
if notExists "${TMP_PATH}/input.dbtype"; then
    # shellcheck disable=SC2086
    "$MMSEQS" createdb "$@" "${TMP_PATH}/input" ${CREATEDB_PAR} \
        || fail "query createdb died"
fi

# Protein-level clustering with the configured cluster module.
if notExists "${TMP_PATH}/clu.dbtype"; then
    # shellcheck disable=SC2086
    "$MMSEQS" "${CLUSTER_MODULE}" "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/clu_tmp" ${CLUSTER_PAR} \
        || fail "linclust died"
fi

# Export the protein clustering as TSV.
if notExists "${RESULTS}_protein_cluster.tsv"; then
    # shellcheck disable=SC2086
    "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/clu" "${RESULTS}_protein_cluster.tsv" ${THREADS_PAR} \
        || fail "createtsv protein cluster died"
fi

# Proteome-level clustering; writes aln_proteome, cluster_count and aln_protein.
if notExists "${TMP_PATH}/aln_proteome.dbtype"; then
    # shellcheck disable=SC2086
    "$MMSEQS" proteomecluster "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/aln_proteome" "${TMP_PATH}/cluster_count" "${TMP_PATH}/aln_protein" ${PROTEOMECLUSTER_PAR} \
        || fail "proteomecluster died"
fi

# Export the cluster-count report as TSV.
if notExists "${RESULTS}_cluster_count.tsv"; then
    # shellcheck disable=SC2086
    "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/cluster_count" "${RESULTS}_cluster_count.tsv" ${THREADS_PAR} \
        || fail "createtsv proteome cluster count report died"
fi

# Export the protein alignments only when requested and not yet written;
# otherwise the alignment DB is not needed any more and is deleted.
if notExists "${RESULTS}_protein_align.tsv" && [ -n "${WRITE_ALIGN_PROTEOME}" ]; then
    # shellcheck disable=SC2086
    "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/aln_protein" "${RESULTS}_protein_align.tsv" ${THREADS_PAR} \
        || fail "createtsv protein align died"
else
    # The glob must stay attached to the quoted prefix: a detached `*` would
    # expand to every entry of the current directory. `:?` aborts instead of
    # globbing from "/" should TMP_PATH ever be empty.
    rm -rf -- "${TMP_PATH:?}/aln_protein"*
fi
# Cascade: repeat protein + proteome clustering on the sources that are still
# unassigned, until ${TMP_PATH}/source_filtered is empty.

# Rewrite ${TMP_PATH}/source_filtered with the lines of input.source whose
# first field does not occur as a first field in aln_proteome. A leading NUL
# byte on the aln_proteome field is stripped before the comparison.
updateSourceFiltered() {
    awk 'NR==FNR { sub(/^\x00/, "", $1); a[$1]; next } !($1 in a)' \
        "${TMP_PATH}/aln_proteome" "${TMP_PATH}/input.source" \
        > "${TMP_PATH}/source_filtered" \
        || fail "filtering unclustered sources died"
}

updateSourceFiltered
SOURCEtoNEXTITERATION="${TMP_PATH}/source_filtered"
STEP=2

# Lookup file the next sub-lookup is filtered from; starts with the full
# input lookup and is narrowed to the previous round's sub-lookup afterwards.
SUBDB_LOOKUP_LIST="${TMP_PATH}/input.lookup"

while [ -s "$SOURCEtoNEXTITERATION" ]; do
    # Arithmetic expansion strips the padding some wc implementations emit.
    REMAINING=$(( $(wc -l < "$SOURCEtoNEXTITERATION") ))
    echo "Step $STEP: $REMAINING sources left"

    # Keep the lookup lines whose 3rd field is one of the remaining sources.
    awk 'NR==FNR { sources[$1]; next } $3 in sources' \
        "$SOURCEtoNEXTITERATION" "${SUBDB_LOOKUP_LIST}" \
        > "${TMP_PATH}/sublookup_${STEP}" \
        || fail "sublookup creation died"

    # Create a smaller DB from the sub-lookup.
    "$MMSEQS" createsubdb "${TMP_PATH}/sublookup_${STEP}" "${TMP_PATH}/input" "${TMP_PATH}/input_${STEP}" --subdb-mode 1 \
        || fail "createsubdb died"
    NEXTINPUT="${TMP_PATH}/input_${STEP}"

    echo "Run linclust for iter $STEP"
    if notExists "${TMP_PATH}/clu_${STEP}.dbtype"; then
        # shellcheck disable=SC2086
        "$MMSEQS" "${CLUSTER_MODULE}" "${NEXTINPUT}" "${TMP_PATH}/clu_${STEP}" "${TMP_PATH}/clu_tmp_${STEP}" ${CLUSTER_PAR} \
            || fail "linclust died"
    fi

    echo "Run createtsv: protein clust result for iter $STEP"
    if notExists "${RESULTS}_protein_cluster_${STEP}.tsv"; then
        # shellcheck disable=SC2086
        "$MMSEQS" createtsv "${NEXTINPUT}" "${NEXTINPUT}" "${TMP_PATH}/clu_${STEP}" "${RESULTS}_protein_cluster_${STEP}.tsv" ${THREADS_PAR} \
            || fail "createtsv protein cluster died"
    fi

    if [ -n "$REMOVE_TMP" ]; then
        rm -rf -- "${TMP_PATH:?}/clu_tmp_${STEP}"
    fi

    echo "Run ProteomeCluster for iter $STEP"
    if notExists "${TMP_PATH}/aln_proteome_${STEP}.dbtype"; then
        # shellcheck disable=SC2086
        "$MMSEQS" proteomecluster "${NEXTINPUT}" "${TMP_PATH}/clu_${STEP}" "${TMP_PATH}/aln_proteome_${STEP}" "${TMP_PATH}/cluster_count_${STEP}" "${TMP_PATH}/aln_protein_${STEP}" ${PROTEOMECLUSTER_PAR} \
            || fail "proteomecluster died"
    fi

    echo "Run createtsv: clustercount report for iter $STEP"
    if notExists "${RESULTS}_cluster_count_${STEP}.tsv"; then
        # shellcheck disable=SC2086
        "$MMSEQS" createtsv "${NEXTINPUT}" "${TMP_PATH}/cluster_count_${STEP}" "${RESULTS}_cluster_count_${STEP}.tsv" ${THREADS_PAR} \
            || fail "createtsv proteome cluster count report died"
    fi

    if notExists "${RESULTS}_protein_align_${STEP}.tsv" && [ -n "${WRITE_ALIGN_PROTEOME}" ]; then
        echo "Run createtsv: protein align result for iter $STEP"
        # shellcheck disable=SC2086
        "$MMSEQS" createtsv "${NEXTINPUT}" "${NEXTINPUT}" "${TMP_PATH}/aln_protein_${STEP}" "${RESULTS}_protein_align_${STEP}.tsv" ${THREADS_PAR} \
            || fail "createtsv protein align died"
    else
        # Glob attached to the quoted prefix on purpose; a detached `*` would
        # match everything in the current directory.
        rm -rf -- "${TMP_PATH:?}/aln_protein_${STEP}"*
    fi

    echo "Run concatdbs of aln_proteome for iter $STEP"
    # Merge this round's proteome alignments into the master aln_proteome.
    # NOTE(review): output DB equals the first input DB — confirm concatdbs
    # supports writing in place.
    "$MMSEQS" concatdbs "${TMP_PATH}/aln_proteome" "${TMP_PATH}/aln_proteome_${STEP}" "${TMP_PATH}/aln_proteome" --preserve-keys 1 \
        || fail "concatdbs died"

    # Recompute which sources are still unassigned.
    updateSourceFiltered

    # aln_proteome only grows, so an unchanged count means the next round
    # would see the same sources again and the loop would never terminate.
    STILL_REMAINING=$(( $(wc -l < "$SOURCEtoNEXTITERATION") ))
    if [ "$STILL_REMAINING" -ge "$REMAINING" ]; then
        fail "cascade made no progress at step $STEP ($STILL_REMAINING sources left)"
    fi

    # sublookup_${STEP} must be kept: it is the lookup list of the next round.
    SUBDB_LOOKUP_LIST="${TMP_PATH}/sublookup_${STEP}"
    STEP=$(( STEP + 1 ))
done
121+
# Export the accumulated proteome alignment DB as the final TSV, unless it
# already exists.
echo "Run createtsv: proteome alignment result"
if notExists "${RESULTS}_proteome_cluster.tsv"; then
    # shellcheck disable=SC2086
    "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/aln_proteome" "${RESULTS}_proteome_cluster.tsv" ${THREADS_PAR} \
        || fail "createtsv proteome cluster died"
fi

# Optional cleanup of the intermediate databases and of this script's copy.
# NOTE(review): only the first-round databases are listed here; the
# per-iteration ones (input_N, clu_N, aln_proteome_N, cluster_count_N,
# sublookup_N) and cluster_count/source_filtered are left behind — confirm
# whether that is intended.
if [ -n "${REMOVE_TMP}" ]; then
    for TMP_DB in input input_h clu aln aln_protein aln_proteome; do
        # shellcheck disable=SC2086
        "$MMSEQS" rmdb "${TMP_PATH}/${TMP_DB}" ${VERBOSITY_PAR}
    done
    # `:?` aborts rather than deleting from "/" should TMP_PATH be empty.
    rm -rf -- "${TMP_PATH:?}/clu_tmp"
    rm -f -- "${TMP_PATH:?}/easyproteomecluster.sh"
fi
0 commit comments