Skip to content

Commit 863ccff

Browse files
Gyuuul2milot-mirdita
authored andcommitted
Proteome Clustering Module
1 parent fed69f8 commit 863ccff

16 files changed

+1147
-4
lines changed

data/workflow/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ set(GENERATED_WORKFLOWS
33
workflow/easycluster.sh
44
workflow/easytaxonomy.sh
55
workflow/easyrbh.sh
6+
workflow/easyproteomecluster.sh
67
workflow/blastp.sh
78
workflow/blastpgp.sh
89
workflow/map.sh
Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,144 @@
1+
#!/bin/sh -e
2+
fail() {
3+
echo "Error: $1"
4+
exit 1
5+
}
6+
7+
notExists() {
8+
[ ! -f "$1" ]
9+
}
10+
11+
12+
if notExists "${TMP_PATH}/input.dbtype"; then
13+
# shellcheck disable=SC2086
14+
"$MMSEQS" createdb "$@" "${TMP_PATH}/input" ${CREATEDB_PAR} \
15+
|| fail "query createdb died"
16+
fi
17+
18+
if notExists "${TMP_PATH}/clu.dbtype"; then
19+
# shellcheck disable=SC2086
20+
"$MMSEQS" "${CLUSTER_MODULE}" "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/clu_tmp" ${CLUSTER_PAR} \
21+
|| fail "linclust died"
22+
fi
23+
24+
if notExists "${RESULTS}_protein_cluster.tsv"; then
25+
# shellcheck disable=SC2086
26+
"$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/clu" "${RESULTS}_protein_cluster.tsv" ${THREADS_PAR} \
27+
|| fail "createtsv protein cluster died"
28+
fi
29+
30+
if notExists "${TMP_PATH}/aln_proteome.dbtype"; then
31+
# shellcheck disable=SC2086
32+
"$MMSEQS" proteomecluster "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/aln_proteome" "${TMP_PATH}/cluster_count" "${TMP_PATH}/aln_protein" ${PROTEOMECLUSTER_PAR} \
33+
|| fail "proteomecluster died"
34+
fi
35+
36+
if notExists "${RESULTS}_cluster_count.tsv"; then
37+
# shellcheck disable=SC2086
38+
"$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/cluster_count" "${RESULTS}_cluster_count.tsv" ${THREADS_PAR} \
39+
|| fail "createtsv proteome cluster count report died"
40+
fi
41+
42+
if notExists "${RESULTS}_protein_align.tsv" && [ -n "${WRITE_ALIGN_PROTEOME}" ]; then
43+
# shellcheck disable=SC2086
44+
"$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/aln_protein" "${RESULTS}_protein_align.tsv" ${THREADS_PAR} \
45+
|| fail "createtsv protein align died"
46+
else
47+
rm -rf "${TMP_PATH}/aln_protein"*
48+
fi
49+
# cascade
50+
awk 'NR==FNR { sub(/^\x00/, "", $1); a[$1]; next } !($1 in a)' "${TMP_PATH}/aln_proteome" "${TMP_PATH}/input.source" > "${TMP_PATH}/source_filtered"
51+
SOURCEtoNEXTITERATION="${TMP_PATH}/source_filtered"
52+
STEP=2
53+
54+
SUBDB_LOOKUP_LIST="${TMP_PATH}/input.lookup"
55+
56+
# Corrected while loop condition with proper spacing and quoting
57+
while [ -s "$SOURCEtoNEXTITERATION" ]; do
58+
echo "Step $STEP: $(wc -l < "$SOURCEtoNEXTITERATION") sources left"
59+
# Make "sublookup_STEP" from lines in input.lookup whose 3rd field is in the set from source_filtered
60+
awk 'NR==FNR {sources[$1]; next} $3 in sources' "$SOURCEtoNEXTITERATION" "${SUBDB_LOOKUP_LIST}" > "${TMP_PATH}/sublookup_${STEP}"
61+
62+
# Create a smaller DB from sublookup
63+
# shellcheck disable=SC2086
64+
"$MMSEQS" createsubdb "${TMP_PATH}/sublookup_${STEP}" "${TMP_PATH}/input" "${TMP_PATH}/input_${STEP}" --subdb-mode 1
65+
NEXTINPUT="${TMP_PATH}/input_${STEP}"
66+
67+
# Run linclust on the newly created sub-DB
68+
echo "Run linclust for iter $STEP"
69+
if notExists "${TMP_PATH}/clu_${STEP}.dbtype"; then
70+
# shellcheck disable=SC2086
71+
"$MMSEQS" "${CLUSTER_MODULE}" "${NEXTINPUT}" "${TMP_PATH}/clu_${STEP}" "${TMP_PATH}/clu_tmp_${STEP}" ${CLUSTER_PAR} \
72+
|| fail "linclust died"
73+
fi
74+
75+
echo "Run createtsv: protein clust result for iter $STEP"
76+
if notExists "${RESULTS}_protein_cluster_${STEP}.tsv"; then
77+
# shellcheck disable=SC2086
78+
"$MMSEQS" createtsv "${NEXTINPUT}" "${NEXTINPUT}" "${TMP_PATH}/clu_${STEP}" "${RESULTS}_protein_cluster_${STEP}.tsv" ${THREADS_PAR} \
79+
|| fail "createtsv protein cluster died"
80+
fi
81+
82+
if [ -n "$REMOVE_TMP" ]; then
83+
# shellcheck disable=SC2086
84+
rm -rf "${TMP_PATH}/clu_tmp_${STEP}"
85+
fi
86+
87+
echo "Run ProteomeCluster for iter $STEP"
88+
# Run proteomecluster on the newly created sub-DB
89+
if notExists "${TMP_PATH}/aln_proteome_${STEP}.dbtype"; then
90+
# shellcheck disable=SC2086
91+
"$MMSEQS" proteomecluster "${NEXTINPUT}" "${TMP_PATH}/clu_${STEP}" "${TMP_PATH}/aln_proteome_${STEP}" "${TMP_PATH}/cluster_count_${STEP}" "${TMP_PATH}/aln_protein_${STEP}" ${PROTEOMECLUSTER_PAR} \
92+
|| fail "proteomecluster died"
93+
fi
94+
95+
echo "Run createtsv: clustercount report for iter $STEP"
96+
if notExists "${RESULTS}_cluster_count_${STEP}.tsv"; then
97+
# shellcheck disable=SC2086
98+
"$MMSEQS" createtsv "${NEXTINPUT}" "${TMP_PATH}/cluster_count_${STEP}" "${RESULTS}_cluster_count_${STEP}.tsv" ${THREADS_PAR} \
99+
|| fail "createtsv proteome cluster count report died"
100+
fi
101+
102+
if notExists "${RESULTS}_protein_align_${STEP}.tsv" && [ -n "${WRITE_ALIGN_PROTEOME}" ]; then
103+
echo "Run createtsv: protein align result for iter $STEP"
104+
# shellcheck disable=SC2086
105+
"$MMSEQS" createtsv "${NEXTINPUT}" "${NEXTINPUT}" "${TMP_PATH}/aln_protein_${STEP}" "${RESULTS}_protein_align_${STEP}.tsv" ${THREADS_PAR} \
106+
|| fail "createtsv protein align died"
107+
else
108+
rm -rf "${TMP_PATH}/aln_protein_${STEP}"*
109+
fi
110+
111+
echo "Run concatdbs of aln_proteome for iter $STEP"
112+
# Concatenate new proteome alignments into the master aln_proteome
113+
"$MMSEQS" concatdbs "${TMP_PATH}/aln_proteome" "${TMP_PATH}/aln_proteome_${STEP}" "${TMP_PATH}/aln_proteome" --preserve-keys 1
114+
115+
# Repeat the AWK-based filtering to update source_filtered
116+
awk 'NR==FNR { sub(/^\x00/, "", $1); a[$1]; next } !($1 in a)' "${TMP_PATH}/aln_proteome" "${TMP_PATH}/input.source" > "${TMP_PATH}/source_filtered"
117+
SUBDB_LOOKUP_LIST="${TMP_PATH}/sublookup_${STEP}"
118+
# rm -f "${TMP_PATH}/sublookup_${STEP}"
119+
STEP=$((STEP + 1))
120+
done
121+
122+
echo "Run createtsv: proteome alignment result"
123+
if notExists "${RESULTS}_proteome_cluster.tsv"; then
124+
# shellcheck disable=SC2086
125+
"$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/aln_proteome" "${RESULTS}_proteome_cluster.tsv" ${THREADS_PAR} \
126+
|| fail "createtsv proteome cluster died"
127+
fi
128+
129+
if [ -n "${REMOVE_TMP}" ]; then
130+
# shellcheck disable=SC2086
131+
"$MMSEQS" rmdb "${TMP_PATH}/input" ${VERBOSITY_PAR}
132+
# shellcheck disable=SC2086
133+
"$MMSEQS" rmdb "${TMP_PATH}/input_h" ${VERBOSITY_PAR}
134+
# shellcheck disable=SC2086
135+
"$MMSEQS" rmdb "${TMP_PATH}/clu" ${VERBOSITY_PAR}
136+
# shellcheck disable=SC2086
137+
"$MMSEQS" rmdb "${TMP_PATH}/aln" ${VERBOSITY_PAR}
138+
# shellcheck disable=SC2086
139+
"$MMSEQS" rmdb "${TMP_PATH}/aln_protein" ${VERBOSITY_PAR}
140+
# shellcheck disable=SC2086
141+
"$MMSEQS" rmdb "${TMP_PATH}/aln_proteome" ${VERBOSITY_PAR}
142+
rm -rf "${TMP_PATH}/clu_tmp"
143+
rm -f "${TMP_PATH}/easyproteomecluster.sh"
144+
fi

src/CommandDeclarations.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ extern int easyrbh(int argc, const char **argv, const Command& command);
4747
extern int easylinclust(int argc, const char **argv, const Command& command);
4848
extern int easysearch(int argc, const char **argv, const Command& command);
4949
extern int easylinsearch(int argc, const char **argv, const Command& command);
50+
extern int easyproteomecluster(int argc, const char **argv, const Command& command);
5051
extern int tsv2exprofiledb(int argc, const char **argv, const Command& command);
5152
extern int enrich(int argc, const char **argv, const Command& command);
5253
extern int expandaln(int argc, const char **argv, const Command& command);
@@ -97,6 +98,7 @@ extern int profile2neff(int argc, const char **argv, const Command& command);
9798
extern int profile2consensus(int argc, const char **argv, const Command& command);
9899
extern int profile2repseq(int argc, const char **argv, const Command& command);
99100
extern int proteinaln2nucl(int argc, const char **argv, const Command& command);
101+
extern int proteomecluster(int argc, const char **argv, const Command& command);
100102
extern int rescorediagonal(int argc, const char **argv, const Command& command);
101103
extern int ungappedprefilter(int argc, const char **argv, const Command& command);
102104
extern int gappedprefilter(int argc, const char **argv, const Command& command);

src/MMseqsBase.cpp

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,23 @@ std::vector<Command> baseCommands = {
7979
CITATION_MMSEQS2|CITATION_LINCLUST, {{"fastaFile[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &DbValidator::flatfileAndStdin },
8080
{"clusterPrefix", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile },
8181
{"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
82+
{"easy-proteomecluster", easyproteomecluster, &par.easyproteomeclusterworkflow, COMMAND_EASY,
83+
"Cluster proteomes and identify reference proteomes",
84+
"mmseqs easy-proteomecluster examples/ProteomeDBPaths.tsv(examples/fastaFile1.fa...fastaFile1.fa) result tmp\n\n"
85+
"# ProteomeCluster output\n"
86+
"# - result_protein_cluster.tsv: Results of protein clustering (linclust/cluster)\n"
87+
"# - result_proteome_cluster.tsv: Results of proteome clustering including similarity to the reference proteome \n"
88+
"# - result_protein_align.tsv: Results of protein alignments\n"
89+
"# - result_cluster_count.tsv: Number of clusters containing proteins from each proteome (from protein clustering results)\n"
90+
"# Clustering multiple proteomes with linclust for protein clustering(cluster-module 0)\n"
91+
"mmseqs easy-proteomecluster examples/ProteomeDBPaths.tsv(examples/fastaFile1.fa...fastaFile1.fa) result tmp --proteome-similarity 0.9 -c 0.8 --cov-mode 1 --cluster-module 0 \n"
92+
"# Cascade clustering: iteratively cluster remaining proteomes with protein clustering while selecting reference proteomes\n"
93+
"mmseqs easy-proteomecluster examples/ProteomeDBPaths.tsv(examples/fastaFile1.fa...fastaFile1.fa) result tmp --proteome-similarity 0.9 -c 0.8 --cov-mode 1 --proteome-cascaded-clustering 1 \n",
94+
"Gyuri Kim <[email protected]> & Martin Steinegger <[email protected]>",
95+
"<i:fastaFile1[.gz|.bz2]> ... <i:fastaFileN[.gz|.bz2]> <o:clusterPrefix> <tmpDir>",
96+
CITATION_MMSEQS2, {{"fastaFile[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &DbValidator::flatfileAndStdin },
97+
{"outputReports", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile },
98+
{"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
8299
{"easy-taxonomy", easytaxonomy, &par.easytaxonomy, COMMAND_EASY,
83100
"Taxonomic classification",
84101
"# Assign taxonomic labels to FASTA sequences\n"
@@ -655,6 +672,16 @@ std::vector<Command> baseCommands = {
655672
CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
656673
{"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
657674
{"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
675+
{"proteomecluster", proteomecluster, &par.proteomecluster, COMMAND_CLUSTPROTEOME,
676+
"Cluster proteomes and identify reference proteomes",
677+
NULL,
678+
"Gyuri Kim <[email protected]> & Martin Steinegger <[email protected]>",
679+
"<i:sequenceDB> <i:clustresultDB> <o:proteomeAlignmentResultDB> <o:proteomeClusterCountReport> <o:proteinAlignmenResultDB> ",
680+
CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
681+
{"clustresultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
682+
{"proteomeAlignmentResultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
683+
{"proteomeClusterCountReport", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
684+
{"proteinAlignmenResultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
658685
{"transitivealign", transitivealign, &par.align, COMMAND_ALIGNMENT,
659686
"Transfer alignments via transitivity",
660687
//"Infer the alignment A->C via B, B being the center sequence and A,C each pairwise aligned against B",

src/commons/Command.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ CommandMode COMMAND_ALIGNMENT = 1U << 15;
3838
CommandMode COMMAND_CLUSTER = 1U << 16;
3939
CommandMode COMMAND_PROFILE = 1U << 17;
4040
CommandMode COMMAND_PROFILE_PROFILE = 1U << 18;
41+
CommandMode COMMAND_CLUSTPROTEOME = 1U << 19;
4142

4243
CommandMode COMMAND_EXPERT = 1U << 31;
4344

src/commons/DBReader.cpp

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,28 @@ template <typename T> bool DBReader<T>::open(int accessType){
155155
}
156156
lookupData.close();
157157
}
158+
159+
if (dataMode & USE_SOURCE || dataMode & USE_SOURCE_REV) {
160+
std::string sourceFilename = (std::string(dataFileName) + ".source");
161+
MemoryMapped sourceData(sourceFilename, MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
162+
if (sourceData.isValid() == false) {
163+
Debug(Debug::ERROR) << "Cannot open source file " << sourceFilename << "!\n";
164+
EXIT(EXIT_FAILURE);
165+
}
166+
char* sourceDataChar = (char *) sourceData.getData();
167+
size_t sourceDataSize = sourceData.size();
168+
sourceSize = Util::ompCountLines(sourceDataChar, sourceDataSize, threads);
169+
source = new(std::nothrow) SourceEntry[this->sourceSize];
170+
incrementMemory(sizeof(SourceEntry) * this->sourceSize);
171+
readSource(sourceDataChar, sourceDataSize, source);
172+
if (dataMode & USE_SOURCE) {
173+
SORT_PARALLEL(source, source + sourceSize, SourceEntry::compareById);
174+
} else {
175+
SORT_PARALLEL(source, source + sourceSize, SourceEntry::compareByFileName);
176+
}
177+
sourceData.close();
178+
}
179+
158180
bool isSortedById = false;
159181
if (externalData == false) {
160182
MemoryMapped indexData(indexFileName, MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
@@ -669,6 +691,11 @@ template <typename T> size_t DBReader<T>::getLookupSize() const {
669691
return lookupSize;
670692
}
671693

694+
template <typename T> size_t DBReader<T>::getSourceSize() const {
695+
checkClosed();
696+
return sourceSize;
697+
}
698+
672699
template <typename T> size_t DBReader<T>::getSize() const {
673700
checkClosed();
674701
return size;
@@ -758,6 +785,55 @@ void DBReader<std::string>::lookupEntryToBuffer(std::string& buffer, const Looku
758785
buffer.append(1, '\n');
759786
}
760787

788+
template <typename T> T DBReader<T>::getSourceKey(size_t id){
789+
if (id >= sourceSize){
790+
Debug(Debug::ERROR) << "Invalid database read for id=" << id << ", database index=" << dataFileName << ".source\n";
791+
Debug(Debug::ERROR) << "getSource id: local id (" << id << ") >= db size (" << sourceSize << ")\n";
792+
EXIT(EXIT_FAILURE);
793+
}
794+
return source[id].id;
795+
}
796+
797+
template <typename T> std::string DBReader<T>::getSourceFileName (size_t id){
798+
if (id >= sourceSize){
799+
Debug(Debug::ERROR) << "Invalid database read for id=" << id << ", database index=" << dataFileName << ".source\n";
800+
Debug(Debug::ERROR) << "getSourceFileName: local id (" << id << ") >= db size (" << sourceSize << ")\n";
801+
EXIT(EXIT_FAILURE);
802+
}
803+
return source[id].fileName;
804+
}
805+
806+
template <typename T> size_t DBReader<T>::getSourceIdByFileName(const std::string& fName) {
807+
if ((dataMode & USE_SOURCE_REV) == 0) {
808+
Debug(Debug::ERROR) << "DBReader for datafile=" << dataFileName << ".source was not opened with source mode\n";
809+
EXIT(EXIT_FAILURE);
810+
}
811+
SourceEntry val;
812+
val.fileName = fName;
813+
size_t id = std::upper_bound(source, source + sourceSize, val, SourceEntry::compareByFileNameOnly) - source;
814+
if (id >= sourceSize) {
815+
Debug(Debug::ERROR) << "Source file " << fName << " exceed source size\n";
816+
EXIT(EXIT_FAILURE);
817+
}
818+
if (source[id].fileName.compare(fName) != 0) {
819+
Debug(Debug::ERROR) << "Source file " << fName << " not found\n";
820+
EXIT(EXIT_FAILURE);
821+
}
822+
return (id < sourceSize && source[id].fileName.compare(fName) == 0) ? id : SIZE_MAX;
823+
}
824+
825+
template <typename T> void DBReader<T>::sortSourceById(){
826+
if (source != NULL) {
827+
SORT_PARALLEL(source, source + sourceSize, SourceEntry::compareById);
828+
}
829+
}
830+
831+
template <typename T> void DBReader<T>::sortSourceByFileName(){
832+
if (source != NULL) {
833+
SORT_PARALLEL(source, source + sourceSize, SourceEntry::compareByFileName);
834+
}
835+
}
836+
761837
template <typename T> size_t DBReader<T>::getId (T dbKey){
762838
size_t id = bsearch(index, size, dbKey);
763839
if (id2local != NULL) {
@@ -1081,6 +1157,34 @@ void DBReader<T>::readLookup(char *data, size_t dataSize, DBReader::LookupEntry
10811157
}
10821158
}
10831159

1160+
template <typename T>
1161+
void DBReader<T>::readSource(char *data, size_t dataSize, DBReader::SourceEntry *source) {
1162+
size_t i=0;
1163+
size_t currPos = 0;
1164+
char* sourceData = (char *) data;
1165+
const char * cols[3];
1166+
while (currPos < dataSize){
1167+
if (i >= this->sourceSize) {
1168+
Debug(Debug::ERROR) << "Corrupt memory, too many entries!\n";
1169+
EXIT(EXIT_FAILURE);
1170+
}
1171+
Util::getFieldsOfLine(sourceData, cols, 3);
1172+
source[i].id = Util::fast_atoi<size_t>(cols[0]);
1173+
std::string fileName = std::string(cols[1], (cols[2] - cols[1]));
1174+
size_t lastDotPosition = fileName.rfind('.');
1175+
1176+
if (lastDotPosition != std::string::npos) {
1177+
fileName = fileName.substr(0, lastDotPosition);
1178+
}
1179+
source[i].fileName = fileName;
1180+
sourceData = Util::skipLine(sourceData);
1181+
1182+
currPos = sourceData - (char *) data;
1183+
1184+
i++;
1185+
}
1186+
}
1187+
10841188
// TODO: Move to DbUtils?
10851189

10861190
template<typename T>

0 commit comments

Comments
 (0)