soedinglab
diff --git a/‎data/workflow/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎data/workflow/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎data/workflow/easyproteomecluster.sh‎
Lines changed: 144 additions & 0 deletions b/‎data/workflow/easyproteomecluster.sh‎
Lines changed: 144 additions & 0 deletions
diff --git a/‎src/CommandDeclarations.h‎
Lines changed: 2 additions & 0 deletions b/‎src/CommandDeclarations.h‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎src/MMseqsBase.cpp‎
Lines changed: 27 additions & 0 deletions b/‎src/MMseqsBase.cpp‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎src/commons/Command.h‎
Lines changed: 1 addition & 0 deletions b/‎src/commons/Command.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎src/commons/DBReader.cpp‎
Lines changed: 104 additions & 0 deletions b/‎src/commons/DBReader.cpp‎
Lines changed: 104 additions & 0 deletions
@@ -3,6 +3,7 @@ set(GENERATED_WORKFLOWS
         workflow/easycluster.sh
         workflow/easytaxonomy.sh
         workflow/easyrbh.sh
+        workflow/easyproteomecluster.sh
         workflow/blastp.sh
         workflow/blastpgp.sh
         workflow/map.sh
 
@@ -0,0 +1,144 @@
+#!/bin/sh -e
+# Print "Error: <message>" on standard output and terminate the script with exit status 1.
+#   $1: message text placed after the "Error: " prefix
+fail() {
+    echo "Error: $1"
+    exit 1
+}
+
+# Succeeds (exit status 0) when $1 is not an existing regular file, fails (1) otherwise.
+# Used as the "is this step still to be done" guard in front of every stage.
+notExists() {
+    if [ -f "$1" ]; then
+        return 1
+    fi
+    return 0
+}
+
+
+# Stage 1: build the sequence DB "input" from all script arguments ("$@"),
+# unless input.dbtype is already present in TMP_PATH.
+if notExists "${TMP_PATH}/input.dbtype"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" createdb "$@" "${TMP_PATH}/input" ${CREATEDB_PAR} \
+        || fail "query createdb died"
+fi
+
+# Stage 2: protein clustering with the module named by CLUSTER_MODULE; result DB is "clu".
+if notExists "${TMP_PATH}/clu.dbtype"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" "${CLUSTER_MODULE}" "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/clu_tmp" ${CLUSTER_PAR} \
+        || fail "linclust died"
+fi
+
+# Stage 3: export the protein clustering as <RESULTS>_protein_cluster.tsv.
+if notExists "${RESULTS}_protein_cluster.tsv"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/clu" "${RESULTS}_protein_cluster.tsv" ${THREADS_PAR} \
+            || fail "createtsv protein cluster died"
+fi
+
+# Stage 4: proteomecluster consumes "input" and "clu" and writes three DBs:
+# aln_proteome, cluster_count and aln_protein.
+if notExists "${TMP_PATH}/aln_proteome.dbtype"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" proteomecluster "${TMP_PATH}/input" "${TMP_PATH}/clu" "${TMP_PATH}/aln_proteome" "${TMP_PATH}/cluster_count" "${TMP_PATH}/aln_protein" ${PROTEOMECLUSTER_PAR} \
+        || fail "proteomecluster died"
+fi
+
+# Stage 5: export the cluster_count DB as <RESULTS>_cluster_count.tsv.
+if notExists "${RESULTS}_cluster_count.tsv"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/cluster_count" "${RESULTS}_cluster_count.tsv" ${THREADS_PAR} \
+            || fail "createtsv proteome cluster count report died"
+fi
+
+# Stage 6: the protein alignment TSV is written only when WRITE_ALIGN_PROTEOME is
+# non-empty and the TSV does not exist yet. In every other case (including an
+# already existing TSV) the else branch deletes the aln_protein files.
+if notExists "${RESULTS}_protein_align.tsv" && [ -n "${WRITE_ALIGN_PROTEOME}" ]; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/aln_protein" "${RESULTS}_protein_align.tsv" ${THREADS_PAR} \
+            || fail "createtsv protein align died"
+else
+    rm -rf "${TMP_PATH}/aln_protein"*
+fi
+# Cascade: keep clustering the proteomes that are still absent from aln_proteome.
+# The first awk pass reads the aln_proteome data file itself, strips a leading
+# NUL byte from field 1 and remembers that field; the second pass keeps only the
+# input.source lines whose field 1 was never seen.
+awk 'NR==FNR { sub(/^\x00/, "", $1); a[$1]; next } !($1 in a)' "${TMP_PATH}/aln_proteome" "${TMP_PATH}/input.source" > "${TMP_PATH}/source_filtered"
+SOURCEtoNEXTITERATION="${TMP_PATH}/source_filtered"
+STEP=2
+
+SUBDB_LOOKUP_LIST="${TMP_PATH}/input.lookup"
+
+# One iteration per cascade step, for as long as source_filtered is non-empty
+while [ -s "$SOURCEtoNEXTITERATION" ]; do
+    echo "Step $STEP: $(wc -l < "$SOURCEtoNEXTITERATION") sources left"
+    # Remember how many sources are open so a step without progress can be detected below
+    SOURCES_BEFORE="$(awk 'END { print NR }' "$SOURCEtoNEXTITERATION")"
+
+    # Make "sublookup_STEP" from lines in the current lookup list whose 3rd field is in the set from source_filtered
+    awk 'NR==FNR {sources[$1]; next} $3 in sources' "$SOURCEtoNEXTITERATION" "${SUBDB_LOOKUP_LIST}" > "${TMP_PATH}/sublookup_${STEP}"
+
+    # Create a smaller DB from sublookup
+    "$MMSEQS" createsubdb "${TMP_PATH}/sublookup_${STEP}" "${TMP_PATH}/input" "${TMP_PATH}/input_${STEP}" --subdb-mode 1 \
+        || fail "createsubdb died"
+    NEXTINPUT="${TMP_PATH}/input_${STEP}"
+
+    # Run the protein clustering module on the newly created sub-DB
+    echo "Run linclust for iter $STEP"
+    if notExists "${TMP_PATH}/clu_${STEP}.dbtype"; then
+        # shellcheck disable=SC2086
+        "$MMSEQS" "${CLUSTER_MODULE}" "${NEXTINPUT}" "${TMP_PATH}/clu_${STEP}" "${TMP_PATH}/clu_tmp_${STEP}" ${CLUSTER_PAR} \
+            || fail "linclust died"
+    fi
+
+    echo "Run createtsv: protein clust result for iter $STEP"
+    if notExists "${RESULTS}_protein_cluster_${STEP}.tsv"; then
+        # shellcheck disable=SC2086
+        "$MMSEQS" createtsv "${NEXTINPUT}" "${NEXTINPUT}" "${TMP_PATH}/clu_${STEP}" "${RESULTS}_protein_cluster_${STEP}.tsv" ${THREADS_PAR} \
+                || fail "createtsv protein cluster died"
+    fi
+
+    if [ -n "$REMOVE_TMP" ]; then
+        rm -rf "${TMP_PATH}/clu_tmp_${STEP}"
+    fi
+
+    echo "Run ProteomeCluster for iter $STEP"
+    # Run proteomecluster on the newly created sub-DB
+    if notExists "${TMP_PATH}/aln_proteome_${STEP}.dbtype"; then
+        # shellcheck disable=SC2086
+        "$MMSEQS" proteomecluster "${NEXTINPUT}" "${TMP_PATH}/clu_${STEP}" "${TMP_PATH}/aln_proteome_${STEP}" "${TMP_PATH}/cluster_count_${STEP}" "${TMP_PATH}/aln_protein_${STEP}" ${PROTEOMECLUSTER_PAR} \
+            || fail "proteomecluster died"
+    fi
+
+    echo "Run createtsv: clustercount report for iter $STEP"
+    if notExists "${RESULTS}_cluster_count_${STEP}.tsv"; then
+        # shellcheck disable=SC2086
+        "$MMSEQS" createtsv "${NEXTINPUT}" "${TMP_PATH}/cluster_count_${STEP}" "${RESULTS}_cluster_count_${STEP}.tsv" ${THREADS_PAR} \
+                || fail "createtsv proteome cluster count report died"
+    fi
+
+    # Same rule as in the first pass: export the alignments only on request,
+    # otherwise drop the per-step aln_protein files
+    if notExists "${RESULTS}_protein_align_${STEP}.tsv" && [ -n "${WRITE_ALIGN_PROTEOME}" ]; then
+        echo "Run createtsv: protein align result for iter $STEP"
+        # shellcheck disable=SC2086
+        "$MMSEQS" createtsv "${NEXTINPUT}" "${NEXTINPUT}" "${TMP_PATH}/aln_protein_${STEP}" "${RESULTS}_protein_align_${STEP}.tsv" ${THREADS_PAR} \
+                || fail "createtsv protein align died"
+    else
+        rm -rf "${TMP_PATH}/aln_protein_${STEP}"*
+    fi
+
+    echo "Run concatdbs of aln_proteome for iter $STEP"
+    # Concatenate new proteome alignments into the master aln_proteome
+    "$MMSEQS" concatdbs "${TMP_PATH}/aln_proteome" "${TMP_PATH}/aln_proteome_${STEP}" "${TMP_PATH}/aln_proteome" --preserve-keys 1 \
+        || fail "concatdbs died"
+
+    # Repeat the AWK-based filtering to update source_filtered
+    awk 'NR==FNR { sub(/^\x00/, "", $1); a[$1]; next } !($1 in a)' "${TMP_PATH}/aln_proteome" "${TMP_PATH}/input.source" > "${TMP_PATH}/source_filtered"
+
+    # A step that leaves the set of open sources unchanged would feed the next
+    # step the same input again, so stop instead of looping forever
+    SOURCES_AFTER="$(awk 'END { print NR }' "$SOURCEtoNEXTITERATION")"
+    if [ "$SOURCES_AFTER" -ge "$SOURCES_BEFORE" ]; then
+        fail "cascade step $STEP did not assign any proteome ($SOURCES_AFTER sources left)"
+    fi
+
+    # sublookup_STEP is the lookup list of the next step, so it must stay on disk here
+    SUBDB_LOOKUP_LIST="${TMP_PATH}/sublookup_${STEP}"
+    STEP=$((STEP + 1))
+done
+
+# Export the accumulated proteome alignments (first pass plus all cascade steps)
+echo "Run createtsv: proteome alignment result"
+if notExists "${RESULTS}_proteome_cluster.tsv"; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/aln_proteome" "${RESULTS}_proteome_cluster.tsv" ${THREADS_PAR} \
+            || fail "createtsv proteome cluster died"
+fi
+
+# Optional cleanup of everything this workflow wrote into TMP_PATH
+if [ -n "${REMOVE_TMP}" ]; then
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/input" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/input_h" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/clu" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/cluster_count" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/aln_protein" ${VERBOSITY_PAR}
+    # shellcheck disable=SC2086
+    "$MMSEQS" rmdb "${TMP_PATH}/aln_proteome" ${VERBOSITY_PAR}
+    rm -rf "${TMP_PATH}/clu_tmp"
+    rm -f "${TMP_PATH}/source_filtered"
+
+    # Cascade steps are numbered from 2 up to STEP-1 (STEP is advanced at the end
+    # of every completed iteration); remove the artifacts of each of them
+    ITER=2
+    while [ "${ITER}" -lt "${STEP:-2}" ]; do
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/input_${ITER}" ${VERBOSITY_PAR}
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/input_${ITER}_h" ${VERBOSITY_PAR}
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/clu_${ITER}" ${VERBOSITY_PAR}
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/cluster_count_${ITER}" ${VERBOSITY_PAR}
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/aln_protein_${ITER}" ${VERBOSITY_PAR}
+        # shellcheck disable=SC2086
+        "$MMSEQS" rmdb "${TMP_PATH}/aln_proteome_${ITER}" ${VERBOSITY_PAR}
+        rm -rf "${TMP_PATH}/clu_tmp_${ITER}"
+        rm -f "${TMP_PATH}/sublookup_${ITER}"
+        ITER=$((ITER + 1))
+    done
+
+    rm -f "${TMP_PATH}/easyproteomecluster.sh"
+fi
@@ -47,6 +47,7 @@ extern int easyrbh(int argc, const char **argv, const Command& command);
 extern int easylinclust(int argc, const char **argv, const Command& command);
 extern int easysearch(int argc, const char **argv, const Command& command);
 extern int easylinsearch(int argc, const char **argv, const Command& command);
+extern int easyproteomecluster(int argc, const char **argv, const Command& command);
 extern int tsv2exprofiledb(int argc, const char **argv, const Command& command);
 extern int enrich(int argc, const char **argv, const Command& command);
 extern int expandaln(int argc, const char **argv, const Command& command);
@@ -97,6 +98,7 @@ extern int profile2neff(int argc, const char **argv, const Command& command);
 extern int profile2consensus(int argc, const char **argv, const Command& command);
 extern int profile2repseq(int argc, const char **argv, const Command& command);
 extern int proteinaln2nucl(int argc, const char **argv, const Command& command);
+extern int proteomecluster(int argc, const char **argv, const Command& command);
 extern int rescorediagonal(int argc, const char **argv, const Command& command);
 extern int ungappedprefilter(int argc, const char **argv, const Command& command);
 extern int gappedprefilter(int argc, const char **argv, const Command& command);
 
@@ -79,6 +79,23 @@ std::vector<Command> baseCommands = {
                 CITATION_MMSEQS2|CITATION_LINCLUST, {{"fastaFile[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &DbValidator::flatfileAndStdin },
                                                             {"clusterPrefix", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile },
                                                             {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
+        // easy-proteomecluster: workflow entry taking FASTA input(s), an output prefix and a tmp directory.
+        {"easy-proteomecluster", easyproteomecluster, &par.easyproteomeclusterworkflow, COMMAND_EASY,
+                "Cluster proteomes and identify reference proteomes",
+                "mmseqs easy-proteomecluster examples/ProteomeDBPaths.tsv(examples/fastaFile1.fa...fastaFileN.fa) result tmp\n\n"
+                "# ProteomeCluster output\n"
+                "#  - result_protein_cluster.tsv:  Results of protein clustering (linclust/cluster)\n"
+                "#  - result_proteome_cluster.tsv: Results of proteome clustering including similarity to the reference proteome \n"
+                "#  - result_protein_align.tsv: Results of protein alignments\n"
+                "#  - result_cluster_count.tsv: Number of clusters containing proteins from each proteome (from protein clustering results)\n"
+                "# Clustering multiple proteomes with linclust for protein clustering(cluster-module 0)\n"
+                "mmseqs easy-proteomecluster examples/ProteomeDBPaths.tsv(examples/fastaFile1.fa...fastaFileN.fa) result tmp --proteome-similarity 0.9 -c 0.8 --cov-mode 1 --cluster-module 0 \n"
+                "# Cascade clustering: iteratively cluster remaining proteomes with protein clustering while selecting reference proteomes\n"
+                "mmseqs easy-proteomecluster examples/ProteomeDBPaths.tsv(examples/fastaFile1.fa...fastaFileN.fa) result tmp --proteome-similarity 0.9 -c 0.8 --cov-mode 1 --proteome-cascaded-clustering 1 \n",
+                "Gyuri Kim <[email protected]> & Martin Steinegger <[email protected]>",
+                "<i:fastaFile1[.gz|.bz2]> ... <i:fastaFileN[.gz|.bz2]> <o:clusterPrefix> <tmpDir>",
+                CITATION_MMSEQS2, {{"fastaFile[.gz|.bz2]", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA|DbType::VARIADIC, &DbValidator::flatfileAndStdin },
+                                        {"outputReports", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::flatfile },
+                                        {"tmpDir", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::directory }}},
         {"easy-taxonomy",        easytaxonomy,         &par.easytaxonomy,         COMMAND_EASY,
                 "Taxonomic classification",
                 "# Assign taxonomic labels to FASTA sequences\n"
@@ -655,6 +672,16 @@ std::vector<Command> baseCommands = {
                 CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
                                                            {"resultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
                                                            {"alignmentDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
+        // proteomecluster: two input DBs (sequences, protein clustering result) and three output DBs.
+        // NOTE(review): the cluster count report is validated with DbValidator::alignmentDb like the
+        // two alignment outputs - confirm that this is the intended DB type for a count report.
+        {"proteomecluster",             proteomecluster,             &par.proteomecluster,             COMMAND_CLUSTPROTEOME,
+                "Cluster proteomes and identify reference proteomes",
+                NULL,
+                "Gyuri Kim <[email protected]> & Martin Steinegger <[email protected]>",
+                "<i:sequenceDB> <i:clustresultDB> <o:proteomeAlignmentResultDB> <o:proteomeClusterCountReport> <o:proteinAlignmentResultDB> ",
+                CITATION_MMSEQS2, {{"sequenceDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::sequenceDb },
+                                                                {"clustresultDB", DbType::ACCESS_MODE_INPUT, DbType::NEED_DATA, &DbValidator::resultDb },
+                                                                {"proteomeAlignmentResultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
+                                                                {"proteomeClusterCountReport", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb },
+                                                                {"proteinAlignmentResultDB", DbType::ACCESS_MODE_OUTPUT, DbType::NEED_DATA, &DbValidator::alignmentDb }}},
         {"transitivealign",      transitivealign,      &par.align,                COMMAND_ALIGNMENT,
                 "Transfer alignments via transitivity",
                 //"Infer the alignment A->C via B, B being the center sequence and A,C each pairwise aligned against B",
 
@@ -38,6 +38,7 @@ CommandMode COMMAND_ALIGNMENT         = 1U << 15;
 CommandMode COMMAND_CLUSTER           = 1U << 16;
 CommandMode COMMAND_PROFILE           = 1U << 17;
 CommandMode COMMAND_PROFILE_PROFILE   = 1U << 18;
+// Command category bit (bit 19) used by the proteomecluster command entry.
+CommandMode COMMAND_CLUSTPROTEOME     = 1U << 19;
 
 CommandMode COMMAND_EXPERT            = 1U << 31;
 
 
@@ -155,6 +155,28 @@ template <typename T> bool DBReader<T>::open(int accessType){
         }
         lookupData.close();
     }
+
+    if (dataMode & USE_SOURCE || dataMode & USE_SOURCE_REV) {
+        std::string sourceFilename = (std::string(dataFileName) + ".source");
+        MemoryMapped sourceData(sourceFilename, MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
+        if (sourceData.isValid() == false) {
+            Debug(Debug::ERROR) << "Cannot open source file " << sourceFilename << "!\n";
+            EXIT(EXIT_FAILURE);
+        }
+        char* sourceDataChar = (char *) sourceData.getData();
+        size_t sourceDataSize = sourceData.size();
+        sourceSize = Util::ompCountLines(sourceDataChar, sourceDataSize, threads);
+        source = new(std::nothrow) SourceEntry[this->sourceSize];
+        incrementMemory(sizeof(SourceEntry) * this->sourceSize);
+        readSource(sourceDataChar, sourceDataSize, source);
+        if (dataMode & USE_SOURCE) {
+            SORT_PARALLEL(source, source + sourceSize, SourceEntry::compareById);
+        } else {
+            SORT_PARALLEL(source, source + sourceSize, SourceEntry::compareByFileName);
+        }
+        sourceData.close();
+    }
+
     bool isSortedById = false;
     if (externalData == false) {
         MemoryMapped indexData(indexFileName, MemoryMapped::WholeFile, MemoryMapped::SequentialScan);
@@ -669,6 +691,11 @@ template <typename T> size_t DBReader<T>::getLookupSize() const {
     return lookupSize;
 }
 
+// Returns sourceSize, the number of entries in the source table.
+// checkClosed() runs first (presumably it rejects use of a reader that is not open - confirm).
+template <typename T> size_t DBReader<T>::getSourceSize() const {
+    checkClosed();
+    return sourceSize;
+}
+
 template <typename T> size_t DBReader<T>::getSize() const {
     checkClosed();
     return size;
@@ -758,6 +785,55 @@ void DBReader<std::string>::lookupEntryToBuffer(std::string& buffer, const Looku
     buffer.append(1, '\n');
 }
 
+// Returns the id field of the source table entry at position `id`.
+// `id` is an index into the table in its current sort order; an index at or beyond
+// sourceSize is reported through Debug and ends in EXIT(EXIT_FAILURE).
+template <typename T> T DBReader<T>::getSourceKey(size_t id){
+    if (id >= sourceSize){
+        Debug(Debug::ERROR) << "Invalid database read for id=" << id << ", database index=" << dataFileName << ".source\n";
+        // Name the accessor in the diagnostic, matching getSourceFileName
+        Debug(Debug::ERROR) << "getSourceKey: local id (" << id << ") >= db size (" << sourceSize << ")\n";
+        EXIT(EXIT_FAILURE);
+    }
+    return source[id].id;
+}
+
+// Returns a copy of the file name stored in the source table entry at position `id`.
+// An index at or beyond sourceSize is reported through Debug and ends in EXIT(EXIT_FAILURE).
+template <typename T> std::string DBReader<T>::getSourceFileName (size_t id){
+    const bool inRange = (id < sourceSize);
+    if (inRange == false) {
+        Debug(Debug::ERROR) << "Invalid database read for id=" << id << ", database index=" << dataFileName << ".source\n";
+        Debug(Debug::ERROR) << "getSourceFileName: local id (" << id << ") >= db size (" << sourceSize << ")\n";
+        EXIT(EXIT_FAILURE);
+    }
+    const SourceEntry &entry = source[id];
+    return entry.fileName;
+}
+
+// Maps a source file name to its position in the source table.
+// The reader must have been opened with USE_SOURCE_REV; in that mode open() sorts the
+// table with SourceEntry::compareByFileName, which the binary search below relies on.
+// A missing name is fatal: both failure paths end in EXIT(EXIT_FAILURE), so the
+// function only ever returns a valid position.
+template <typename T> size_t DBReader<T>::getSourceIdByFileName(const std::string& fName) {
+    if ((dataMode & USE_SOURCE_REV) == 0) {
+        Debug(Debug::ERROR) << "DBReader for datafile=" << dataFileName << ".source was not opened with source mode\n";
+        EXIT(EXIT_FAILURE);
+    }
+    SourceEntry probe;
+    probe.fileName = fName;
+    // NOTE(review): std::upper_bound yields the first element ordered after the probe, so the
+    // equality check below can only succeed if compareByFileNameOnly also orders equal names
+    // (e.g. a "<=" comparison) - confirm, otherwise std::lower_bound is required here.
+    const size_t pos = std::upper_bound(source, source + sourceSize, probe, SourceEntry::compareByFileNameOnly) - source;
+    if (pos >= sourceSize) {
+        Debug(Debug::ERROR) << "Source file " << fName << " exceed source size\n";
+        EXIT(EXIT_FAILURE);
+    }
+    if (source[pos].fileName.compare(fName) != 0) {
+        Debug(Debug::ERROR) << "Source file " << fName << " not found\n";
+        EXIT(EXIT_FAILURE);
+    }
+    // Both conditions were enforced above, so no SIZE_MAX fallback is needed
+    return pos;
+}
+
+// Re-sorts the source table with SourceEntry::compareById.
+// Does nothing when no source table is allocated (source == NULL).
+template <typename T> void DBReader<T>::sortSourceById(){
+    if (source == NULL) {
+        return;
+    }
+    SORT_PARALLEL(source, source + sourceSize, SourceEntry::compareById);
+}
+
+// Re-sorts the source table with SourceEntry::compareByFileName.
+// Does nothing when no source table is allocated (source == NULL).
+template <typename T> void DBReader<T>::sortSourceByFileName(){
+    if (source == NULL) {
+        return;
+    }
+    SORT_PARALLEL(source, source + sourceSize, SourceEntry::compareByFileName);
+}
+
 template <typename T> size_t DBReader<T>::getId (T dbKey){
     size_t id = bsearch(index, size, dbKey);
     if (id2local != NULL) {
@@ -1081,6 +1157,34 @@ void DBReader<T>::readLookup(char *data, size_t dataSize, DBReader::LookupEntry
     }
 }
 
+// Parses the text of a ".source" file into `source`, one entry per line.
+//   data     - start of the source file contents
+//   dataSize - number of bytes available at `data`
+//   source   - output array with room for this->sourceSize entries
+// Column 1 of a line is stored as the numeric id. Column 2 is stored as the file name
+// after cutting everything from its last '.' on (names without a '.' are kept whole).
+// More lines than sourceSize, or a line with fewer than two columns, is fatal.
+template <typename T>
+void DBReader<T>::readSource(char *data, size_t dataSize, DBReader::SourceEntry *source) {
+    size_t entryCount = 0;
+    char *line = data;
+    const char *cols[3];
+    while (static_cast<size_t>(line - data) < dataSize) {
+        if (entryCount >= this->sourceSize) {
+            Debug(Debug::ERROR) << "Corrupt memory, too many entries!\n";
+            EXIT(EXIT_FAILURE);
+        }
+        // The name length below is cols[2] - cols[1], so both pointers must have been set.
+        // Assumes getFieldsOfLine returns the number of columns it found - TODO confirm against Util.
+        const size_t fieldCount = Util::getFieldsOfLine(line, cols, 3);
+        if (fieldCount < 2) {
+            Debug(Debug::ERROR) << "Invalid source file entry in line " << (entryCount + 1) << "!\n";
+            EXIT(EXIT_FAILURE);
+        }
+        source[entryCount].id = Util::fast_atoi<size_t>(cols[0]);
+
+        std::string fileName(cols[1], cols[2] - cols[1]);
+        const size_t lastDotPosition = fileName.rfind('.');
+        if (lastDotPosition != std::string::npos) {
+            fileName.erase(lastDotPosition);
+        }
+        source[entryCount].fileName = fileName;
+
+        line = Util::skipLine(line);
+        entryCount++;
+    }
+}
+
 // TODO: Move to DbUtils?
 
 template<typename T>