Skip to content

Commit befcb11

Browse files
Gyuuul2 authored and
milot-mirdita committed
Set ambiguous proteome parameter as hidden and add hidden report
1 parent 863ccff commit befcb11

File tree

8 files changed

+127
-29
lines changed

8 files changed

+127
-29
lines changed

data/workflow/easyproteomecluster.sh

Lines changed: 21 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -39,13 +39,26 @@ if notExists "${RESULTS}_cluster_count.tsv"; then
3939
|| fail "createtsv proteome cluster count report died"
4040
fi
4141

42+
if [ -z "${CASCADED_PROTEOME_CLUSTERING}" ] && notExists "${RESULTS}_proteome_cluster.tsv"; then
43+
# shellcheck disable=SC2086
44+
"$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/aln_proteome" "${RESULTS}_proteome_cluster.tsv" ${THREADS_PAR} \
45+
|| fail "createtsv proteome cluster died"
46+
fi
47+
48+
if [ -n "${PROTEOME_HIDDEN_REPORT}" ]; then
49+
# shellcheck disable=SC2086
50+
"$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/aln_proteome_productionReport" "${RESULTS}_proteome_cluster_production.tsv" ${THREADS_PAR} \
51+
|| fail "createtsv proteome cluster died"
52+
fi
53+
4254
if notExists "${RESULTS}_protein_align.tsv" && [ -n "${WRITE_ALIGN_PROTEOME}" ]; then
4355
# shellcheck disable=SC2086
4456
"$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/aln_protein" "${RESULTS}_protein_align.tsv" ${THREADS_PAR} \
4557
|| fail "createtsv protein align died"
4658
else
4759
rm -rf "${TMP_PATH}/aln_protein"*
4860
fi
61+
4962
# cascade
5063
awk 'NR==FNR { sub(/^\x00/, "", $1); a[$1]; next } !($1 in a)' "${TMP_PATH}/aln_proteome" "${TMP_PATH}/input.source" > "${TMP_PATH}/source_filtered"
5164
SOURCEtoNEXTITERATION="${TMP_PATH}/source_filtered"
@@ -54,7 +67,7 @@ STEP=2
5467
SUBDB_LOOKUP_LIST="${TMP_PATH}/input.lookup"
5568

5669
# Corrected while loop condition with proper spacing and quoting
57-
while [ -s "$SOURCEtoNEXTITERATION" ]; do
70+
while [ -s "$SOURCEtoNEXTITERATION" ] && [ -n "${CASCADED_PROTEOME_CLUSTERING}" ]; do
5871
echo "Step $STEP: $(wc -l < "$SOURCEtoNEXTITERATION") sources left"
5972
# Make "sublookup_STEP" from lines in input.lookup whose 3rd field is in the set from source_filtered
6073
awk 'NR==FNR {sources[$1]; next} $3 in sources' "$SOURCEtoNEXTITERATION" "${SUBDB_LOOKUP_LIST}" > "${TMP_PATH}/sublookup_${STEP}"
@@ -99,6 +112,12 @@ while [ -s "$SOURCEtoNEXTITERATION" ]; do
99112
|| fail "createtsv proteome cluster count report died"
100113
fi
101114

115+
if [ -n "${PROTEOME_HIDDEN_REPORT}" ]; then
116+
# shellcheck disable=SC2086
117+
"$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/aln_proteome_${STEP}_productionReport" "${RESULTS}_proteome_cluster_production_${STEP}.tsv" ${THREADS_PAR} \
118+
|| fail "createtsv proteome cluster died"
119+
fi
120+
102121
if notExists "${RESULTS}_protein_align_${STEP}.tsv" && [ -n "${WRITE_ALIGN_PROTEOME}" ]; then
103122
echo "Run createtsv: protein align result for iter $STEP"
104123
# shellcheck disable=SC2086
@@ -119,8 +138,7 @@ while [ -s "$SOURCEtoNEXTITERATION" ]; do
119138
STEP=$((STEP + 1))
120139
done
121140

122-
echo "Run createtsv: proteome alignment result"
123-
if notExists "${RESULTS}_proteome_cluster.tsv"; then
141+
if [ -n "${CASCADED_PROTEOME_CLUSTERING}" ]; then
124142
# shellcheck disable=SC2086
125143
"$MMSEQS" createtsv "${TMP_PATH}/input" "${TMP_PATH}/input" "${TMP_PATH}/aln_proteome" "${RESULTS}_proteome_cluster.tsv" ${THREADS_PAR} \
126144
|| fail "createtsv proteome cluster died"

src/commons/Parameters.cpp

Lines changed: 15 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -322,12 +322,16 @@ Parameters::Parameters():
322322
// touchdb
323323
PARAM_TOUCH_LOCK(PARAM_TOUCH_LOCK_ID, "--touch-lock", "Touch lock", "Lock touched database or database entries into memory. Process will not exit until killed.", typeid(bool), (void *) &touchLock, "", MMseqsParameter::COMMAND_EXPERT),
324324
// proteomecluster
325-
PARAM_PPS_WEIGHT_FILE(PARAM_PPS_WEIGHT_FILE_ID, "--ppsWeights", "PPS Weight file name", "Weights used for proteome cluster priorization", typeid(std::string), (void*) &ppsWeightFile, "",MMseqsParameter::COMMAND_EXPERT ),
326-
PARAM_WEIGHT_CLUSTER_COUNT(PARAM_WEIGHT_CLUSTER_COUNT_ID, "--weight-cluster-count", "Weight cluster count", "Weight of cluster count in clustering", typeid(float), (void *) &weightClusterCount, "^-?[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_EXPERT),
325+
PARAM_PPS_WEIGHT_FILE(PARAM_PPS_WEIGHT_FILE_ID, "--ppsWeights", "PPS Weight file name", "Weights used for proteome cluster priorization", typeid(std::string), (void*) &ppsWeightFile, "",MMseqsParameter::COMMAND_HIDDEN ),
326+
PARAM_WEIGHT_CLUSTER_COUNT(PARAM_WEIGHT_CLUSTER_COUNT_ID, "--weight-clustercount", "Weight cluster count", "Weight of cluster count in clustering", typeid(float), (void *) &weightClusterCount, "^-?[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_HIDDEN),
327327
PARAM_PROTEOME_SIMILARITY(PARAM_PROTEOME_SIMILARITY_ID, "--proteome-similarity", "Proteome similarity", "Proteome similarity threshold", typeid(float), (void *) &proteomeSimThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_EXPERT),
328328
PARAM_PROTEOME_RELATIVE_SIMILARITY(PARAM_PROTEOME_RELATIVE_SIMILARITY_ID, "--proteome-relative-similarity", "Proteome relative similarity", "Proteome relative similarity threshold normalized by proteome size", typeid(float), (void *) &proteomeRelativeSimThr, "^0(\\.[0-9]+)?|1(\\.0+)?$", MMseqsParameter::COMMAND_CLUSTPROTEOME),
329329
PARAM_PROTEOME_CASCADED_CLUSTERING(PARAM_PROTEOME_CASCADED_CLUSTERING_ID, "--proteome-cascaded-clustering", "Proteome cascaded clustering", "Cascaded clustering", typeid(bool), (void *) &proteomeCascadedClustering, "", MMseqsParameter::COMMAND_EXPERT),
330-
PARAM_INCLUDE_ALIGN_FILES(PARAM_INCLUDE_ALIGN_FILES_ID, "--include-align-files", "Include align files in proteomecluster", "Include align files", typeid(bool), (void *) &includeAlignFiles, "", MMseqsParameter::COMMAND_EXPERT),
330+
PARAM_INCLUDE_ALIGN_FILES(PARAM_INCLUDE_ALIGN_FILES_ID, "--include-align-files", "Include align files in proteomecluster", "Include align files", typeid(bool), (void *) &includeAlignFiles, "", MMseqsParameter::COMMAND_HIDDEN),
331+
PARAM_PROTEOME_WEIGHT_FILE(PARAM_PROTEOME_WEIGHT_FILE_ID, "--proteome-weights", "Proteome Weight file name", "Weights used for proteome priorization", typeid(std::string), (void*) &proteomeWeightFile, "",MMseqsParameter::COMMAND_EXPERT ),
332+
PARAM_PROTEOME_WEIGHT_CLUSTER_COUNT(PARAM_PROTEOME_WEIGHT_CLUSTER_COUNT_ID, "--proteome-weight-clustercount", "Weight cluster count in proteome clustering", "Weight of cluster count in proteome clustering", typeid(float), (void *) &proteomeWeightClusterCount, "^-?[0-9]*(\\.[0-9]+)?$", MMseqsParameter::COMMAND_EXPERT),
333+
PARAM_PROTEOME_INCLUDE_ALIGN_FILES(PARAM_PROTEOME_INCLUDE_ALIGN_FILES_ID, "--proteome-include-align-files", "Include align files in proteomecluster", "Include align files", typeid(bool), (void *) &proteomeIncludeAlignFiles, "", MMseqsParameter::COMMAND_EXPERT),
334+
PARAM_PROTEOME_HIDDEN_REPORT(PARAM_PROTEOME_HIDDEN_REPORT_ID, "--proteome-hidden-report", "Hidden report", "Hidden proteome alignment result against the reference proteome", typeid(bool), (void *) &proteomeHiddenReport, "", MMseqsParameter::COMMAND_HIDDEN),
331335
// for modules that should handle -h themselves
332336
PARAM_HELP(PARAM_HELP_ID, "-h", "Help", "Help", typeid(bool), (void *) &help, "", MMseqsParameter::COMMAND_HIDDEN),
333337
PARAM_HELP_LONG(PARAM_HELP_LONG_ID, "--help", "Help", "Help", typeid(bool), (void *) &help, "", MMseqsParameter::COMMAND_HIDDEN)
@@ -1344,6 +1348,10 @@ Parameters::Parameters():
13441348
proteomecluster.push_back(&PARAM_NO_COMP_BIAS_CORR_SCALE);
13451349
proteomecluster.push_back(&PARAM_WEIGHT_CLUSTER_COUNT);
13461350
proteomecluster.push_back(&PARAM_PPS_WEIGHT_FILE);
1351+
proteomecluster.push_back(&PARAM_PROTEOME_WEIGHT_FILE);
1352+
proteomecluster.push_back(&PARAM_PROTEOME_WEIGHT_CLUSTER_COUNT);
1353+
proteomecluster.push_back(&PARAM_PROTEOME_INCLUDE_ALIGN_FILES);
1354+
proteomecluster.push_back(&PARAM_PROTEOME_HIDDEN_REPORT);
13471355
proteomecluster.push_back(&PARAM_PROTEOME_SIMILARITY);
13481356
proteomecluster.push_back(&PARAM_PROTEOME_RELATIVE_SIMILARITY);
13491357
proteomecluster.push_back(&PARAM_PROTEOME_CASCADED_CLUSTERING);
@@ -2760,11 +2768,15 @@ void Parameters::setDefaults() {
27602768

27612769
// proteomecluster
27622770
ppsWeightFile = "";
2771+
proteomeWeightFile = "";
27632772
weightClusterCount = 0.0;
2773+
proteomeWeightClusterCount = 0.0;
27642774
proteomeSimThr = 0.9;
27652775
proteomeRelativeSimThr = 0.9;
27662776
proteomeCascadedClustering = 0;
27672777
includeAlignFiles = false;
2778+
proteomeIncludeAlignFiles = false;
2779+
proteomeHiddenReport = false;
27682780

27692781
// help
27702782
help = 0;

src/commons/Parameters.h

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -749,11 +749,15 @@ class Parameters {
749749

750750
// proteomecluster
751751
std::string ppsWeightFile;
752+
std::string proteomeWeightFile;
752753
float weightClusterCount;
754+
float proteomeWeightClusterCount;
753755
float proteomeSimThr;
754756
float proteomeRelativeSimThr;
755757
bool proteomeCascadedClustering;
756758
bool includeAlignFiles;
759+
bool proteomeIncludeAlignFiles;
760+
bool proteomeHiddenReport;
757761

758762
// for modules that should handle -h themselves
759763
bool help;
@@ -1135,6 +1139,10 @@ class Parameters {
11351139
PARAMETER(PARAM_PROTEOME_RELATIVE_SIMILARITY)
11361140
PARAMETER(PARAM_PROTEOME_CASCADED_CLUSTERING)
11371141
PARAMETER(PARAM_INCLUDE_ALIGN_FILES)
1142+
PARAMETER(PARAM_PROTEOME_WEIGHT_FILE)
1143+
PARAMETER(PARAM_PROTEOME_WEIGHT_CLUSTER_COUNT)
1144+
PARAMETER(PARAM_PROTEOME_INCLUDE_ALIGN_FILES)
1145+
PARAMETER(PARAM_PROTEOME_HIDDEN_REPORT)
11381146

11391147
// for modules that should handle -h themselves
11401148
PARAMETER(PARAM_HELP)

src/commons/Util.cpp

Lines changed: 15 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -439,8 +439,7 @@ int Util::omp_thread_count() {
439439
n += 1;
440440
return n;
441441
}
442-
443-
std::map<unsigned int, std::string> Util::readLookup(const std::string& file, const bool removeSplit) {
442+
std::map<unsigned int, std::string> Util::readLookup(const std::string& file, const unsigned char removeSplit) {
444443
std::map<unsigned int, std::string> mapping;
445444
if (file.length() > 0) {
446445
std::ifstream mappingStream(file);
@@ -456,9 +455,20 @@ std::map<unsigned int, std::string> Util::readLookup(const std::string& file, co
456455

457456
std::string& name = split[1];
458457

459-
size_t pos;
460-
if (removeSplit && (pos = name.find_last_of('.')) != std::string::npos) {
461-
name = name.substr(0, pos);
458+
switch (removeSplit) {
459+
case 1: { // Underscore
460+
size_t pos = name.find_last_of('_');
461+
if (pos != std::string::npos) name = name.substr(0, pos);
462+
break;
463+
}
464+
case 2: { // Dot
465+
size_t pos = name.find_last_of('.');
466+
if (pos != std::string::npos) name = name.substr(0, pos);
467+
break;
468+
}
469+
case 0: // None
470+
default:
471+
break;
462472
}
463473

464474
mapping.emplace(id, name);
@@ -468,7 +478,6 @@ std::map<unsigned int, std::string> Util::readLookup(const std::string& file, co
468478
return mapping;
469479
}
470480

471-
472481
std::string Util::removeWhiteSpace(std::string in) {
473482
in.erase(std::remove_if(in.begin(), in.end(), isspace), in.end());
474483
return in;

src/commons/Util.h

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -356,7 +356,7 @@ class Util {
356356
static std::string removeWhiteSpace(std::string in);
357357

358358
static std::map<unsigned int, std::string> readLookup(const std::string& lookupFile,
359-
const bool removeSplit = false);
359+
const unsigned char removeSplit = 0);
360360

361361
static bool canBeCovered(const float covThr, const int covMode, float queryLength, float targetLength);
362362

src/util/createtsv.cpp

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -65,10 +65,10 @@ int createtsv(int argc, const char **argv, const Command &command) {
6565
if (extended & Parameters::DBTYPE_EXTENDED_SET) {
6666
needSET = true;
6767
if (hasTargetDB) {
68-
qSetToSource = Util::readLookup((par.db1 + ".source"), true);
69-
tSetToSource = Util::readLookup((par.db2 + ".source"), true);
68+
qSetToSource = Util::readLookup((par.db1 + ".source"), 2);
69+
tSetToSource = Util::readLookup((par.db2 + ".source"), 2);
7070
} else {
71-
qSetToSource = Util::readLookup((par.db1 + ".source"), true);
71+
qSetToSource = Util::readLookup((par.db1 + ".source"), 2);
7272
}
7373
}
7474

0 commit comments

Comments
 (0)