Skip to content

Commit 36eb84c

Browse files
committed
change gap penalties, add citation
1 parent bed671a commit 36eb84c

File tree

7 files changed

+33
-17
lines changed

7 files changed

+33
-17
lines changed

src/commons/FoldmasonParameters.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,8 @@ FoldmasonParameters::FoldmasonParameters() :
1313
PARAM_PAIR_THRESHOLD(PARAM_PAIR_THRESHOLD_ID, "--pair-threshold", "LDDT pair threshold", "% of pair subalignments with LDDT information [0.0,1.0]",typeid(float), (void *) &pairThreshold, "^0(\\.[0-9]+)?|1(\\.0+)?$"),
1414
PARAM_REPORT_COMMAND(PARAM_REPORT_COMMAND_ID, "--report-command", "", "", typeid(std::string), (void *) &reportCommand, ""),
1515
PARAM_REPORT_PATHS(PARAM_REPORT_PATHS_ID, "--report-paths", "", "", typeid(bool), (void *) &reportPaths, ""),
16-
PARAM_REFINE_SEED(PARAM_REFINE_SEED_ID, "--refine-seed", "Random number generator seed", "Random number generator seed", typeid(int), (void *) &refinementSeed, "^([-]?[0-9]*)$")
16+
PARAM_REFINE_SEED(PARAM_REFINE_SEED_ID, "--refine-seed", "Random number generator seed", "Random number generator seed", typeid(int), (void *) &refinementSeed, "^([-]?[0-9]*)$"),
17+
PARAM_ONLY_SCORING_COLS(PARAM_ONLY_SCORING_COLS_ID, "--only-scoring-cols", "Normalise LDDT by no. scoring columns", "Normalise LDDT by no. scoring columns", typeid(bool), (void *) &onlyScoringCols, "")
1718
{
1819
// structuremsa
1920
structuremsa.push_back(&PARAM_WG);
@@ -40,6 +41,7 @@ FoldmasonParameters::FoldmasonParameters() :
4041
structuremsa.push_back(&PARAM_NO_COMP_BIAS_CORR);
4142
structuremsa.push_back(&PARAM_V);
4243
structuremsa.push_back(&PARAM_REFINE_SEED);
44+
structuremsa.push_back(&PARAM_ONLY_SCORING_COLS);
4345
4446
structuremsacluster = combineList(structuremsacluster, structuremsa);
4547
@@ -50,6 +52,7 @@ FoldmasonParameters::FoldmasonParameters() :
5052
msa2lddt.push_back(&PARAM_V);
5153
msa2lddt.push_back(&PARAM_REPORT_COMMAND);
5254
msa2lddt.push_back(&PARAM_REPORT_PATHS);
55+
msa2lddt.push_back(&PARAM_ONLY_SCORING_COLS);
5356
5457
// refinemsa
5558
refinemsa = combineList(refinemsa, structuremsa);
@@ -78,8 +81,9 @@ FoldmasonParameters::FoldmasonParameters() :
7881
filterMsa = 0;
7982
compBiasCorrection = 0;
8083
refinementSeed = -1;
81-
gapOpen = 25;
82-
gapExtend = 3;
84+
gapOpen = 20;
85+
gapExtend = 2;
86+
onlyScoringCols = false;
8387
84-
citations.emplace(CITATION_FOLDMASON, " << TODO >> ");
88+
citations.emplace(CITATION_FOLDMASON, "Gilchrist, C. L. M., Mirdita, M. & Steinegger, M. Multiple Protein Structure Alignment at Scale with FoldMason. bioRxiv, doi.org/10.1101/2024.08.01.606130 (2024)");
8589
}

src/commons/FoldmasonParameters.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ class FoldmasonParameters : public LocalParameters {
4141
PARAMETER(PARAM_REPORT_COMMAND)
4242
PARAMETER(PARAM_REPORT_PATHS)
4343
PARAMETER(PARAM_REFINE_SEED)
44+
PARAMETER(PARAM_ONLY_SCORING_COLS)
4445

4546
std::string guideTree;
4647
bool recomputeScores;
@@ -52,5 +53,6 @@ class FoldmasonParameters : public LocalParameters {
5253
bool reportPaths;
5354
float pairThreshold;
5455
int refinementSeed;
56+
bool onlyScoringCols;
5557
};
5658
#endif

src/strucclustutils/msa2lddt.cpp

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -245,7 +245,8 @@ std::tuple<std::vector<float>, std::vector<int>, float, int> calculate_lddt(
245245
std::vector<size_t> &subset,
246246
std::vector<size_t> &keys,
247247
DBReader<unsigned int> * seqDbrCA,
248-
float pairThreshold
248+
float pairThreshold,
249+
bool onlyScoringCols
249250
) {
250251
// mapping:
251252
// cigar vectors can be any size
@@ -375,6 +376,7 @@ std::tuple<std::vector<float>, std::vector<int>, float, int> calculate_lddt(
375376
std::vector<int> colCounts = countColumns(cigars, subset, alnLength);
376377
float scaledSum = 0.0;
377378
int numCols = 0;
379+
int numColsScoring = 0;
378380
for (size_t i = 0; i < perColumnCount.size(); i++) {
379381
// float pairSupport = perColumnCount[i] / static_cast<float>(numPairs);
380382
float residuesInColumn = colCounts[i];
@@ -392,13 +394,16 @@ std::tuple<std::vector<float>, std::vector<int>, float, int> calculate_lddt(
392394
//perColumnScore[i] /= perColumnCount[i]; // get mean LDDT for this column
393395
perColumnScore[i] = (2 * score * occupancy) / (score + occupancy);
394396
scaledSum += perColumnScore[i];
397+
numColsScoring++;
395398
} else {
396399
perColumnScore[i] = 0.0;
397400
}
398401
// scaledSum += colscore;
399402
numCols++;
400403
}
401-
float lddtScore = (numCols > 0) ? scaledSum / static_cast<float>(numCols) : 0.0;
404+
float lddtScore = onlyScoringCols
405+
? ((numColsScoring > 0) ? scaledSum / static_cast<float>(numColsScoring) : 0.0f)
406+
: ((numCols > 0) ? scaledSum / static_cast<float>(numCols) : 0.0f);
402407
return std::make_tuple(perColumnScore, perColumnCount, lddtScore, numCols);
403408
}
404409

@@ -464,7 +469,8 @@ float getLDDTScore(
464469
DBReader<unsigned int> &seqDbr3Di,
465470
DBReader<unsigned int> &seqDbrCA,
466471
std::string msa,
467-
float pairThreshold
472+
float pairThreshold,
473+
bool onlyScoringCols
468474
) {
469475
KSeqWrapper* kseq = new KSeqBuffer(msa.c_str(), msa.length());
470476
int alnLength = 0;
@@ -479,7 +485,7 @@ float getLDDTScore(
479485
std::vector<int> perColumnCount;
480486
float lddtScore;
481487
int numCols;
482-
std::tie(perColumnScore, perColumnCount, lddtScore, numCols) = calculate_lddt(cigars_aa, inds, inds, &seqDbrCA, pairThreshold);
488+
std::tie(perColumnScore, perColumnCount, lddtScore, numCols) = calculate_lddt(cigars_aa, inds, inds, &seqDbrCA, pairThreshold, onlyScoringCols);
483489

484490
return lddtScore;
485491
}
@@ -531,7 +537,7 @@ int msa2lddt(int argc, const char **argv, const Command& command, int makeReport
531537
std::iota(subset.begin(), subset.end(), 0);
532538

533539
if (caExist) {
534-
std::tie(perColumnScore, perColumnCount, lddtScore, numCols) = calculate_lddt(cigars_aa, subset, indices, seqDbrCA, par.pairThreshold);
540+
std::tie(perColumnScore, perColumnCount, lddtScore, numCols) = calculate_lddt(cigars_aa, subset, indices, seqDbrCA, par.pairThreshold, par.onlyScoringCols);
535541
std::string scores;
536542
for (float score : perColumnScore) {
537543
if (scores.length() > 0) scores += ",";

src/strucclustutils/msa2lddt.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@ std::tuple<std::vector<float>, std::vector<int>, float, int> calculate_lddt(
2424
std::vector<size_t> &subset,
2525
std::vector<size_t> &keys,
2626
DBReader<unsigned int> * seqDbrCA,
27-
float pairThreshold
27+
float pairThreshold,
28+
bool scoringColsOnly
2829
);
2930

3031
double calculate_lddt_pair(
@@ -49,7 +50,8 @@ float getLDDTScore(
4950
DBReader<unsigned int> &seqDbr3Di,
5051
DBReader<unsigned int> &seqDbrCA,
5152
std::string msa,
52-
float pairThreshold
53+
float pairThreshold,
54+
bool scoringColsOnly
5355
);
5456

5557
#endif

src/strucclustutils/refinemsa.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -259,7 +259,8 @@ void refineMany(
259259
std::string qid,
260260
float pairThreshold,
261261
std::vector<size_t> indices,
262-
int seed
262+
int seed,
263+
bool onlyScoringCols
263264
) {
264265
std::cout << "Running " << iterations << " refinement iterations\n";
265266

@@ -268,7 +269,7 @@ void refineMany(
268269
subset[i] = i;
269270
}
270271

271-
float prevLDDT = std::get<2>(calculate_lddt(cigars_aa, subset, indices, seqDbrCA, pairThreshold));
272+
float prevLDDT = std::get<2>(calculate_lddt(cigars_aa, subset, indices, seqDbrCA, pairThreshold, onlyScoringCols));
272273
float initLDDT = prevLDDT;
273274
std::cout << "Initial LDDT: " << prevLDDT << '\n';
274275

@@ -304,7 +305,7 @@ void refineMany(
304305
sequences_aa, sequences_ss,
305306
rng
306307
);
307-
float lddtScore = std::get<2>(calculate_lddt(cigars_new_aa, subset, indices, seqDbrCA, pairThreshold));
308+
float lddtScore = std::get<2>(calculate_lddt(cigars_new_aa, subset, indices, seqDbrCA, pairThreshold, onlyScoringCols));
308309
// std::cout << std::fixed << std::setprecision(4) << "New LDDT: " << lddtScore << '\t' << "(" << i + 1 << ")\n";
309310
// for (std::vector<Instruction> &ins : cigars_new_aa) {
310311
// std::cout << expand(ins) << '\n';
@@ -407,7 +408,7 @@ int refinemsa(int argc, const char **argv, const Command& command) {
407408
structureSmithWaterman, par.refineIters, par.compBiasCorrection, par.wg, par.filterMaxSeqId,
408409
par.qsc, par.Ndiff, par.covMSAThr,
409410
par.filterMinEnable, par.filterMsa, par.gapExtend.values.aminoacid(), par.gapOpen.values.aminoacid(),
410-
par.maxSeqLen, par.qid, par.pairThreshold, indices, par.refinementSeed
411+
par.maxSeqLen, par.qid, par.pairThreshold, indices, par.refinementSeed, par.onlyScoringCols
411412
);
412413

413414
// Write final MSA to file

src/strucclustutils/refinemsa.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,8 @@ void refineMany(
3030
std::string qid,
3131
float pairThreshold,
3232
std::vector<size_t> indices,
33-
int seed
33+
int seed,
34+
bool onlyScoringCols
3435
);
3536
void deleteGapCols(std::vector<std::string> &sequences);
3637
void buildSubMSA(std::vector<std::string> &headers, std::vector<std::string> &sequences, std::string &subMSA);

src/strucclustutils/structuremsa.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1642,7 +1642,7 @@ int structuremsa(int argc, const char **argv, const Command& command, bool preCl
16421642
par.refineIters, par.compBiasCorrection, par.wg, par.filterMaxSeqId, par.qsc,
16431643
par.Ndiff, par.covMSAThr, par.filterMinEnable, par.filterMsa, par.gapExtend.values.aminoacid(),
16441644
par.gapOpen.values.aminoacid(), par.maxSeqLen, par.qid, par.pairThreshold, msa.dbKeys,
1645-
par.refinementSeed
1645+
par.refinementSeed, par.onlyScoringCols
16461646
);
16471647
}
16481648
}

0 commit comments

Comments
 (0)