Skip to content

Commit 1cda777

Browse files
authored
Merge pull request #11 from HS6986/feature/HS6986/extend-MixtureFinder
Extend MixtureFinder to codon, binary, multistate, (and amino acid) data
2 parents df4c9ee + 1e9303f commit 1cda777

File tree

4 files changed

+53
-14
lines changed

4 files changed

+53
-14
lines changed

main/phylotesting.cpp

Lines changed: 39 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -6248,6 +6248,16 @@ void addModel(string model_str, string& new_model_str, string new_subst) {
62486248
}
62496249
}
62506250

6251+
// initialise model frequency set in MixtureFinder for different sequence types
6252+
char* initFreqSet(SeqType seq_type) {
6253+
switch (seq_type) {
6254+
case SEQ_CODON: return ",F1X4,F3X4";
6255+
case SEQ_MORPH: return "FQ";
6256+
case SEQ_PROTEIN: return ",FO";
6257+
default: return "FO";
6258+
}
6259+
}
6260+
62516261
/**
62526262
@brief Find the best component of a Q-mixture model while fixing other components
62536263
@note This function is similar to runModelFinder, but it is adapted for optimisation of Q-Mixture model
@@ -6396,8 +6406,8 @@ CandidateModel findMixtureComponent(Params &params, IQTree &iqtree, ModelCheckpo
63966406
}
63976407
}
63986408
skip_all_when_drop = true;
6399-
} else if (mixture_action == MA_FIND_CLASS) {
6400-
char init_state_freq_set[] = "FO";
6409+
} else if (mixture_action == MA_FIND_CLASS) {
6410+
char* init_state_freq_set = initFreqSet(iqtree.aln->seq_type);
64016411
if (!params.state_freq_set) {
64026412
params.state_freq_set = init_state_freq_set;
64036413
}
@@ -6565,7 +6575,7 @@ double runMixtureFinderMain(Params &params, IQTree* &iqtree, ModelCheckpoint &mo
65656575
double LR, df_diff, pvalue;
65666576
string criteria_str;
65676577

6568-
char init_state_freq_set[] = "FO";
6578+
char* init_state_freq_set = initFreqSet(iqtree->aln->seq_type);
65696579
if (!params.state_freq_set) {
65706580
params.state_freq_set = init_state_freq_set;
65716581
}
@@ -6576,15 +6586,26 @@ double runMixtureFinderMain(Params &params, IQTree* &iqtree, ModelCheckpoint &mo
65766586

65776587
// Step 0: (reorder candidate DNA models when -mset is used) build the nest-relationship network
65786588
map<string, vector<string> > nest_network;
6579-
if (iqtree->aln->seq_type == SEQ_DNA) {
6580-
StrVector model_names, freq_names;
6581-
getModelSubst(iqtree->aln->seq_type, iqtree->aln->isStandardGeneticCode(), params.model_name,
6582-
params.model_set, params.model_subset, model_names);
6583-
getStateFreqs(iqtree->aln->seq_type, params.state_freq_set, freq_names);
6589+
StrVector model_names, freq_names;
6590+
getModelSubst(iqtree->aln->seq_type, iqtree->aln->isStandardGeneticCode(), params.model_name,
6591+
params.model_set, params.model_subset, model_names);
6592+
getStateFreqs(iqtree->aln->seq_type, params.state_freq_set, freq_names);
6593+
6594+
auto isOnlyMKAndFQ = [&]() -> bool {
6595+
bool onlyMK = std::all_of(model_names.begin(), model_names.end(),
6596+
[](const std::string& s) { return s == "MK"; });
6597+
bool onlyFQ = std::all_of(freq_names.begin(), freq_names.end(),
6598+
[](const std::string& s) { return s == "+FQ"; });
6599+
return onlyMK && onlyFQ;
6600+
};
6601+
if (isOnlyMKAndFQ()) {
6602+
outError("Error! Running MixtureFinder only with the MK model and the FQ frequency is completely meaningless.\nPlease provide additional models and/or frequencies, such as GTRX, +F, and +FO, using -mset and/or -mfreq, if you really want to use MixtureFinder for your multistate data.");
6603+
}
65846604

6605+
if (iqtree->aln->seq_type == SEQ_DNA) {
65856606
nest_network = generateNestNetwork(model_names, freq_names);
65866607
}
6587-
6608+
65886609
// Step 1: run ModelFinder
65896610
params.model_name = "";
65906611
bool under_mix_finder = true;
@@ -6731,8 +6752,15 @@ void runMixtureFinder(Params &params, IQTree* &iqtree, ModelCheckpoint &model_in
67316752
aln = iqtree->aln;
67326753
}
67336754

6734-
if (aln->seq_type != SEQ_DNA)
6735-
outError("Error! The option -m '" + params.model_name + "' can only work on DNA data set");
6755+
if (iqtree->aln->seq_type != SEQ_DNA)
6756+
outWarning("MixtureFinder has not been tested for non-DNA data types. Be cautious about interpreting the results");
6757+
6758+
if (iqtree->aln->getMaxNumStates() > 6)
6759+
outWarning("Running MixtureFinder for the given data type can take much time. Please consider restricting the set of the models to test as much as possible");
6760+
6761+
if (iqtree->aln->seq_type == SEQ_PROTEIN && !params.force_aa_mix_finder)
6762+
outError("Error! We already have the mixture frequency vectors C10–C60 which are effective for modeling amino acid data.\nPlease make sure running MixtureFinder for your amino acid data makes sense.\nIf you are determined to do that, please add an option --force-aa-mix-finder to the command line.");
6763+
67366764

67376765
// create a new IQTree object for this mixture model
67386766
// allocate heterotachy tree if neccessary

model/modelmorphology.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,9 +30,11 @@ void ModelMorphology::init(const char *model_name, string model_params, StateFre
3030
}
3131
num_params = 0;
3232
} else if (name == "GTR" || name == "GTRX") {
33-
outWarning("GTRX multistate model will estimate " + convertIntToString(getNumRateEntries()-1) + " substitution rates that might be overfitting!");
34-
outWarning("Please only use GTRX with very large data and always test for model fit!");
35-
name = "GTRX";
33+
if (num_states > 6 || phylo_tree->aln->getNPattern() < 100) {
34+
outWarning("GTRX multistate model will estimate " + convertIntToString(getNumRateEntries()-1) + " substitution rates that might be overfitting for the length of your alignment/partition!");
35+
outWarning("Please only use GTRX with large data and always test for model fit!");
36+
}
37+
name = "GTRX";
3638
} else {
3739
// if name does not match, read the user-defined model
3840
readParameters(model_name);

utils/tools.cpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1496,6 +1496,7 @@ void parseArg(int argc, char *argv[], Params &params) {
14961496
params.ignore_checkpoint = false;
14971497
params.checkpoint_dump_interval = 60;
14981498
params.force_unfinished = false;
1499+
params.force_aa_mix_finder = false;
14991500
params.print_all_checkpoints = false;
15001501
params.suppress_output_flags = 0;
15011502
params.ufboot2corr = false;
@@ -5506,6 +5507,11 @@ void parseArg(int argc, char *argv[], Params &params) {
55065507
params.force_unfinished = true;
55075508
continue;
55085509
}
5510+
5511+
if (strcmp(argv[cnt], "-force-aa-mix-finder") == 0 || strcmp(argv[cnt], "--force-aa-mix-finder") == 0) {
5512+
params.force_aa_mix_finder = true;
5513+
continue;
5514+
}
55095515

55105516
if (strcmp(argv[cnt], "-cptime") == 0 || strcmp(argv[cnt], "--cptime") == 0) {
55115517
cnt++;

utils/tools.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2461,6 +2461,9 @@ class Params {
24612461
/** true if ignoring the "finished" flag in checkpoint file */
24622462
bool force_unfinished;
24632463

2464+
/** true if forcing IQ-TREE to run MixtureFinder for amino acid data */
2465+
bool force_aa_mix_finder;
2466+
24642467
/** TRUE to print checkpoints to 1.ckp.gz, 2.ckp.gz,... */
24652468
bool print_all_checkpoints;
24662469

0 commit comments

Comments
 (0)