Skip to content

Commit 5b2cf24

Browse files
authored
Merge pull request #68 from iqtree/huaiyan
Update sequence type detection function
2 parents 5eb1253 + dcab2d0 commit 5b2cf24

File tree

3 files changed

+57
-138
lines changed

3 files changed

+57
-138
lines changed

alignment/alignment.cpp

Lines changed: 46 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1583,53 +1583,70 @@ void Alignment::regroupSitePattern(int groups, IntVector& site_group)
15831583
@return the data type of the input sequences
15841584
*/
15851585
SeqType Alignment::detectSequenceType(StrVector &sequences) {
1586+
size_t num_proper_nuc = 0;
15861587
size_t num_nuc = 0;
1587-
size_t num_ungap = 0;
1588+
size_t num_aa = 0;
15881589
size_t num_bin = 0;
1589-
size_t num_alpha = 0;
15901590
size_t num_digit = 0;
1591+
size_t num_alpha = 0;
15911592
double detectStart = getRealTime();
15921593
size_t sequenceCount = sequences.size();
1594+
std::unordered_set<char> proper_nucleotides = {'A', 'C', 'G', 'T', 'U'};
1595+
std::unordered_set<char> nucleotides = {'A', 'C', 'G', 'T', 'U', 'R', 'Y', 'W', 'S', 'M', 'K', 'B', 'H', 'D', 'V', 'N', 'X'};
1596+
std::unordered_set<char> proper_amino_acids = {'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V'};
1597+
std::unordered_set<char> binaries = {'0', '1'};
1598+
// std::unordered_set<char> gap_miss = {'?', '-', '.', '~'};
1599+
15931600
#ifdef _OPENMP
1594-
#pragma omp parallel for reduction(+:num_nuc,num_ungap,num_bin,num_alpha,num_digit)
1601+
#pragma omp parallel for reduction(+:num_proper_nuc, num_nuc, num_aa, num_bin, num_digit, num_alpha)
15951602
#endif
15961603
for (size_t seqNum = 0; seqNum < sequenceCount; ++seqNum) {
15971604
auto start = sequences.at(seqNum).data();
15981605
auto stop = start + sequences.at(seqNum).size();
15991606
for (auto i = start; i!=stop; ++i) {
1600-
if ((*i) == 'A' || (*i) == 'C' || (*i) == 'G' || (*i) == 'T' || (*i) == 'U') {
1601-
++num_nuc;
1602-
++num_ungap;
1603-
continue;
1604-
}
1605-
if ((*i)=='?' || (*i)=='-' || (*i) == '.' ) {
1606-
continue;
1607-
}
1608-
if (*i != 'N' && *i != 'X' && (*i) != '~') {
1609-
num_ungap++;
1610-
if (isdigit(*i)) {
1611-
num_digit++;
1612-
if ((*i) == '0' || (*i) == '1') {
1613-
num_bin++;
1614-
}
1615-
}
1616-
}
1617-
if (isalpha(*i)) {
1607+
// if (gap_miss.find(*i) != gap_miss.end()) {
1608+
// continue;
1609+
// }
1610+
if (proper_nucleotides.find(*i) != proper_nucleotides.end())
1611+
num_proper_nuc++;
1612+
if (nucleotides.find(*i) != nucleotides.end())
1613+
num_nuc++;
1614+
if (proper_amino_acids.find(*i) != proper_amino_acids.end())
1615+
num_aa++;
1616+
if (binaries.find(*i) != binaries.end())
1617+
num_bin++;
1618+
if (isdigit(*i))
1619+
num_digit++;
1620+
if (isalpha(*i))
16181621
num_alpha++;
1619-
}
16201622
}
16211623
}
16221624
if (verbose_mode >= VB_MED) {
16231625
cout << "Sequence Type detection took " << (getRealTime()-detectStart) << " seconds." << endl;
16241626
}
1625-
if (((double)num_nuc) / num_ungap > 0.9)
1626-
return SEQ_DNA;
1627-
if (num_bin == num_ungap) // For binary data, only 0, 1, ?, -, . can occur
1628-
return SEQ_BINARY;
1629-
if (((double)num_alpha + num_nuc) / num_ungap > 0.9)
1630-
return SEQ_PROTEIN;
1631-
if (((double)(num_alpha + num_digit + num_nuc)) / num_ungap > 0.9)
1627+
if (num_digit == 0) {
1628+
if (num_alpha < 10) // two few occurences to decide
1629+
return SEQ_UNKNOWN;
1630+
if (num_nuc == num_alpha) {
1631+
if ((double)num_proper_nuc / num_alpha > 0.9) {
1632+
return SEQ_DNA;
1633+
} else {
1634+
return SEQ_UNKNOWN;
1635+
}
1636+
}
1637+
// likely protein if there are only letters
1638+
// TODO: check if this condition is OK
1639+
if ((double)num_aa / num_alpha > 0.9)
1640+
return SEQ_PROTEIN;
1641+
else
1642+
return SEQ_UNKNOWN;
1643+
} else if (num_digit >= 10) {
1644+
// there are some digit(s) in the data
1645+
if (num_bin == (num_digit+num_alpha)) // For binary data, only 0, 1, ?, -, . can occur
1646+
return SEQ_BINARY;
16321647
return SEQ_MORPH;
1648+
}
1649+
// can't decide
16331650
return SEQ_UNKNOWN;
16341651
}
16351652

main/phyloanalysis.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1364,7 +1364,6 @@ void reportSubstitutionProcess(ostream &out, Params &params, IQTree &tree)
13641364

13651365
PhyloSuperTree *stree = (PhyloSuperTree*) &tree;
13661366
PhyloSuperTree::iterator it;
1367-
it = stree->begin();
13681367

13691368
int part;
13701369

@@ -1479,6 +1478,11 @@ void reportPhyloAnalysis(Params &params, IQTree &tree, ModelCheckpoint &model_in
14791478

14801479
out << "SEQUENCE ALIGNMENT" << endl << "------------------" << endl
14811480
<< endl;
1481+
1482+
if (tree.aln->sequence_type.empty()) {
1483+
out << "NOTE: Alignment sequence type is auto-detected. If in doubt, specify it via -st option." << endl;
1484+
}
1485+
14821486
if (tree.isSuperTree()) {
14831487
// TODO DS: Changes may be needed here for PoMo.
14841488
out << "Input data: " << tree.aln->getNSeq()+tree.removed_seqs.size() << " taxa with "

model/partitionmodel.cpp

Lines changed: 6 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -466,7 +466,7 @@ double PartitionModel::computeMarginalLh() {
466466
sub_tree2->copyTree(tree2);
467467
}
468468

469-
if (inter_seqs_id.size() == 2) {
469+
if (inter_seqs_id.size() == 2 && tree2->getModel()->isReversible()) {
470470
// if only two seqs in the subset
471471
// add a gappy seq two the sub_aln
472472
string gappy_seq = "gappy_seq";
@@ -565,113 +565,8 @@ double PartitionModel::computeMarginalLh() {
565565
sub_tree2->aln = NULL;
566566
delete sub_tree2;
567567
delete[] ptn_lh_array;
568-
/*
569-
} else if (inter_seqs_id.size() == 2) {
570-
//AlignmentPairwise aln_pair(tree2, inter_seqs_id[0], inter_seqs_id[1]);
571-
//double site_lh = aln_pair.computeFunction(1);
572-
573-
//compute the distance between the tree nodes of the two "overlapping" taxa
574-
auto it = inter_seqs.begin();
575-
string inter_seq1 = *it;
576-
Node *node1 = tree2->findLeafName(inter_seq1);
577-
++it;
578-
string inter_seq2 = *it;
579-
Node *node2 = tree2->findLeafName(inter_seq2);
580-
double branch_len = tree2->pairDist(node1, node2, node1);
581-
582-
//compute the transition probobility matrix based on tree2
583-
RateHeterogeneity *site_rate = tree2->getRate();
584-
int ncat = site_rate->getNDiscreteRate();
585-
int trans_size = tree2->getModel()->getTransMatrixSize();
586-
double *trans_matrix = new double[trans_size];
587-
double *sum_trans_matrix = new double[trans_size];
588-
589-
if (tree2->getModelFactory()->site_rate->getGammaShape() == 0.0)
590-
tree2->getModelFactory()->computeTransMatrix(branch_len, sum_trans_matrix);
591-
else {
592-
tree2->getModelFactory()->computeTransMatrix(branch_len * site_rate->getRate(0), sum_trans_matrix);
593-
for (int cat = 1; cat < ncat; cat++) {
594-
tree2->getModelFactory()->computeTransMatrix(branch_len * site_rate->getRate(cat), trans_matrix);
595-
for (int i = 0; i < trans_size; i++)
596-
sum_trans_matrix[i] += trans_matrix[i];
597-
}
598-
}
599-
600-
// take the log for the transition probobility matrix
601-
vector<double> log_sum_trans_matrix(trans_size);
602-
for (int n = 0; n < trans_size; n++) {
603-
log_sum_trans_matrix[n] = log(sum_trans_matrix[n]);
604-
}
605-
606-
for (int l = 0; l < tree1_nsite; l++) {
607-
double site_lh = 0.0;
608-
Pattern p = tree1_aln->at(tree1_aln->getPatternID(l));
609-
int state1 = p[inter_seqs_id[0]];
610-
int state2 = p[inter_seqs_id[1]];
611-
612-
// compute the site log-likelihood of the two "overlaping" sequences.
613-
if (state1 < n_states && state2 < n_states) {
614-
site_lh = log_sum_trans_matrix[state1 * n_states + state2] + log_state_freq[state1];
615-
} else if (state1 >= n_states && state2 < n_states) {
616-
// compute ambiguous frequencies
617-
int cstate1 = state1 - n_states + 1;
618-
double amb_lh = 0.0;
619-
for (int m = 0; m < n_states; m++) {
620-
if ((cstate1) & (1 << m)) {
621-
amb_lh = sum_trans_matrix[m * n_states + state2] * state_freq[m];
622-
site_lh *= amb_lh;
623-
}
624-
}
625-
site_lh = log(site_lh);
626-
} else if (state2 >= n_states && state1 < n_states) {
627-
// compute ambiguous frequencies
628-
int cstate2 = state2-n_states+1;
629-
double amb_lh = 0.0;
630-
for (int m = 0; m < n_states; m++) {
631-
if ((cstate2) & (1 << m)) {
632-
amb_lh = sum_trans_matrix[state1 * n_states + m] * state_freq[state1];
633-
site_lh *= amb_lh;
634-
}
635-
}
636-
site_lh = log(site_lh);
637-
} else {
638-
// compute ambiguous frequencies
639-
int cstate1 = state1-n_states+1;
640-
int cstate2 = state2-n_states+1;
641-
double amb_lh = 0.0;
642-
for (int m = 0; m < n_states; m++) {
643-
for (int n = 0; n < n_states; n++) {
644-
if (((cstate1) & (1 << m)) && ((cstate2) & (1 << n))) {
645-
amb_lh = sum_trans_matrix[m * n_states + n] * state_freq[m];
646-
site_lh *= amb_lh;
647-
}
648-
}
649-
}
650-
site_lh = log(site_lh);
651-
}
652-
653-
for (int missing_id: missing_seqs_id) {
654-
int char_id = p[missing_id];
655-
if (char_id < n_states) {
656-
site_lh += log_state_freq[char_id];
657-
} else {
658-
// compute ambiguous frequencies
659-
int cstate = char_id-n_states+1;
660-
double amb_freq = 0;
661-
for (int m = 0; m < n_states; m++) {
662-
if ((cstate) & (1 << m)) {
663-
amb_freq += state_freq[m];
664-
}
665-
}
666-
site_lh += log(amb_freq);
667-
}
668-
}
669-
lh_array[tree1_nsite * k + l] = site_lh;
670-
}
671-
delete[] trans_matrix;
672-
delete[] sum_trans_matrix;
673-
*/
674-
} else {
568+
} else if (tree2->getModel()->isReversible()) {
569+
// case when the intersection of taxon sets is 1
675570
for (int l = 0; l < tree1_nsite; l++) {
676571
double site_lh = 0.0;
677572
Pattern p = tree1_aln->at(tree1_aln->getPatternID(l));
@@ -705,6 +600,9 @@ double PartitionModel::computeMarginalLh() {
705600
}
706601
lh_array[tree1_nsite * k + l] = site_lh;
707602
}
603+
} else {
604+
// intersection has only 1 taxon and non-reversible model
605+
outError("mAIC calculation doesn't work yet for intersection of 1 taxon and non-reversible model");
708606
}
709607
delete[] state_freq;
710608
}

0 commit comments

Comments
 (0)