Skip to content

Commit af5bd81

Browse files
Merge pull request #2387 from jeromekelleher/smck_implementation_notes
Add some notes on implementation of the SMC(k)
2 parents cda89ed + 5136727 commit af5bd81

File tree

3 files changed

+34
-7
lines changed

3 files changed

+34
-7
lines changed

algorithms.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -682,6 +682,14 @@ def _make_segment(self, left, right, count):
682682
return seg
683683

684684

685+
# The SMC(k) implementation here differs in a few details to the
686+
# version in the C code. Most of this is incidental detail related
687+
# to memory management and differences in AVL tree implementations.
688+
# The main difference is that we have an implementation of
689+
# reset_hull_right here which could be worth porting into the C
690+
# code at some point.
691+
692+
685693
class Hull:
686694
"""
687695
A hull keeps track of the outermost boundaries (left, right) of
@@ -1952,7 +1960,9 @@ def wiuf_gene_conversion_within_event(self, label):
19521960
elif head is not None:
19531961
new_individual_head = head
19541962
if new_individual_head is not None:
1955-
# FIXME when doing the smc_k update
1963+
# NOTE: this is not done very nicely and there's likely
1964+
# ways that would improve performance a little. See
1965+
# https://github.com/tskit-dev/msprime/issues/2386
19561966
lineage.reset_segments()
19571967
self.update_lineage_right(lineage)
19581968
new_lineage = self.alloc_lineage(new_individual_head, pop)
@@ -2018,7 +2028,7 @@ def wiuf_gene_conversion_left_event(self, label):
20182028
y.prev = None
20192029
alpha = y
20202030

2021-
# FIXME
2031+
# See https://github.com/tskit-dev/msprime/issues/2386
20222032
lineage.reset_segments()
20232033
self.update_lineage_right(lineage)
20242034

lib/msprime.c

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2038,9 +2038,6 @@ msp_verify_hulls(msp_t *self)
20382038
io = 0;
20392039
}
20402040
}
2041-
/* printf("hull %d io=%d insertion_order=%d\n", */
2042-
/* (int) hull->id, (int) io, (int) hull->right_insertion_order);
2043-
*/
20442041
tsk_bug_assert(io == hull->right_insertion_order);
20452042
pos = hull->right;
20462043
}
@@ -3251,6 +3248,19 @@ msp_store_arg_gene_conversion(
32513248
* there was specific logic for this. In an attempt to simplify the logic
32523249
* here we just do the straightforward thing, but this is definitely somewhere
32533250
* we could get some benefit if looking for optimisations.
3251+
*
3252+
* In some profiling on a simulation that takes about a minute, we see
3253+
* the following breakdown:
3254+
* --49.65%--msp_smc_k_common_ancestor_event
3255+
* --40.93%--msp_recombination_event
3256+
* |--17.03%--msp_insert_individual
3257+
* |--16.79%--msp_reset_hull_right
3258+
*
3259+
* So, in this case msp_reset_hull_right represents about 17% of the overall
3260+
* processing time, and I think we could at best expect to halve that if
3261+
* we did things better. A ~5% improvement in performance doesn't seem
3262+
* worth the effort or complexity currently. See the Python implementation
3263+
* of the git history though, for how this used to be done.
32543264
*/
32553265
static int
32563266
msp_reset_hull_right(msp_t *self, lineage_t *lineage)
@@ -3624,7 +3634,9 @@ msp_gene_conversion_event(msp_t *self, label_id_t label)
36243634
* lineage and the SMC(k) state as the code paths are quite convoluted.
36253635
* It seems unlikely the segment iteration will be a bottleneck in
36263636
* practise, but unconditionaly removing and reinserting hulls could
3627-
* become significant. */
3637+
* become significant.
3638+
* See https://github.com/tskit-dev/msprime/issues/2386
3639+
*/
36283640
lineage_reset_segments(lineage);
36293641
if (is_smc_k) {
36303642
ret = msp_insert_individual_hull(self, lineage);
@@ -5028,6 +5040,7 @@ msp_gene_conversion_left_event(msp_t *self, label_id_t label)
50285040
msp_set_segment_mass(self, alpha);
50295041
tsk_bug_assert(alpha->prev == NULL);
50305042

5043+
// This isn't optimal - see https://github.com/tskit-dev/msprime/issues/2386
50315044
lineage_reset_segments(new_lineage);
50325045
lineage_reset_segments(lineage);
50335046

lib/msprime.h

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ typedef struct lineage_t_t {
9292
segment_t *head;
9393
segment_t *tail;
9494
avl_node_t avl_node;
95+
/* The hull is only used for the SMCK, and is NULL otherwise */
9596
struct hull_t_t *hull;
9697
} lineage_t;
9798

@@ -129,7 +130,10 @@ typedef struct {
129130
avl_tree_t *ancestors;
130131
tsk_size_t num_potential_destinations;
131132
tsk_id_t *potential_destinations;
132-
/* These three indexes are only used in the SMCK model */
133+
/* These three indexes are only used in the SMCK model. We maintain
134+
* two AVL trees in which the hulls are indexed by the left and right
135+
* coordinates, respectively, along with a Fenwick tree which maintains
136+
* the cumulative count of coalescable lineages */
133137
avl_tree_t *hulls_left;
134138
avl_tree_t *hulls_right;
135139
fenwick_t *coal_mass_index;

0 commit comments

Comments
 (0)