Skip to content

Commit 5f3bc2f

Browse files
committed
Make mutation sort identical to canonical
1 parent 0b89f97 commit 5f3bc2f

File tree

10 files changed

+316
-149
lines changed

10 files changed

+316
-149
lines changed

c/tests/test_tables.c

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8905,6 +8905,7 @@ static void
89058905
test_sort_tables_errors(void)
89068906
{
89078907
int ret;
8908+
tsk_id_t ret_id;
89088909
tsk_treeseq_t ts;
89098910
tsk_table_collection_t tables;
89108911
tsk_bookmark_t pos;
@@ -8968,6 +8969,32 @@ test_sort_tables_errors(void)
89688969
ret = tsk_table_collection_sort(&tables, &pos, 0);
89698970
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SORT_OFFSET_NOT_SUPPORTED);
89708971

8972+
/* Test TSK_ERR_MUTATION_PARENT_INCONSISTENT */
8973+
ret = tsk_table_collection_clear(&tables, 0);
8974+
CU_ASSERT_EQUAL_FATAL(ret, 0);
8975+
tables.sequence_length = 1.0;
8976+
8977+
ret_id = tsk_node_table_add_row(&tables.nodes, 0, 0.0, TSK_NULL, TSK_NULL, NULL, 0);
8978+
CU_ASSERT_FATAL(ret_id >= 0);
8979+
ret_id = tsk_site_table_add_row(&tables.sites, 0.0, "x", 1, NULL, 0);
8980+
CU_ASSERT_FATAL(ret_id >= 0);
8981+
8982+
ret_id
8983+
= tsk_mutation_table_add_row(&tables.mutations, 0, 0, 2, 0.0, "a", 1, NULL, 0);
8984+
CU_ASSERT_FATAL(ret_id >= 0);
8985+
ret_id
8986+
= tsk_mutation_table_add_row(&tables.mutations, 0, 0, 3, 0.0, "b", 1, NULL, 0);
8987+
CU_ASSERT_FATAL(ret_id >= 0);
8988+
ret_id
8989+
= tsk_mutation_table_add_row(&tables.mutations, 0, 0, 1, 0.0, "c", 1, NULL, 0);
8990+
CU_ASSERT_FATAL(ret_id >= 0);
8991+
ret_id
8992+
= tsk_mutation_table_add_row(&tables.mutations, 0, 0, 2, 0.0, "d", 1, NULL, 0);
8993+
CU_ASSERT_FATAL(ret_id >= 0);
8994+
8995+
ret = tsk_table_collection_sort(&tables, NULL, 0);
8996+
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MUTATION_PARENT_INCONSISTENT);
8997+
89718998
tsk_table_collection_free(&tables);
89728999
tsk_treeseq_free(&ts);
89739000
}
@@ -9032,6 +9059,120 @@ test_sort_tables_mutation_times(void)
90329059
tsk_treeseq_free(&ts);
90339060
}
90349061

9062+
static void
9063+
test_sort_tables_mutations(void)
9064+
{
9065+
int ret;
9066+
tsk_table_collection_t tables;
9067+
9068+
/* Sorting hierarchy:
9069+
* 1. site
9070+
* 2. time (when known)
9071+
* 3. node_time
9072+
* 4. num_descendants: parent mutations first
9073+
* 5. node_id
9074+
* 6. mutation_id
9075+
*/
9076+
9077+
const char *sites = "0.0 A\n"
9078+
"0.5 T\n"
9079+
"0.75 G\n";
9080+
9081+
const char *mutations_unsorted =
9082+
/* Test site criterion (primary) - site 1 should come after site 0 */
9083+
"1 0 X -1 0.0\n" /* mut 0: site 1, will be sorted after site 0 mutations */
9084+
"0 0 Y -1 0.0\n" /* mut 1: site 0, will be sorted before site 1 mutations */
9085+
9086+
/* Test time criterion - within same site, earlier time first */
9087+
"0 4 B -1 2.0\n" /* mut 2: site 0, node 4 (time 1.0), time 2.0 (later time)
9088+
*/
9089+
"0 5 A -1 2.5\n" /* mut 3: site 0, node 5 (time 2.0), time 2.5 (earlier
9090+
relative) */
9091+
9092+
/* Test unknown vs known times - unknown times at site 2, fall back to node_time
9093+
sorting */
9094+
"2 4 U2 -1\n" /* mut 4: site 2, node 4 (time 1.0), unknown time - falls back
9095+
to node_time */
9096+
"2 4 U3 -1\n" /* mut 5: site 2, node 4 (time 1.0), unknown time - should use
9097+
mutation_id as tiebreaker */
9098+
"2 5 U1 -1\n" /* mut 6: site 2, node 5 (time 2.0), unknown time - falls back
9099+
to node_time */
9100+
9101+
/* Test node_time criterion - same site, same mut time, different node times */
9102+
"0 4 D -1 1.5\n" /* mut 7: site 0, node 4 (time 1.0), mut time 1.5 */
9103+
"0 5 C -1 2.5\n" /* mut 8: site 0, node 5 (time 2.0), mut time 2.5 - same
9104+
mut time */
9105+
9106+
/* Test num_descendants criterion with mutation parent-child relationships */
9107+
"0 2 P -1 0.0\n" /* mut 9: site 0, node 2, parent mutation (0 descendants
9108+
initially) */
9109+
"0 1 C1 9 0.0\n" /* mut 10: site 0, node 1, child of mut 9 (parent now has
9110+
1+ descendants) */
9111+
"0 1 C2 9 0.0\n" /* mut 11: site 0, node 1, another child of mut 9 (parent
9112+
now has 2+ descendants) */
9113+
"0 3 Q -1 0.0\n" /* mut 12: site 0, node 3, no children (0 descendants) */
9114+
"0 0 C3 10 0.0\n" /* mut 13: site 0, node 0, child of mut 10 (making mut 9 a
9115+
grandparent) */
9116+
9117+
/* Test node and mutation_id criteria for final tiebreaking */
9118+
"0 0 Z1 -1 0.0\n" /* mut 14: site 0, node 0, no parent, will test node+id
9119+
ordering */
9120+
"0 0 Z2 -1 0.0\n"; /* mut 15: site 0, node 0, no parent, later in input =
9121+
higher ID */
9122+
9123+
const char *mutations_sorted =
9124+
/* Site 0 mutations - known times first, sorted by time */
9125+
"0 5 A -1 2.5\n"
9126+
"0 5 C -1 2.5\n"
9127+
"0 4 B -1 2.0\n"
9128+
"0 4 D -1 1.5\n"
9129+
"0 2 P -1 0.0\n"
9130+
"0 1 C1 4 0.0\n"
9131+
"0 0 Y -1 0.0\n"
9132+
"0 0 C3 5 0.0\n"
9133+
"0 0 Z1 -1 0.0\n"
9134+
"0 0 Z2 -1 0.0\n"
9135+
"0 1 C2 4 0.0\n"
9136+
"0 3 Q -1 0.0\n"
9137+
9138+
/* Site 1 mutations */
9139+
"1 0 X -1 0.0\n"
9140+
9141+
/* Site 2 mutations - unknown times, sorted by node_time then other criteria */
9142+
"2 5 U1 -1\n"
9143+
"2 4 U2 -1\n"
9144+
"2 4 U3 -1\n";
9145+
9146+
ret = tsk_table_collection_init(&tables, 0);
9147+
CU_ASSERT_EQUAL_FATAL(ret, 0);
9148+
tables.sequence_length = 1.0;
9149+
parse_nodes(single_tree_ex_nodes, &tables.nodes);
9150+
parse_edges(single_tree_ex_edges, &tables.edges);
9151+
9152+
parse_sites(sites, &tables.sites);
9153+
CU_ASSERT_EQUAL_FATAL(tables.sites.num_rows, 3);
9154+
9155+
parse_mutations(mutations_unsorted, &tables.mutations);
9156+
CU_ASSERT_EQUAL_FATAL(tables.mutations.num_rows, 16);
9157+
9158+
ret = tsk_table_collection_sort(&tables, NULL, 0);
9159+
CU_ASSERT_EQUAL_FATAL(ret, 0);
9160+
9161+
tsk_table_collection_t expected;
9162+
ret = tsk_table_collection_init(&expected, 0);
9163+
CU_ASSERT_EQUAL_FATAL(ret, 0);
9164+
expected.sequence_length = 1.0;
9165+
parse_nodes(single_tree_ex_nodes, &expected.nodes);
9166+
parse_edges(single_tree_ex_edges, &expected.edges);
9167+
parse_sites(sites, &expected.sites);
9168+
parse_mutations(mutations_sorted, &expected.mutations);
9169+
9170+
CU_ASSERT_TRUE(tsk_mutation_table_equals(&tables.mutations, &expected.mutations, 0));
9171+
9172+
tsk_table_collection_free(&expected);
9173+
tsk_table_collection_free(&tables);
9174+
}
9175+
90359176
static void
90369177
test_sort_tables_canonical_errors(void)
90379178
{
@@ -11608,6 +11749,7 @@ main(int argc, char **argv)
1160811749
{ "test_sort_tables_errors", test_sort_tables_errors },
1160911750
{ "test_sort_tables_individuals", test_sort_tables_individuals },
1161011751
{ "test_sort_tables_mutation_times", test_sort_tables_mutation_times },
11752+
{ "test_sort_tables_mutations", test_sort_tables_mutations },
1161111753
{ "test_sort_tables_migrations", test_sort_tables_migrations },
1161211754
{ "test_sort_tables_no_edge_metadata", test_sort_tables_no_edge_metadata },
1161311755
{ "test_sort_tables_offsets", test_sort_tables_offsets },

c/tskit/tables.c

Lines changed: 18 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -6756,7 +6756,8 @@ typedef struct {
67566756
typedef struct {
67576757
tsk_mutation_t mut;
67586758
int num_descendants;
6759-
} mutation_canonical_sort_t;
6759+
double node_time;
6760+
} mutation_sort_t;
67606761

67616762
typedef struct {
67626763
tsk_individual_t ind;
@@ -6797,39 +6798,30 @@ cmp_site(const void *a, const void *b)
67976798
static int
67986799
cmp_mutation(const void *a, const void *b)
67996800
{
6800-
const tsk_mutation_t *ia = (const tsk_mutation_t *) a;
6801-
const tsk_mutation_t *ib = (const tsk_mutation_t *) b;
6802-
/* Compare mutations by site */
6803-
int ret = (ia->site > ib->site) - (ia->site < ib->site);
6804-
/* Within a particular site sort by time if known, then ID. This ensures that
6805-
* relative ordering within a site is maintained */
6806-
if (ret == 0 && !tsk_is_unknown_time(ia->time) && !tsk_is_unknown_time(ib->time)) {
6807-
ret = (ia->time < ib->time) - (ia->time > ib->time);
6808-
}
6809-
if (ret == 0) {
6810-
ret = (ia->id > ib->id) - (ia->id < ib->id);
6811-
}
6812-
return ret;
6813-
}
6814-
6815-
static int
6816-
cmp_mutation_canonical(const void *a, const void *b)
6817-
{
6818-
const mutation_canonical_sort_t *ia = (const mutation_canonical_sort_t *) a;
6819-
const mutation_canonical_sort_t *ib = (const mutation_canonical_sort_t *) b;
6801+
const mutation_sort_t *ia = (const mutation_sort_t *) a;
6802+
const mutation_sort_t *ib = (const mutation_sort_t *) b;
68206803
/* Compare mutations by site */
68216804
int ret = (ia->mut.site > ib->mut.site) - (ia->mut.site < ib->mut.site);
6805+
6806+
/* Within a particular site sort by time if known */
68226807
if (ret == 0 && !tsk_is_unknown_time(ia->mut.time)
68236808
&& !tsk_is_unknown_time(ib->mut.time)) {
68246809
ret = (ia->mut.time < ib->mut.time) - (ia->mut.time > ib->mut.time);
68256810
}
6811+
/* Or node times when mutation times are unknown or equal */
6812+
if (ret == 0) {
6813+
ret = (ia->node_time < ib->node_time) - (ia->node_time > ib->node_time);
6814+
}
6815+
/* If node times are equal, sort by number of descendants */
68266816
if (ret == 0) {
68276817
ret = (ia->num_descendants < ib->num_descendants)
68286818
- (ia->num_descendants > ib->num_descendants);
68296819
}
6820+
/* If number of descendants are equal, sort by node */
68306821
if (ret == 0) {
68316822
ret = (ia->mut.node > ib->mut.node) - (ia->mut.node < ib->mut.node);
68326823
}
6824+
/* Final tiebreaker: ID */
68336825
if (ret == 0) {
68346826
ret = (ia->mut.id > ib->mut.id) - (ia->mut.id < ib->mut.id);
68356827
}
@@ -7056,77 +7048,15 @@ tsk_table_sorter_sort_sites(tsk_table_sorter_t *self)
70567048

70577049
static int
70587050
tsk_table_sorter_sort_mutations(tsk_table_sorter_t *self)
7059-
{
7060-
int ret = 0;
7061-
tsk_size_t j;
7062-
tsk_id_t ret_id, parent, mapped_parent;
7063-
tsk_mutation_table_t *mutations = &self->tables->mutations;
7064-
tsk_size_t num_mutations = mutations->num_rows;
7065-
tsk_mutation_table_t copy;
7066-
tsk_mutation_t *sorted_mutations
7067-
= tsk_malloc(num_mutations * sizeof(*sorted_mutations));
7068-
tsk_id_t *mutation_id_map = tsk_malloc(num_mutations * sizeof(*mutation_id_map));
7069-
7070-
ret = tsk_mutation_table_copy(mutations, &copy, 0);
7071-
if (ret != 0) {
7072-
goto out;
7073-
}
7074-
if (mutation_id_map == NULL || sorted_mutations == NULL) {
7075-
ret = tsk_trace_error(TSK_ERR_NO_MEMORY);
7076-
goto out;
7077-
}
7078-
7079-
for (j = 0; j < num_mutations; j++) {
7080-
tsk_mutation_table_get_row_unsafe(&copy, (tsk_id_t) j, sorted_mutations + j);
7081-
sorted_mutations[j].site = self->site_id_map[sorted_mutations[j].site];
7082-
}
7083-
ret = tsk_mutation_table_clear(mutations);
7084-
if (ret != 0) {
7085-
goto out;
7086-
}
7087-
7088-
qsort(sorted_mutations, (size_t) num_mutations, sizeof(*sorted_mutations),
7089-
cmp_mutation);
7090-
7091-
/* Make a first pass through the sorted mutations to build the ID map. */
7092-
for (j = 0; j < num_mutations; j++) {
7093-
mutation_id_map[sorted_mutations[j].id] = (tsk_id_t) j;
7094-
}
7095-
7096-
for (j = 0; j < num_mutations; j++) {
7097-
mapped_parent = TSK_NULL;
7098-
parent = sorted_mutations[j].parent;
7099-
if (parent != TSK_NULL) {
7100-
mapped_parent = mutation_id_map[parent];
7101-
}
7102-
ret_id = tsk_mutation_table_add_row(mutations, sorted_mutations[j].site,
7103-
sorted_mutations[j].node, mapped_parent, sorted_mutations[j].time,
7104-
sorted_mutations[j].derived_state, sorted_mutations[j].derived_state_length,
7105-
sorted_mutations[j].metadata, sorted_mutations[j].metadata_length);
7106-
if (ret_id < 0) {
7107-
ret = (int) ret_id;
7108-
goto out;
7109-
}
7110-
}
7111-
ret = 0;
7112-
7113-
out:
7114-
tsk_safe_free(mutation_id_map);
7115-
tsk_safe_free(sorted_mutations);
7116-
tsk_mutation_table_free(&copy);
7117-
return ret;
7118-
}
7119-
7120-
static int
7121-
tsk_table_sorter_sort_mutations_canonical(tsk_table_sorter_t *self)
71227051
{
71237052
int ret = 0;
71247053
tsk_size_t j;
71257054
tsk_id_t ret_id, parent, mapped_parent, p;
71267055
tsk_mutation_table_t *mutations = &self->tables->mutations;
7056+
tsk_node_table_t *nodes = &self->tables->nodes;
71277057
tsk_size_t num_mutations = mutations->num_rows;
71287058
tsk_mutation_table_t copy;
7129-
mutation_canonical_sort_t *sorted_mutations
7059+
mutation_sort_t *sorted_mutations
71307060
= tsk_malloc(num_mutations * sizeof(*sorted_mutations));
71317061
tsk_id_t *mutation_id_map = tsk_malloc(num_mutations * sizeof(*mutation_id_map));
71327062

@@ -7158,14 +7088,15 @@ tsk_table_sorter_sort_mutations_canonical(tsk_table_sorter_t *self)
71587088
for (j = 0; j < num_mutations; j++) {
71597089
tsk_mutation_table_get_row_unsafe(&copy, (tsk_id_t) j, &sorted_mutations[j].mut);
71607090
sorted_mutations[j].mut.site = self->site_id_map[sorted_mutations[j].mut.site];
7091+
sorted_mutations[j].node_time = nodes->time[sorted_mutations[j].mut.node];
71617092
}
71627093
ret = tsk_mutation_table_clear(mutations);
71637094
if (ret != 0) {
71647095
goto out;
71657096
}
71667097

71677098
qsort(sorted_mutations, (size_t) num_mutations, sizeof(*sorted_mutations),
7168-
cmp_mutation_canonical);
7099+
cmp_mutation);
71697100

71707101
/* Make a first pass through the sorted mutations to build the ID map. */
71717102
for (j = 0; j < num_mutations; j++) {
@@ -12245,7 +12176,7 @@ tsk_table_collection_canonicalise(tsk_table_collection_t *self, tsk_flags_t opti
1224512176
if (ret != 0) {
1224612177
goto out;
1224712178
}
12248-
sorter.sort_mutations = tsk_table_sorter_sort_mutations_canonical;
12179+
sorter.sort_mutations = tsk_table_sorter_sort_mutations;
1224912180
sorter.sort_individuals = tsk_table_sorter_sort_individuals_canonical;
1225012181

1225112182
nodes = tsk_malloc(self->nodes.num_rows * sizeof(*nodes));

python/CHANGELOG.rst

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,13 @@
2121

2222
**Bugfixes**
2323

24+
- In some tables with mutations out-of-order `TableCollection.sort` did not re-order
25+
the mutations so they formed a valid TreeSequence. `TableCollection.sort` and
26+
`TableCollection.canonicalise` now sort mutations by site, then time (if known),
27+
then the mutation's node's time, then number of descendant mutations
28+
(ensuring that parent mutations occur before children), then node, then
29+
their original order in the tables. (:user:`benjeffery`, :pr:`3257`, :issue:`3253`)
30+
2431
- Fix bug in ``TreeSequence.pair_coalescence_counts`` when ``span_normalise=True``
2532
and a window breakpoint falls within an internal missing interval.
2633
(:user:`nspope`, :pr:`3176`, :issue:`3175`)

python/tests/test_extend_haplotypes.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -482,7 +482,7 @@ def extend_haplotypes(ts, max_iter=10):
482482
extender = HaplotypeExtender(ts, forwards=forwards)
483483
extender.extend_haplotypes()
484484
tables.edges.replace_with(extender.edges)
485-
tables.sort()
485+
tables.sort(mutation_start=tables.mutations.num_rows)
486486
tables.build_index()
487487
ts = tables.tree_sequence()
488488
if ts.num_edges == last_num_edges:
@@ -493,7 +493,6 @@ def extend_haplotypes(ts, max_iter=10):
493493
tables = ts.dump_tables()
494494
mutations = _slide_mutation_nodes_up(ts, mutations)
495495
tables.mutations.replace_with(mutations)
496-
tables.sort()
497496
ts = tables.tree_sequence()
498497
return ts
499498

python/tests/test_highlevel.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3396,7 +3396,12 @@ def test_text_record_round_trip(self, ts1):
33963396
sequence_length=ts1.sequence_length,
33973397
strict=True,
33983398
)
3399-
self.verify_approximate_equality(ts1, ts2)
3399+
tables1 = ts1.tables.copy()
3400+
# load_text performs a `sort`, which changes the order relative to
3401+
# the original tree sequence
3402+
tables1.sort()
3403+
ts1_sorted = tables1.tree_sequence()
3404+
self.verify_approximate_equality(ts1_sorted, ts2)
34003405

34013406
def test_empty_files(self):
34023407
nodes_file = io.StringIO("is_sample\ttime\n")

python/tests/test_table_transforms.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -596,6 +596,11 @@ def test_genotypes_round_trip(self, ts):
596596
@pytest.mark.parametrize("ts", get_example_tree_sequences())
597597
@pytest.mark.parametrize("population", [-1, None])
598598
def test_definition(self, ts, population):
599+
# The python implementation of split_edges performs a sort,
600+
# which changes the order relative to the original tree sequence
601+
tables = ts.dump_tables()
602+
tables.sort()
603+
ts = tables.tree_sequence()
599604
time = 0 if ts.num_nodes == 0 else np.median(ts.tables.nodes.time)
600605
if ts.num_migrations == 0:
601606
ts1 = split_edges_definition(ts, time, population=population)

0 commit comments

Comments
 (0)