Skip to content

Commit 1eabf28

Browse files
authored
Merge pull request #782 from jeromekelleher/simplify-keep-path-to-ancient
Keep path to ancient in simplify
2 parents 20af34a + 6f885de commit 1eabf28

File tree

17 files changed

+1113
-229
lines changed

17 files changed

+1113
-229
lines changed

appveyor.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ build_script:
2929
- cmd: python -m pip install newick
3030
- cmd: python -m pip install python_jsonschema_objects
3131
- cmd: python -m pip install xmlunittest
32+
- cmd: python -m pip install portion
3233
- cmd: python -m nose -vs --processes=%NUMBER_OF_PROCESSORS% --process-timeout=5000
3334

3435
after_test:

c/CHANGELOG.rst

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,13 +6,19 @@
66

77
- The macro ``TSK_IMPUTE_MISSING_DATA`` is renamed to ``TSK_ISOLATED_NOT_MISSING``
88

9+
**New features**
10+
11+
- Add a ``TSK_KEEP_INPUT_ROOTS`` option to simplify which, if enabled, adds edges
12+
from the MRCAs of samples in the simplified tree sequence back to the roots
13+
in the input tree sequence (:user:`jeromekelleher`, :issue:`775`, :pr:`782`).
14+
915
---------------------
1016
[0.99.4] - 2020-08-12
1117
---------------------
1218

1319
**Note**
1420

15-
- The ``TSK_VERSION_PATCH`` macro was incorrectly set to ``4`` for 0.99.3, so both
21+
- The ``TSK_VERSION_PATCH`` macro was incorrectly set to ``4`` for 0.99.3, so both
1622
0.99.4 and 0.99.3 have the same value.
1723

1824
**Changes**
@@ -70,7 +76,7 @@
7076

7177
- New methods to perform set operations on table collections.
7278
``tsk_table_collection_subset`` subsets and reorders table collections by nodes
73-
(:user:`mufernando`, :user:`petrelharp`, :pr:`663`, :pr:`690`).
79+
(:user:`mufernando`, :user:`petrelharp`, :pr:`663`, :pr:`690`).
7480
``tsk_table_collection_union`` forms the node-wise union of two table collections
7581
(:user:`mufernando`, :user:`petrelharp`, :issue:`381`, :pr:`623`).
7682

c/tests/test_tables.c

Lines changed: 76 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2978,6 +2978,80 @@ test_copy_table_collection(void)
29782978
tsk_treeseq_free(&ts);
29792979
}
29802980

2981+
static void
2982+
test_sort_tables_offsets(void)
2983+
{
2984+
int ret;
2985+
tsk_treeseq_t *ts;
2986+
tsk_table_collection_t tables, copy;
2987+
tsk_bookmark_t bookmark;
2988+
2989+
ts = caterpillar_tree(10, 5, 5);
2990+
ret = tsk_treeseq_copy_tables(ts, &tables, 0);
2991+
CU_ASSERT_EQUAL_FATAL(ret, 0);
2992+
2993+
ret = tsk_table_collection_sort(&tables, NULL, 0);
2994+
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SORT_MIGRATIONS_NOT_SUPPORTED);
2995+
2996+
tsk_migration_table_clear(&tables.migrations);
2997+
ret = tsk_table_collection_sort(&tables, NULL, 0);
2998+
CU_ASSERT_EQUAL_FATAL(ret, 0);
2999+
3000+
/* Check that setting edge offset = len(edges) does nothing */
3001+
reverse_edges(&tables);
3002+
ret = tsk_table_collection_copy(&tables, &copy, 0);
3003+
CU_ASSERT_EQUAL_FATAL(ret, 0);
3004+
memset(&bookmark, 0, sizeof(bookmark));
3005+
bookmark.edges = tables.edges.num_rows;
3006+
ret = tsk_table_collection_sort(&tables, &bookmark, 0);
3007+
CU_ASSERT_EQUAL_FATAL(ret, 0);
3008+
CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, &copy));
3009+
3010+
ret = tsk_table_collection_sort(&tables, NULL, 0);
3011+
CU_ASSERT_EQUAL_FATAL(ret, 0);
3012+
CU_ASSERT_FATAL(tables.sites.num_rows > 2);
3013+
CU_ASSERT_FATAL(tables.mutations.num_rows > 2);
3014+
3015+
/* Check that setting mutation and site offset = to the len
3016+
* of the tables leaves them untouched. */
3017+
reverse_mutations(&tables);
3018+
/* Swap the positions of the first two sites, as a quick way
3019+
* to disorder the site table */
3020+
tables.sites.position[0] = tables.sites.position[1];
3021+
tables.sites.position[1] = 0;
3022+
ret = tsk_table_collection_copy(&tables, &copy, TSK_NO_INIT);
3023+
CU_ASSERT_EQUAL_FATAL(ret, 0);
3024+
memset(&bookmark, 0, sizeof(bookmark));
3025+
bookmark.sites = tables.sites.num_rows;
3026+
bookmark.mutations = tables.mutations.num_rows;
3027+
ret = tsk_table_collection_sort(&tables, &bookmark, 0);
3028+
CU_ASSERT_EQUAL_FATAL(ret, 0);
3029+
CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, &copy));
3030+
3031+
/* Anything other than len(table) leads to an error for sites
3032+
* and mutations, and we can't specify one without the other. */
3033+
memset(&bookmark, 0, sizeof(bookmark));
3034+
bookmark.sites = tables.sites.num_rows;
3035+
ret = tsk_table_collection_sort(&tables, &bookmark, 0);
3036+
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SORT_OFFSET_NOT_SUPPORTED);
3037+
3038+
memset(&bookmark, 0, sizeof(bookmark));
3039+
bookmark.mutations = tables.mutations.num_rows;
3040+
ret = tsk_table_collection_sort(&tables, &bookmark, 0);
3041+
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SORT_OFFSET_NOT_SUPPORTED);
3042+
3043+
memset(&bookmark, 0, sizeof(bookmark));
3044+
bookmark.sites = tables.sites.num_rows - 1;
3045+
bookmark.mutations = tables.mutations.num_rows - 1;
3046+
ret = tsk_table_collection_sort(&tables, &bookmark, 0);
3047+
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SORT_OFFSET_NOT_SUPPORTED);
3048+
3049+
tsk_table_collection_free(&tables);
3050+
tsk_table_collection_free(&copy);
3051+
tsk_treeseq_free(ts);
3052+
free(ts);
3053+
}
3054+
29813055
static void
29823056
test_sort_tables_drops_indexes_with_options(tsk_flags_t tc_options)
29833057
{
@@ -3128,7 +3202,7 @@ test_sort_tables_errors(void)
31283202
memset(&pos, 0, sizeof(pos));
31293203
pos.migrations = 1;
31303204
ret = tsk_table_collection_sort(&tables, &pos, 0);
3131-
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SORT_OFFSET_NOT_SUPPORTED);
3205+
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MIGRATIONS_NOT_SUPPORTED);
31323206

31333207
memset(&pos, 0, sizeof(pos));
31343208
pos.sites = 1;
@@ -4552,6 +4626,7 @@ main(int argc, char **argv)
45524626
test_link_ancestors_samples_and_ancestors_overlap },
45534627
{ "test_link_ancestors_multiple_to_single_tree",
45544628
test_link_ancestors_multiple_to_single_tree },
4629+
{ "test_sort_tables_offsets", test_sort_tables_offsets },
45554630
{ "test_sort_tables_drops_indexes", test_sort_tables_drops_indexes },
45564631
{ "test_sort_tables_edge_metadata", test_sort_tables_edge_metadata },
45574632
{ "test_sort_tables_no_edge_metadata", test_sort_tables_no_edge_metadata },

c/tests/test_trees.c

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2566,6 +2566,69 @@ test_simplest_reduce_site_topology(void)
25662566
tsk_table_collection_free(&tables);
25672567
}
25682568

2569+
static void
2570+
test_simplest_simplify_defragment(void)
2571+
{
2572+
const char *nodes = "0 2 -1\n"
2573+
"0 2 -1\n"
2574+
"0 2 -1\n"
2575+
"0 2 -1\n"
2576+
"0 2 -1\n"
2577+
"0 2 -1\n"
2578+
"0 1 -1\n"
2579+
"0 1 -1\n"
2580+
"0 1 -1\n"
2581+
"0 1 -1\n"
2582+
"0 1 -1\n"
2583+
"0 1 -1\n"
2584+
"1 0 -1\n"
2585+
"1 0 -1\n"
2586+
"1 0 -1\n"
2587+
"1 0 -1\n"
2588+
"1 0 -1\n"
2589+
"1 0 -1\n";
2590+
const char *edges = "0.00000000 0.20784841 8 12\n"
2591+
"0.00000000 0.42202433 8 15\n"
2592+
"0.00000000 0.63541014 8 16\n"
2593+
"0.42202433 1.00000000 9 15\n"
2594+
"0.00000000 1.00000000 9 17\n"
2595+
"0.00000000 1.00000000 10 14\n"
2596+
"0.20784841 1.00000000 11 12\n"
2597+
"0.00000000 1.00000000 11 13\n"
2598+
"0.63541014 1.00000000 11 16\n"
2599+
"0.00000000 1.00000000 0 10\n"
2600+
"0.62102072 1.00000000 1 9\n"
2601+
"0.00000000 1.00000000 1 11\n"
2602+
"0.00000000 0.26002984 2 6\n"
2603+
"0.26002984 1.00000000 2 6\n"
2604+
"0.00000000 0.62102072 2 9\n"
2605+
"0.55150554 1.00000000 3 8\n"
2606+
"0.00000000 1.00000000 4 7\n"
2607+
"0.00000000 0.55150554 5 8\n";
2608+
2609+
tsk_id_t samples[] = { 12, 13, 14, 15, 16, 17 };
2610+
tsk_table_collection_t tables;
2611+
int ret;
2612+
2613+
/* This was the simplest example I could find that exercised the
2614+
* inner loops of the simplifier_extract_ancestry function */
2615+
ret = tsk_table_collection_init(&tables, 0);
2616+
CU_ASSERT_EQUAL_FATAL(ret, 0);
2617+
tables.sequence_length = 1;
2618+
parse_nodes(nodes, &tables.nodes);
2619+
CU_ASSERT_EQUAL_FATAL(tables.nodes.num_rows, 18);
2620+
parse_edges(edges, &tables.edges);
2621+
CU_ASSERT_EQUAL_FATAL(tables.edges.num_rows, 18);
2622+
2623+
ret = tsk_table_collection_simplify(&tables, samples, 6, 0, NULL);
2624+
CU_ASSERT_EQUAL_FATAL(ret, 0);
2625+
2626+
CU_ASSERT_EQUAL(tables.nodes.num_rows, 10);
2627+
CU_ASSERT_EQUAL(tables.edges.num_rows, 10);
2628+
2629+
tsk_table_collection_free(&tables);
2630+
}
2631+
25692632
static void
25702633
test_simplest_population_filter(void)
25712634
{
@@ -3640,6 +3703,7 @@ test_single_tree_iter_depths(void)
36403703
tsk_tree_free(&tree);
36413704
tsk_treeseq_free(&ts);
36423705
}
3706+
36433707
static void
36443708
test_single_tree_simplify(void)
36453709
{
@@ -3697,6 +3761,55 @@ test_single_tree_simplify(void)
36973761
tsk_table_collection_free(&tables);
36983762
}
36993763

3764+
static void
3765+
test_single_tree_simplify_debug(void)
3766+
{
3767+
tsk_treeseq_t ts, simplified;
3768+
tsk_id_t samples[] = { 0, 1 };
3769+
int ret;
3770+
FILE *save_stdout = stdout;
3771+
FILE *tmp = fopen(_tmp_file_name, "w");
3772+
3773+
CU_ASSERT_FATAL(tmp != NULL);
3774+
tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL,
3775+
single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0);
3776+
3777+
stdout = tmp;
3778+
ret = tsk_treeseq_simplify(&ts, samples, 2, TSK_DEBUG, &simplified, NULL);
3779+
stdout = save_stdout;
3780+
CU_ASSERT_EQUAL_FATAL(ret, 0);
3781+
CU_ASSERT_TRUE(ftell(tmp) > 0);
3782+
3783+
fclose(tmp);
3784+
tsk_treeseq_free(&ts);
3785+
tsk_treeseq_free(&simplified);
3786+
}
3787+
3788+
static void
3789+
test_single_tree_simplify_keep_input_roots(void)
3790+
{
3791+
tsk_treeseq_t ts;
3792+
tsk_table_collection_t tables;
3793+
tsk_id_t samples[] = { 0, 1 };
3794+
int ret;
3795+
3796+
tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL,
3797+
single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0);
3798+
verify_simplify(&ts);
3799+
ret = tsk_treeseq_copy_tables(&ts, &tables, 0);
3800+
CU_ASSERT_EQUAL_FATAL(ret, 0);
3801+
3802+
ret = tsk_table_collection_simplify(&tables, samples, 2, TSK_KEEP_INPUT_ROOTS, NULL);
3803+
CU_ASSERT_EQUAL_FATAL(ret, 0);
3804+
CU_ASSERT_EQUAL(tables.nodes.num_rows, 4);
3805+
CU_ASSERT_EQUAL(tables.edges.num_rows, 3);
3806+
CU_ASSERT_EQUAL(tables.sites.num_rows, 3);
3807+
CU_ASSERT_EQUAL(tables.mutations.num_rows, 4);
3808+
3809+
tsk_treeseq_free(&ts);
3810+
tsk_table_collection_free(&tables);
3811+
}
3812+
37003813
static void
37013814
test_single_tree_simplify_no_sample_nodes(void)
37023815
{
@@ -4283,6 +4396,60 @@ test_nonbinary_multi_tree(void)
42834396
tsk_treeseq_free(&ts);
42844397
}
42854398

4399+
static void
4400+
test_simplify_keep_input_roots_multi_tree(void)
4401+
{
4402+
4403+
/*
4404+
0.25┊ 8 ┊ ┊ ┊
4405+
┊ ┏━┻━┓ ┊ ┊ ┊
4406+
0.20┊ ┃ ┃ ┊ ┊ 7 ┊
4407+
┊ ┃ ┃ ┊ ┊ ┏━┻━┓ ┊
4408+
0.17┊ 6 ┃ ┊ 6 ┊ ┃ ┃ ┊
4409+
┊ ┏━┻┓ ┃ ┊ ┏━┻━┓ ┊ ┃ ┃ ┊
4410+
0.09┊ ┃ 5 ┃ ┊ ┃ 5 ┊ ┃ 5 ┊
4411+
┊ ┃ ┏┻┓ ┃ ┊ ┃ ┏━┻┓ ┊ ┃ ┏━┻┓ ┊
4412+
0.07┊ ┃ ┃ ┃ ┃ ┊ ┃ ┃ 4 ┊ ┃ ┃ 4 ┊
4413+
┊ ┃ ┃ ┃ ┃ ┊ ┃ ┃ ┏┻┓ ┊ ┃ ┃ ┏┻┓ ┊
4414+
0.00┊ 0 1 3 2 ┊ 0 1 2 3 ┊ 0 1 2 3 ┊
4415+
0.00 2.00 7.00 10.00
4416+
4417+
Simplifies to
4418+
4419+
0.25┊ 4 ┊ ┊ ┊
4420+
┊ ┃ ┊ ┊ ┊
4421+
0.20┊ ┃ ┊ ┊ 3 ┊
4422+
┊ ┃ ┊ ┊ ┏┻┓ ┊
4423+
0.17┊ 2 ┊ 2 ┊ ┃ ┃ ┊
4424+
┊ ┏┻┓ ┊ ┏┻┓ ┊ ┃ ┃ ┊
4425+
0.00┊ 0 1 ┊ 0 1 ┊ 0 1 ┊
4426+
0.00 2.00 7.00 10.00
4427+
4428+
*/
4429+
int ret = 0;
4430+
// clang-format off
4431+
tsk_id_t parents[] = {
4432+
2, 2, 4, -1, -1,
4433+
2, 2, -1, -1, -1,
4434+
3, 3, -1, -1, -1,
4435+
};
4436+
// clang-format on
4437+
uint32_t num_trees = 3;
4438+
4439+
tsk_id_t samples[] = { 0, 3 };
4440+
tsk_treeseq_t ts, simplified;
4441+
4442+
tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites,
4443+
paper_ex_mutations, paper_ex_individuals, NULL, 0);
4444+
tsk_treeseq_dump(&ts, "tmp.trees", 0);
4445+
ret = tsk_treeseq_simplify(&ts, samples, 2, TSK_KEEP_INPUT_ROOTS, &simplified, NULL);
4446+
CU_ASSERT_EQUAL_FATAL(ret, 0);
4447+
verify_trees(&simplified, num_trees, parents);
4448+
4449+
tsk_treeseq_free(&ts);
4450+
tsk_treeseq_free(&simplified);
4451+
}
4452+
42864453
static void
42874454
test_left_to_right_multi_tree(void)
42884455
{
@@ -5811,6 +5978,7 @@ main(int argc, char **argv)
58115978
{ "test_simplest_overlapping_unary_edges_internal_samples_simplify",
58125979
test_simplest_overlapping_unary_edges_internal_samples_simplify },
58135980
{ "test_simplest_reduce_site_topology", test_simplest_reduce_site_topology },
5981+
{ "test_simplest_simplify_defragment", test_simplest_simplify_defragment },
58145982
{ "test_simplest_population_filter", test_simplest_population_filter },
58155983
{ "test_simplest_individual_filter", test_simplest_individual_filter },
58165984
{ "test_simplest_map_mutations", test_simplest_map_mutations },
@@ -5839,6 +6007,9 @@ main(int argc, char **argv)
58396007
{ "test_single_tree_iter_times", test_single_tree_iter_times },
58406008
{ "test_single_tree_iter_depths", test_single_tree_iter_depths },
58416009
{ "test_single_tree_simplify", test_single_tree_simplify },
6010+
{ "test_single_tree_simplify_debug", test_single_tree_simplify_debug },
6011+
{ "test_single_tree_simplify_keep_input_roots",
6012+
test_single_tree_simplify_keep_input_roots },
58426013
{ "test_single_tree_simplify_no_sample_nodes",
58436014
test_single_tree_simplify_no_sample_nodes },
58446015
{ "test_single_tree_simplify_null_samples",
@@ -5859,6 +6030,8 @@ main(int argc, char **argv)
58596030
{ "test_internal_sample_multi_tree", test_internal_sample_multi_tree },
58606031
{ "test_internal_sample_simplified_multi_tree",
58616032
test_internal_sample_simplified_multi_tree },
6033+
{ "test_simplify_keep_input_roots_multi_tree",
6034+
test_simplify_keep_input_roots_multi_tree },
58626035
{ "test_left_to_right_multi_tree", test_left_to_right_multi_tree },
58636036
{ "test_gappy_multi_tree", test_gappy_multi_tree },
58646037
{ "test_tsk_treeseq_bad_records", test_tsk_treeseq_bad_records },

c/tskit/core.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -351,8 +351,9 @@ tsk_strerror_internal(int err)
351351
ret = "Migrations not currently supported by this operation";
352352
break;
353353
case TSK_ERR_SORT_OFFSET_NOT_SUPPORTED:
354-
ret = "Specifying position for mutation, sites or migrations is not "
355-
"supported";
354+
ret = "Sort offsets for sites and mutations must be either 0 "
355+
"or the length of the respective tables. Intermediate values "
356+
"are not supported";
356357
break;
357358
case TSK_ERR_NONBINARY_MUTATIONS_UNSUPPORTED:
358359
ret = "Only binary mutations are supported for this operation";

0 commit comments

Comments
 (0)