Skip to content

Commit f2d906d

Browse files
authored
Merge pull request #1119 from hyanwong/keep_unary_in_individuals
Add KEEP_UNARY_IN_INDIVIDUALS option to simplify()
2 parents b6cee42 + 929f4d2 commit f2d906d

File tree

6 files changed

+129
-2
lines changed

6 files changed

+129
-2
lines changed

c/CHANGELOG.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,10 @@
66

77
**Features**
88

9+
- Add ``TSK_KEEP_UNARY_IN_INDIVIDUALS`` flag to simplify, which allows the user to
10+
keep unary nodes only if they belong to a tabled individual. This is useful for
11+
simplification in forwards simulations (:user:`hyanwong`, :issue:`1113`, :pr:`1119`).
12+
913
**Bugfixes**
1014

1115

c/tests/test_trees.c

Lines changed: 97 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -940,11 +940,22 @@ test_simplest_records(void)
940940
CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0));
941941
tsk_treeseq_free(&simplified);
942942

943+
ret = tsk_treeseq_simplify(&ts, sample_ids, 2,
944+
TSK_KEEP_UNARY | TSK_KEEP_UNARY_IN_INDIVIDUALS, &simplified, NULL);
945+
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_KEEP_UNARY_MUTUALLY_EXCLUSIVE);
946+
tsk_treeseq_free(&simplified);
947+
943948
ret = tsk_treeseq_simplify(&ts, sample_ids, 2, TSK_KEEP_UNARY, &simplified, NULL);
944949
CU_ASSERT_EQUAL_FATAL(ret, 0);
945950
CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0));
946951
tsk_treeseq_free(&simplified);
947952

953+
ret = tsk_treeseq_simplify(
954+
&ts, sample_ids, 2, TSK_KEEP_UNARY_IN_INDIVIDUALS, &simplified, NULL);
955+
CU_ASSERT_EQUAL_FATAL(ret, 0);
956+
CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0));
957+
tsk_treeseq_free(&simplified);
958+
948959
tsk_treeseq_free(&ts);
949960
}
950961

@@ -978,6 +989,12 @@ test_simplest_nonbinary_records(void)
978989
CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0));
979990
tsk_treeseq_free(&simplified);
980991

992+
ret = tsk_treeseq_simplify(
993+
&ts, sample_ids, 4, TSK_KEEP_UNARY_IN_INDIVIDUALS, &simplified, NULL);
994+
CU_ASSERT_EQUAL_FATAL(ret, 0);
995+
CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0));
996+
tsk_treeseq_free(&simplified);
997+
981998
tsk_treeseq_free(&ts);
982999
}
9831000

@@ -993,7 +1010,7 @@ test_simplest_unary_records(void)
9931010
const char *edges = "0 1 2 0\n"
9941011
"0 1 3 1\n"
9951012
"0 1 4 2,3\n";
996-
tsk_treeseq_t ts, simplified;
1013+
tsk_treeseq_t ts, simplified, simplified_other;
9971014
tsk_id_t sample_ids[] = { 0, 1 };
9981015

9991016
tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0);
@@ -1019,6 +1036,84 @@ test_simplest_unary_records(void)
10191036
CU_ASSERT_TRUE(tsk_table_collection_equals(ts.tables, simplified.tables, 0));
10201037
tsk_treeseq_free(&simplified);
10211038

1039+
ret = tsk_treeseq_simplify(
1040+
&ts, sample_ids, 2, TSK_KEEP_UNARY_IN_INDIVIDUALS, &simplified, NULL);
1041+
CU_ASSERT_EQUAL_FATAL(ret, 0);
1042+
CU_ASSERT_FALSE(tsk_table_collection_equals(ts.tables, simplified.tables, 0));
1043+
ret = tsk_treeseq_simplify(&ts, sample_ids, 2, 0, &simplified_other, NULL);
1044+
CU_ASSERT_EQUAL_FATAL(ret, 0);
1045+
CU_ASSERT_TRUE(
1046+
tsk_table_collection_equals(simplified.tables, simplified_other.tables, 0));
1047+
tsk_treeseq_free(&simplified);
1048+
tsk_treeseq_free(&simplified_other);
1049+
1050+
tsk_treeseq_free(&ts);
1051+
}
1052+
1053+
static void
1054+
test_simplest_unary_with_individuals(void)
1055+
{
1056+
int ret;
1057+
const char *nodes = "1 0 0 -1\n"
1058+
"1 0 0 0\n"
1059+
"0 1 0 -1\n"
1060+
"0 1 0 1\n"
1061+
"0 2 0 -1\n"
1062+
"0 3 0 -1\n"
1063+
"0 3 0 2\n"
1064+
"0 1 0 -1\n"
1065+
"0 1 0 3\n"
1066+
"0 0 0 -1\n"
1067+
"0 0 0 4\n"
1068+
"0 1 0 3\n";
1069+
const char *edges = "0 2 2 0\n"
1070+
"0 2 3 1\n"
1071+
"2 3 7 0\n"
1072+
"2 3 8 1,9\n"
1073+
"2 3 11 10\n"
1074+
"0 2 4 2,3\n"
1075+
"0 1 5 4\n"
1076+
"1 2 6 4\n";
1077+
const char *individuals = "0 0.5\n"
1078+
"0 1.5,3.1\n"
1079+
"0 2.1\n"
1080+
"0 3.2\n"
1081+
"0 4.2\n";
1082+
const char *nodes_expect = "1 0 0 -1\n"
1083+
"1 0 0 0\n"
1084+
"0 1 0 1\n"
1085+
"0 1 0 3\n"
1086+
"0 2 0 -1\n"
1087+
"0 3 0 2\n";
1088+
const char *edges_expect = "0 2 2 1\n"
1089+
"2 3 3 1\n"
1090+
"0 2 4 0,2\n"
1091+
"1 2 5 4\n";
1092+
const char *individuals_expect = "0 0.5\n"
1093+
"0 1.5,3.1\n"
1094+
"0 2.1\n"
1095+
"0 3.2\n";
1096+
tsk_treeseq_t ts, simplified, expected;
1097+
tsk_id_t sample_ids[] = { 0, 1 };
1098+
1099+
tsk_treeseq_from_text(&ts, 3, nodes, edges, NULL, NULL, NULL, individuals, NULL, 0);
1100+
tsk_treeseq_from_text(&expected, 3, nodes_expect, edges_expect, NULL, NULL, NULL,
1101+
individuals_expect, NULL, 0);
1102+
CU_ASSERT_EQUAL(tsk_treeseq_get_num_samples(&ts), 2);
1103+
CU_ASSERT_EQUAL(tsk_treeseq_get_sequence_length(&ts), 3.0);
1104+
CU_ASSERT_EQUAL(tsk_treeseq_get_num_nodes(&ts), 12);
1105+
CU_ASSERT_EQUAL(tsk_treeseq_get_num_individuals(&ts), 5);
1106+
CU_ASSERT_EQUAL(tsk_treeseq_get_num_mutations(&ts), 0);
1107+
CU_ASSERT_EQUAL(tsk_treeseq_get_num_trees(&ts), 3);
1108+
CU_ASSERT_EQUAL(tsk_treeseq_get_num_populations(&ts), 1);
1109+
1110+
ret = tsk_treeseq_simplify(&ts, sample_ids, 2,
1111+
TSK_KEEP_UNARY_IN_INDIVIDUALS | TSK_FILTER_INDIVIDUALS, &simplified, NULL);
1112+
CU_ASSERT_EQUAL_FATAL(ret, 0);
1113+
CU_ASSERT_TRUE(tsk_table_collection_equals(simplified.tables, expected.tables, 0));
1114+
tsk_treeseq_free(&simplified);
1115+
1116+
tsk_treeseq_free(&expected);
10221117
tsk_treeseq_free(&ts);
10231118
}
10241119

@@ -5987,6 +6082,7 @@ main(int argc, char **argv)
59876082
{ "test_simplest_records", test_simplest_records },
59886083
{ "test_simplest_nonbinary_records", test_simplest_nonbinary_records },
59896084
{ "test_simplest_unary_records", test_simplest_unary_records },
6085+
{ "test_simplest_unary_with_individuals", test_simplest_unary_with_individuals },
59906086
{ "test_simplest_non_sample_leaf_records",
59916087
test_simplest_non_sample_leaf_records },
59926088
{ "test_simplest_degenerate_multiple_root_records",

c/tskit/core.c

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -470,6 +470,11 @@ tsk_strerror_internal(int err)
470470
case TSK_ERR_DUPLICATE_SAMPLE_PAIRS:
471471
ret = "There are duplicate sample pairs.";
472472
break;
473+
474+
/* Simplify errors */
475+
case TSK_ERR_KEEP_UNARY_MUTUALLY_EXCLUSIVE:
476+
ret = "You cannot specify both KEEP_UNARY and KEEP_UNARY_IN_INDIVDUALS.";
477+
break;
473478
}
474479
return ret;
475480
}

c/tskit/core.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -320,6 +320,9 @@ not found in the file.
320320
#define TSK_ERR_NO_SAMPLE_PAIRS -1500
321321
#define TSK_ERR_DUPLICATE_SAMPLE_PAIRS -1501
322322

323+
/* Simplify errors */
324+
#define TSK_ERR_KEEP_UNARY_MUTUALLY_EXCLUSIVE -1600
325+
323326
// clang-format on
324327

325328
/* This bit is 0 for any errors originating from kastore */

c/tskit/tables.c

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6094,6 +6094,8 @@ simplifier_print_state(simplifier_t *self, FILE *out)
60946094
fprintf(out, "\tkeep_unary : %d\n", !!(self->options & TSK_KEEP_UNARY));
60956095
fprintf(out, "\tkeep_input_roots : %d\n",
60966096
!!(self->options & TSK_KEEP_INPUT_ROOTS));
6097+
fprintf(out, "\tkeep_unary_in_individuals : %d\n",
6098+
!!(self->options & TSK_KEEP_UNARY_IN_INDIVIDUALS));
60976099

60986100
fprintf(out, "===\nInput tables\n==\n");
60996101
tsk_table_collection_print_state(&self->input_tables, out);
@@ -6635,8 +6637,16 @@ simplifier_merge_ancestors(simplifier_t *self, tsk_id_t input_id)
66356637
double left, right, prev_right;
66366638
tsk_id_t ancestry_node;
66376639
tsk_id_t output_id = self->node_id_map[input_id];
6640+
66386641
bool is_sample = output_id != TSK_NULL;
6639-
bool keep_unary = !!(self->options & TSK_KEEP_UNARY);
6642+
bool keep_unary = false;
6643+
if (self->options & TSK_KEEP_UNARY) {
6644+
keep_unary = true;
6645+
}
6646+
if ((self->options & TSK_KEEP_UNARY_IN_INDIVIDUALS)
6647+
&& (self->tables->nodes.individual[input_id] != TSK_NULL)) {
6648+
keep_unary = true;
6649+
}
66406650

66416651
if (is_sample) {
66426652
/* Free up the existing ancestry mapping. */
@@ -8546,6 +8556,11 @@ tsk_table_collection_simplify(tsk_table_collection_t *self, const tsk_id_t *samp
85468556
/* Avoid calling to simplifier_free with uninit'd memory on error branches */
85478557
memset(&simplifier, 0, sizeof(simplifier_t));
85488558

8559+
if ((options & TSK_KEEP_UNARY) && (options & TSK_KEEP_UNARY_IN_INDIVIDUALS)) {
8560+
ret = TSK_ERR_KEEP_UNARY_MUTUALLY_EXCLUSIVE;
8561+
goto out;
8562+
}
8563+
85498564
/* For now we don't bother with edge metadata, but it can easily be
85508565
* implemented. */
85518566
if (self->edges.metadata_length > 0) {

c/tskit/tables.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -693,6 +693,7 @@ typedef struct {
693693
#define TSK_REDUCE_TO_SITE_TOPOLOGY (1 << 3)
694694
#define TSK_KEEP_UNARY (1 << 4)
695695
#define TSK_KEEP_INPUT_ROOTS (1 << 5)
696+
#define TSK_KEEP_UNARY_IN_INDIVIDUALS (1 << 6)
696697

697698
/* Flags for check_integrity */
698699
#define TSK_CHECK_EDGE_ORDERING (1 << 0)
@@ -2671,6 +2672,9 @@ TSK_KEEP_INPUT_ROOTS
26712672
By default simplify removes all topology ancestral the MRCAs of the samples.
26722673
This option inserts edges from these MRCAs back to the roots of the input
26732674
trees.
2675+
TSK_KEEP_UNARY_IN_INDIVDUALS
2676+
This acts like TSK_KEEP_UNARY (and is mutually exclusive with that flag). It
2677+
keeps unary nodes, but only if the unary node is referenced from an individual.
26742678
26752679
.. note:: Migrations are currently not supported by simplify, and an error will
26762680
be raised if we attempt call simplify on a table collection with greater

0 commit comments

Comments
 (0)