Skip to content

Commit edb3df2

Browse files
molpopgenmergify[bot]
authored andcommitted
Implement efficient initialization of null trees.
* Add seeking to a tree by index. Closes #2659
1 parent a8e9f67 commit edb3df2

File tree

4 files changed

+217
-15
lines changed

4 files changed

+217
-15
lines changed

c/tests/test_trees.c

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6140,10 +6140,16 @@ test_seek_multi_tree(void)
61406140
ret = tsk_tree_seek(&t, breakpoints[j], 0);
61416141
CU_ASSERT_EQUAL_FATAL(ret, 0);
61426142
CU_ASSERT_EQUAL_FATAL(t.index, j);
6143+
ret = tsk_tree_seek_index(&t, j, 0);
6144+
CU_ASSERT_EQUAL_FATAL(ret, 0);
6145+
CU_ASSERT_EQUAL_FATAL(t.index, j);
61436146
for (k = 0; k < num_trees; k++) {
61446147
ret = tsk_tree_seek(&t, breakpoints[k], 0);
61456148
CU_ASSERT_EQUAL_FATAL(ret, 0);
61466149
CU_ASSERT_EQUAL_FATAL(t.index, k);
6150+
ret = tsk_tree_seek_index(&t, k, 0);
6151+
CU_ASSERT_EQUAL_FATAL(ret, 0);
6152+
CU_ASSERT_EQUAL_FATAL(t.index, k);
61476153
}
61486154
}
61496155

@@ -6205,6 +6211,10 @@ test_seek_errors(void)
62056211
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SEEK_OUT_OF_BOUNDS);
62066212
ret = tsk_tree_seek(&t, 11, 0);
62076213
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SEEK_OUT_OF_BOUNDS);
6214+
ret = tsk_tree_seek_index(&t, (tsk_id_t) ts.num_trees, 0);
6215+
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SEEK_OUT_OF_BOUNDS);
6216+
ret = tsk_tree_seek_index(&t, -1, 0);
6217+
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SEEK_OUT_OF_BOUNDS);
62086218

62096219
tsk_tree_free(&t);
62106220
tsk_treeseq_free(&ts);

c/tskit/trees.c

Lines changed: 146 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4549,19 +4549,138 @@ tsk_tree_position_in_interval(const tsk_tree_t *self, double x)
45494549
return self->interval.left <= x && x < self->interval.right;
45504550
}
45514551

4552-
int TSK_WARN_UNUSED
4553-
tsk_tree_seek(tsk_tree_t *self, double x, tsk_flags_t TSK_UNUSED(options))
4552+
/* NOTE:
4553+
*
4554+
* Notes from Kevin Thornton:
4555+
*
4556+
* This method inserts the edges for an arbitrary tree
4557+
* in linear time and requires no additional memory.
4558+
*
4559+
* During design, the following alternatives were tested
4560+
* (in a combination of rust + C):
4561+
* 1. Indexing edge insertion/removal locations by tree.
4562+
* The indexing can be done in O(n) time, giving O(1)
4563+
* access to the first edge in a tree. We can then add
4564+
* edges to the tree in O(e) time, where e is the number
4565+
* of edges. This apparoach requires O(n) additional memory
4566+
* and is only marginally faster than the implementation below.
4567+
* 2. Building an interval tree mapping edge id -> span.
4568+
* This approach adds a lot of complexity and wasn't any faster
4569+
* than the indexing described above.
4570+
*/
4571+
static int
4572+
tsk_tree_seek_from_null(tsk_tree_t *self, double x, tsk_flags_t TSK_UNUSED(options))
45544573
{
45554574
int ret = 0;
4575+
tsk_size_t edge;
4576+
tsk_id_t p, c, e, j, k, tree_index;
45564577
const double L = tsk_treeseq_get_sequence_length(self->tree_sequence);
4557-
const double t_l = self->interval.left;
4558-
const double t_r = self->interval.right;
4559-
double distance_left, distance_right;
4578+
const tsk_treeseq_t *treeseq = self->tree_sequence;
4579+
const tsk_table_collection_t *tables = treeseq->tables;
4580+
const tsk_id_t *restrict edge_parent = tables->edges.parent;
4581+
const tsk_id_t *restrict edge_child = tables->edges.child;
4582+
const tsk_size_t num_edges = tables->edges.num_rows;
4583+
const tsk_size_t num_trees = self->tree_sequence->num_trees;
4584+
const double *restrict edge_left = tables->edges.left;
4585+
const double *restrict edge_right = tables->edges.right;
4586+
const double *restrict breakpoints = treeseq->breakpoints;
4587+
const tsk_id_t *restrict insertion = tables->indexes.edge_insertion_order;
4588+
const tsk_id_t *restrict removal = tables->indexes.edge_removal_order;
4589+
4590+
// NOTE: it may be better to get the
4591+
// index first and then ask if we are
4592+
// searching in the first or last 1/2
4593+
// of trees.
4594+
j = -1;
4595+
if (x <= L / 2.0) {
4596+
for (edge = 0; edge < num_edges; edge++) {
4597+
e = insertion[edge];
4598+
if (edge_left[e] > x) {
4599+
j = (tsk_id_t) edge;
4600+
break;
4601+
}
4602+
if (x >= edge_left[e] && x < edge_right[e]) {
4603+
p = edge_parent[e];
4604+
c = edge_child[e];
4605+
tsk_tree_insert_edge(self, p, c, e);
4606+
}
4607+
}
4608+
} else {
4609+
for (edge = 0; edge < num_edges; edge++) {
4610+
e = removal[num_edges - edge - 1];
4611+
if (edge_right[e] < x) {
4612+
j = (tsk_id_t)(num_edges - edge - 1);
4613+
while (j < (tsk_id_t) num_edges && edge_left[insertion[j]] <= x) {
4614+
j++;
4615+
}
4616+
break;
4617+
}
4618+
if (x >= edge_left[e] && x < edge_right[e]) {
4619+
p = edge_parent[e];
4620+
c = edge_child[e];
4621+
tsk_tree_insert_edge(self, p, c, e);
4622+
}
4623+
}
4624+
}
45604625

4561-
if (x < 0 || x >= L) {
4626+
if (j == -1) {
4627+
j = 0;
4628+
while (j < (tsk_id_t) num_edges && edge_left[insertion[j]] <= x) {
4629+
j++;
4630+
}
4631+
}
4632+
k = 0;
4633+
while (k < (tsk_id_t) num_edges && edge_right[removal[k]] <= x) {
4634+
k++;
4635+
}
4636+
4637+
/* NOTE: tsk_search_sorted finds the first the first
4638+
* insertion locatiom >= the query point, which
4639+
* finds a RIGHT value for queries not at the left edge.
4640+
*/
4641+
tree_index = (tsk_id_t) tsk_search_sorted(breakpoints, num_trees + 1, x);
4642+
if (breakpoints[tree_index] > x) {
4643+
tree_index -= 1;
4644+
}
4645+
self->index = tree_index;
4646+
self->interval.left = breakpoints[tree_index];
4647+
self->interval.right = breakpoints[tree_index + 1];
4648+
self->left_index = j;
4649+
self->right_index = k;
4650+
self->direction = TSK_DIR_FORWARD;
4651+
self->num_nodes = tables->nodes.num_rows;
4652+
if (tables->sites.num_rows > 0) {
4653+
self->sites = treeseq->tree_sites[self->index];
4654+
self->sites_length = treeseq->tree_sites_length[self->index];
4655+
}
4656+
4657+
return ret;
4658+
}
4659+
4660+
int TSK_WARN_UNUSED
4661+
tsk_tree_seek_index(tsk_tree_t *self, tsk_id_t tree, tsk_flags_t options)
4662+
{
4663+
int ret = 0;
4664+
double x;
4665+
4666+
if (tree < 0 || tree >= (tsk_id_t) self->tree_sequence->num_trees) {
45624667
ret = TSK_ERR_SEEK_OUT_OF_BOUNDS;
45634668
goto out;
45644669
}
4670+
x = self->tree_sequence->breakpoints[tree];
4671+
ret = tsk_tree_seek(self, x, options);
4672+
out:
4673+
return ret;
4674+
}
4675+
4676+
static int TSK_WARN_UNUSED
4677+
tsk_tree_seek_linear(tsk_tree_t *self, double x, tsk_flags_t TSK_UNUSED(options))
4678+
{
4679+
const double L = tsk_treeseq_get_sequence_length(self->tree_sequence);
4680+
const double t_l = self->interval.left;
4681+
const double t_r = self->interval.right;
4682+
int ret = 0;
4683+
double distance_left, distance_right;
45654684

45664685
if (x < t_l) {
45674686
/* |-----|-----|========|---------| */
@@ -4594,6 +4713,27 @@ tsk_tree_seek(tsk_tree_t *self, double x, tsk_flags_t TSK_UNUSED(options))
45944713
return ret;
45954714
}
45964715

4716+
int TSK_WARN_UNUSED
4717+
tsk_tree_seek(tsk_tree_t *self, double x, tsk_flags_t options)
4718+
{
4719+
int ret = 0;
4720+
const double L = tsk_treeseq_get_sequence_length(self->tree_sequence);
4721+
4722+
if (x < 0 || x >= L) {
4723+
ret = TSK_ERR_SEEK_OUT_OF_BOUNDS;
4724+
goto out;
4725+
}
4726+
4727+
if (self->index == -1) {
4728+
ret = tsk_tree_seek_from_null(self, x, options);
4729+
} else {
4730+
ret = tsk_tree_seek_linear(self, x, options);
4731+
}
4732+
4733+
out:
4734+
return ret;
4735+
}
4736+
45974737
int TSK_WARN_UNUSED
45984738
tsk_tree_clear(tsk_tree_t *self)
45994739
{

c/tskit/trees.h

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1192,12 +1192,6 @@ we will have ``position < tree.interval.right``.
11921192
11931193
Seeking to a position currently covered by the tree is
11941194
a constant time operation.
1195-
1196-
.. warning::
1197-
The current implementation of ``seek`` does **not** provide efficient
1198-
random access to arbitrary positions along the genome. However,
1199-
sequentially seeking in either direction is as efficient as calling
1200-
:c:func:`tsk_tree_next` or :c:func:`tsk_tree_prev` directly.
12011195
@endrst
12021196
12031197
@param self A pointer to an initialised tsk_tree_t object.
@@ -1208,6 +1202,22 @@ a constant time operation.
12081202
*/
12091203
int tsk_tree_seek(tsk_tree_t *self, double position, tsk_flags_t options);
12101204

1205+
/**
1206+
@brief Seek to a specific tree in a tree sequence.
1207+
1208+
@rst
1209+
Set the state of this tree to reflect the tree in parent
1210+
tree sequence whose index is ``0 <= tree < num_trees``.
1211+
@endrst
1212+
1213+
@param self A pointer to an initialised tsk_tree_t object.
1214+
@param tree The target tree index.
1215+
@param options Seek options. Currently unused. Set to 0 for compatibility
1216+
with future versions of tskit.
1217+
@return Return 0 on success or a negative value on failure.
1218+
*/
1219+
int tsk_tree_seek_index(tsk_tree_t *self, tsk_id_t tree, tsk_flags_t options);
1220+
12111221
/** @} */
12121222

12131223
/**

python/tests/test_highlevel.py

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4598,10 +4598,13 @@ def test_index_from_different_directions(self, index):
45984598
t2.prev()
45994599
assert_same_tree_different_order(t1, t2)
46004600

4601-
def test_seek_0_from_null(self):
4601+
@pytest.mark.parametrize("position", [0, 1, 2, 3])
4602+
def test_seek_from_null(self, position):
46024603
t1, t2 = self.setup()
4603-
t1.first()
4604-
t2.seek(0)
4604+
t1.clear()
4605+
t1.seek(position)
4606+
t2.first()
4607+
t2.seek(position)
46054608
assert_trees_identical(t1, t2)
46064609

46074610
@pytest.mark.parametrize("index", range(3))
@@ -4654,6 +4657,14 @@ def test_seek_3_from_null(self):
46544657
t2.seek(3)
46554658
assert_trees_identical(t1, t2)
46564659

4660+
def test_seek_3_from_null_prev(self):
4661+
t1, t2 = self.setup()
4662+
t1.last()
4663+
t1.prev()
4664+
t2.seek(3)
4665+
t2.prev()
4666+
assert_trees_identical(t1, t2)
4667+
46574668
def test_seek_3_from_0(self):
46584669
t1, t2 = self.setup()
46594670
t1.last()
@@ -4669,6 +4680,37 @@ def test_seek_0_from_3(self):
46694680
t2.seek(0)
46704681
assert_trees_identical(t1, t2)
46714682

4683+
@pytest.mark.parametrize("ts", get_example_tree_sequences())
4684+
def test_seek_mid_null_and_middle(self, ts):
4685+
breakpoints = ts.breakpoints(as_array=True)
4686+
mid = breakpoints[:-1] + np.diff(breakpoints) / 2
4687+
for index, x in enumerate(mid[:-1]):
4688+
t1 = tskit.Tree(ts)
4689+
t1.seek(x)
4690+
# Also seek to this point manually to make sure we're not
4691+
# reusing the seek from null under the hood.
4692+
t2 = tskit.Tree(ts)
4693+
if index <= ts.num_trees / 2:
4694+
while t2.index != index:
4695+
t2.next()
4696+
else:
4697+
while t2.index != index:
4698+
t2.prev()
4699+
assert t1.index == t2.index
4700+
assert np.all(t1.parent_array == t2.parent_array)
4701+
4702+
@pytest.mark.parametrize("ts", get_example_tree_sequences())
4703+
def test_seek_last_then_prev(self, ts):
4704+
t1 = tskit.Tree(ts)
4705+
t1.seek(ts.sequence_length - 0.00001)
4706+
assert t1.index == ts.num_trees - 1
4707+
t2 = tskit.Tree(ts)
4708+
t2.prev()
4709+
assert_trees_identical(t1, t2)
4710+
t1.prev()
4711+
t2.prev()
4712+
assert_trees_identical(t1, t2)
4713+
46724714

46734715
class TestSeek:
46744716
@pytest.mark.parametrize("ts", get_example_tree_sequences())

0 commit comments

Comments
 (0)