Skip to content

Commit e54f7b4

Browse files
Wilder Wohnsjeromekelleher
authored andcommitted
added c implementation of kc distance
1 parent fffa7ae commit e54f7b4

File tree

5 files changed

+338
-0
lines changed

5 files changed

+338
-0
lines changed

c/meson.build

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,8 @@ if not meson.is_subproject()
8989
sources: ['examples/tree_iteration.c'], link_with: [tskit_lib], dependencies: lib_deps)
9090
executable('tree_traversal',
9191
sources: ['examples/tree_traversal.c'], link_with: [tskit_lib], dependencies: lib_deps)
92+
executable('kc_distance',
93+
sources: ['examples/kc_distance.c'], link_with: [tskit_lib], dependencies: lib_deps)
9294

9395
gsl_dep = dependency('gsl', required: false)
9496
if gsl_dep.found()

c/tests/test_trees.c

Lines changed: 193 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4420,6 +4420,192 @@ test_internal_sample_sample_sets(void)
44204420
}
44214421

44224422

4423+
4424+
/*=======================================================
4425+
* KC Distance tests.
4426+
*=======================================================*/
4427+
4428+
static void
4429+
test_single_tree_kc(void)
4430+
{
4431+
int ret;
4432+
tsk_treeseq_t ts;
4433+
tsk_tree_t t, other_t;
4434+
double result = 0;
4435+
4436+
tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges,
4437+
NULL, NULL, NULL, NULL, NULL);
4438+
ret = tsk_tree_init(&t, &ts, 0);
4439+
CU_ASSERT_EQUAL_FATAL(ret, 0);
4440+
ret = tsk_tree_first(&t);
4441+
CU_ASSERT_EQUAL_FATAL(ret, 1);
4442+
ret = tsk_tree_init(&other_t, &ts, 0);
4443+
CU_ASSERT_EQUAL_FATAL(ret, 0);
4444+
ret = tsk_tree_first(&other_t);
4445+
CU_ASSERT_EQUAL_FATAL(ret, 1);
4446+
ret = tsk_tree_copy(&t, &other_t, TSK_NO_INIT);
4447+
CU_ASSERT_EQUAL_FATAL(ret, 0);
4448+
check_trees_identical(&t, &other_t);
4449+
ret = tsk_tree_kc_distance(&t, &other_t, 0, &result);
4450+
CU_ASSERT_EQUAL_FATAL(ret, 0);
4451+
CU_ASSERT_EQUAL_FATAL(result, 0);
4452+
tsk_treeseq_free(&ts);
4453+
tsk_tree_free(&t);
4454+
tsk_tree_free(&other_t);
4455+
}
4456+
4457+
static void
4458+
test_two_trees_kc(void)
4459+
{
4460+
const char *nodes =
4461+
"1 0 0\n"
4462+
"1 0 0\n"
4463+
"1 0 0\n"
4464+
"0 2 0\n"
4465+
"0 3 0\n";
4466+
const char *nodes_other =
4467+
"1 0 0\n"
4468+
"1 0 0\n"
4469+
"1 0 0\n"
4470+
"0 4 0\n"
4471+
"0 6 0\n";
4472+
const char *edges =
4473+
"0 1 3 0,1\n"
4474+
"0 1 4 2,3\n";
4475+
int ret;
4476+
tsk_treeseq_t ts, other_ts;
4477+
tsk_tree_t t, other_t;
4478+
double result = 0;
4479+
4480+
tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL);
4481+
ret = tsk_tree_init(&t, &ts, 0);
4482+
CU_ASSERT_EQUAL_FATAL(ret, 0);
4483+
ret = tsk_tree_first(&t);
4484+
CU_ASSERT_EQUAL_FATAL(ret, 1);
4485+
tsk_treeseq_from_text(&other_ts, 1, nodes_other, edges, NULL, NULL, NULL, NULL, NULL);
4486+
ret = tsk_tree_init(&other_t, &other_ts, 0);
4487+
CU_ASSERT_EQUAL_FATAL(ret, 0);
4488+
ret = tsk_tree_first(&other_t);
4489+
CU_ASSERT_EQUAL_FATAL(ret, 1);
4490+
ret = tsk_tree_kc_distance(&t, &other_t, 0, &result);
4491+
CU_ASSERT_EQUAL_FATAL(ret, 0);
4492+
CU_ASSERT_EQUAL_FATAL(result, 0);
4493+
ret = tsk_tree_kc_distance(&t, &other_t, 1, &result);
4494+
CU_ASSERT_DOUBLE_EQUAL_FATAL(result, 4.243, 1e-2);
4495+
tsk_treeseq_free(&ts);
4496+
tsk_treeseq_free(&other_ts);
4497+
tsk_tree_free(&t);
4498+
tsk_tree_free(&other_t);
4499+
}
4500+
4501+
static void
4502+
test_empty_tree_kc(void)
4503+
{
4504+
tsk_treeseq_t ts;
4505+
tsk_table_collection_t tables;
4506+
tsk_tree_t t;
4507+
tsk_id_t v;
4508+
int ret;
4509+
double result = 0;
4510+
4511+
ret = tsk_table_collection_init(&tables, 0);
4512+
CU_ASSERT_EQUAL_FATAL(ret, 0);
4513+
ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES);
4514+
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_SEQUENCE_LENGTH);
4515+
tsk_treeseq_free(&ts);
4516+
tables.sequence_length = 1.0;
4517+
ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES);
4518+
CU_ASSERT_EQUAL_FATAL(ret, 0);
4519+
4520+
verify_empty_tree_sequence(&ts, 1.0);
4521+
4522+
ret = tsk_tree_init(&t, &ts, 0);
4523+
CU_ASSERT_EQUAL_FATAL(ret, 0);
4524+
ret = tsk_tree_first(&t);
4525+
CU_ASSERT_EQUAL_FATAL(ret, 1);
4526+
CU_ASSERT_EQUAL_FATAL(t.left_root, TSK_NULL);
4527+
CU_ASSERT_EQUAL_FATAL(t.left, 0);
4528+
CU_ASSERT_EQUAL_FATAL(t.right, 1);
4529+
CU_ASSERT_EQUAL_FATAL(tsk_tree_get_parent(&t, 0, &v), TSK_ERR_NODE_OUT_OF_BOUNDS);
4530+
4531+
ret = tsk_tree_kc_distance(&t, &t, 0, &result);
4532+
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE);
4533+
4534+
tsk_tree_free(&t);
4535+
tsk_treeseq_free(&ts);
4536+
tsk_table_collection_free(&tables);
4537+
}
4538+
4539+
static void
4540+
test_nonbinary_tree_kc(void)
4541+
{
4542+
const char *nodes =
4543+
"1 0 0\n"
4544+
"1 0 0\n"
4545+
"1 0 0\n"
4546+
"1 0 0\n"
4547+
"0 1 0";
4548+
const char *edges =
4549+
"0 1 4 0,1,2,3\n";
4550+
tsk_treeseq_t ts;
4551+
tsk_tree_t t;
4552+
int ret;
4553+
double result = 0;
4554+
4555+
tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL);
4556+
ret = tsk_tree_init(&t, &ts, 0);
4557+
CU_ASSERT_EQUAL_FATAL(ret, 0);
4558+
ret = tsk_tree_first(&t);
4559+
CU_ASSERT_EQUAL_FATAL(ret, 1);
4560+
tsk_tree_kc_distance(&t, &t, 0, &result);
4561+
CU_ASSERT_EQUAL_FATAL(result, 0);
4562+
tsk_treeseq_free(&ts);
4563+
tsk_tree_free(&t);
4564+
}
4565+
4566+
static void
4567+
test_unequal_samples_kc(void)
4568+
{
4569+
const char *nodes =
4570+
"1 0 0\n"
4571+
"1 0 0\n"
4572+
"1 0 0\n"
4573+
"0 2 0\n"
4574+
"0 3 0\n";
4575+
const char *nodes_other =
4576+
"1 0 0\n"
4577+
"1 0 0\n"
4578+
"0 1 0\n";
4579+
const char *edges =
4580+
"0 1 3 0,1\n"
4581+
"0 1 4 2,3\n";
4582+
const char *edges_other =
4583+
"0 1 2 0,1\n";
4584+
int ret;
4585+
tsk_treeseq_t ts, other_ts;
4586+
tsk_tree_t t, other_t;
4587+
double result = 0;
4588+
4589+
tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL);
4590+
ret = tsk_tree_init(&t, &ts, 0);
4591+
CU_ASSERT_EQUAL_FATAL(ret, 0);
4592+
ret = tsk_tree_first(&t);
4593+
CU_ASSERT_EQUAL_FATAL(ret, 1);
4594+
tsk_treeseq_from_text(&other_ts, 1, nodes_other, edges_other, NULL, NULL, NULL, NULL, NULL);
4595+
ret = tsk_tree_init(&other_t, &other_ts, 0);
4596+
CU_ASSERT_EQUAL_FATAL(ret, 0);
4597+
ret = tsk_tree_first(&other_t);
4598+
CU_ASSERT_EQUAL_FATAL(ret, 1);
4599+
ret = tsk_tree_kc_distance(&t, &other_t, 0, &result);
4600+
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_BAD_PARAM_VALUE);
4601+
tsk_treeseq_free(&ts);
4602+
tsk_treeseq_free(&other_ts);
4603+
tsk_tree_free(&t);
4604+
tsk_tree_free(&other_t);
4605+
}
4606+
4607+
4608+
44234609
/*=======================================================
44244610
* Miscellaneous tests.
44254611
*======================================================*/
@@ -5108,6 +5294,13 @@ main(int argc, char **argv)
51085294
{"test_nonbinary_sample_sets", test_nonbinary_sample_sets},
51095295
{"test_internal_sample_sample_sets", test_internal_sample_sample_sets},
51105296

5297+
/*KC_Distance tests */
5298+
{"test_single_tree_kc", test_single_tree_kc},
5299+
{"test_two_trees_kc", test_two_trees_kc},
5300+
{"test_empty_tree_kc", test_empty_tree_kc},
5301+
{"test_nonbinary_tree_kc", test_nonbinary_tree_kc},
5302+
{"test_unequal_samples_kc", test_unequal_samples_kc},
5303+
51115304
/* Misc */
51125305
{"test_tree_errors", test_tree_errors},
51135306
{"test_tree_copy_flags", test_tree_copy_flags},

c/tskit/stats.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,13 @@ int tsk_ld_calc_get_r2_array(tsk_ld_calc_t *self, tsk_id_t a, int direction,
4848
tsk_size_t max_sites, double max_distance,
4949
double *r2, tsk_size_t *num_r2_values);
5050

51+
typedef struct stack_elmt {
52+
tsk_id_t node;
53+
int path_depth;
54+
double time_depth;
55+
} stack_elmt;
56+
57+
5158

5259
#ifdef __cplusplus
5360
}

c/tskit/trees.c

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4426,3 +4426,138 @@ tsk_diff_iter_next(tsk_diff_iter_t *self, double *ret_left, double *ret_right,
44264426
self->tree_left = right;
44274427
return ret;
44284428
}
4429+
4430+
4431+
int
4432+
tsk_tree_kc_distance(tsk_tree_t *self, tsk_tree_t *other, double lambda, double *result)
4433+
{
4434+
tsk_size_t num_nodes_self, num_nodes_other;
4435+
int ret = 0, stack_top = 0;
4436+
int path_depth, tree_index, pair_index, i;
4437+
double vT1, vT2, distance_sum, time_depth, root_time;
4438+
int *m[2], *path_distance[2];
4439+
double *M[2], *times, *time_distance[2];
4440+
tsk_id_t N, u, v, mrca, n1, n2, num_samples;
4441+
tsk_tree_t *trees[2], *tree;
4442+
4443+
struct stack_elmt {
4444+
tsk_id_t node;
4445+
int path_depth;
4446+
double time_depth;
4447+
};
4448+
4449+
trees[0] = self;
4450+
trees[1] = other;
4451+
4452+
struct stack_elmt* stack = NULL;
4453+
memset(path_distance, 0, sizeof(path_distance));
4454+
memset(time_distance, 0, sizeof(time_distance));
4455+
memset(m, 0, sizeof(m));
4456+
memset(M, 0, sizeof(M));
4457+
4458+
if (tsk_tree_get_num_roots(self) != 1 || tsk_tree_get_num_roots(other) != 1) {
4459+
ret = TSK_ERR_BAD_PARAM_VALUE;
4460+
goto out;
4461+
}
4462+
4463+
if (self->tree_sequence->num_samples != other->tree_sequence->num_samples) {
4464+
ret = TSK_ERR_BAD_PARAM_VALUE;
4465+
goto out;
4466+
}
4467+
4468+
num_samples = (tsk_id_t) self->tree_sequence->num_samples;
4469+
N = (num_samples * (num_samples - 1)) / 2;
4470+
num_nodes_self = self->num_nodes;
4471+
num_nodes_other = other->num_nodes;
4472+
stack = malloc(TSK_MAX(num_nodes_self, num_nodes_other) * sizeof(*stack));
4473+
m[0] = calloc((size_t) (N + num_samples), sizeof(m[0]));
4474+
m[1] = calloc((size_t) (N + num_samples), sizeof(m[1]));
4475+
M[0] = malloc((size_t) (N + num_samples) * sizeof(M[0]));
4476+
M[1] = malloc((size_t) (N + num_samples) * sizeof(M[1]));
4477+
path_distance[0] = malloc(num_nodes_self * sizeof(path_distance[0]));
4478+
path_distance[1] = malloc(num_nodes_other * sizeof(path_distance[1]));
4479+
time_distance[0] = malloc(num_nodes_self * sizeof(time_distance[0]));
4480+
time_distance[1] = malloc(num_nodes_other * sizeof(time_distance[1]));
4481+
4482+
if (stack == NULL
4483+
|| m[0] == NULL
4484+
|| m[1] == NULL
4485+
|| M[0] == NULL
4486+
|| M[1] == NULL
4487+
|| path_distance[0] == NULL
4488+
|| time_distance[1] == NULL) {
4489+
ret = TSK_ERR_NO_MEMORY;
4490+
goto out;
4491+
}
4492+
4493+
for (i=0; i <= N + num_samples; i++) {
4494+
m[0][i] = 1;
4495+
m[1][i] = 1;
4496+
}
4497+
4498+
for (tree_index=0; tree_index != 2; tree_index++) {
4499+
tree = trees[tree_index];
4500+
times = tree->tree_sequence->tables->nodes.time;
4501+
stack_top = 0;
4502+
u = tree->left_root;
4503+
root_time = times[u];
4504+
stack[stack_top].node = u;
4505+
stack[stack_top].path_depth = 0;
4506+
stack[stack_top].time_depth = root_time;
4507+
while (stack_top >= 0) {
4508+
u = stack[stack_top].node;
4509+
path_depth = stack[stack_top].path_depth;
4510+
time_depth = stack[stack_top].time_depth;
4511+
stack_top--;
4512+
for (v = tree->left_child[u]; v != TSK_NULL;
4513+
v = tree->right_sib[v]) {
4514+
stack_top++;
4515+
stack[stack_top].node = v;
4516+
stack[stack_top].path_depth = path_depth + 1;
4517+
stack[stack_top].time_depth = times[v];
4518+
}
4519+
path_distance[tree_index][u] = path_depth;
4520+
time_distance[tree_index][u] = root_time - time_depth;
4521+
if (tree->left_child[u] == TSK_NULL) {
4522+
M[tree_index][u + N] = times[tree->parent[u]] - times[u];
4523+
}
4524+
}
4525+
for (n1 = 0; n1 != num_samples; n1++) {
4526+
for (n2 = n1 + 1; n2 != num_samples; n2++){
4527+
ret = tsk_tree_get_mrca(tree, n1, n2, &mrca);
4528+
if (ret != 0) {
4529+
goto out;
4530+
}
4531+
pair_index = (n1 * (n1 - 2 * num_samples + 1)) / (-2) + n2 - n1 -1;
4532+
assert (m[tree_index][pair_index] == 1);
4533+
m[tree_index][pair_index] = path_distance[tree_index][mrca];
4534+
M[tree_index][pair_index] = time_distance[tree_index][mrca];
4535+
}
4536+
}
4537+
}
4538+
4539+
vT1 = 0;
4540+
vT2 = 0;
4541+
distance_sum=0;
4542+
4543+
for (i = 0; i != N + num_samples; i++) {
4544+
vT1 = (m[0][i] * (1 - lambda)) + (lambda * M[0][i]);
4545+
vT2 = (m[1][i] * (1 - lambda)) + (lambda * M[1][i]);
4546+
distance_sum += (vT1 - vT2) * (vT1 - vT2);
4547+
}
4548+
4549+
*result = sqrt(distance_sum);
4550+
out:
4551+
tsk_safe_free(stack);
4552+
tsk_safe_free(m[0]);
4553+
tsk_safe_free(m[1]);
4554+
tsk_safe_free(M[0]);
4555+
tsk_safe_free(M[1]);
4556+
tsk_safe_free(path_distance[0]);
4557+
tsk_safe_free(path_distance[1]);
4558+
tsk_safe_free(time_distance[0]);
4559+
tsk_safe_free(time_distance[1]);
4560+
return ret;
4561+
}
4562+
4563+

c/tskit/trees.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,7 @@ int tsk_tree_map_mutations(tsk_tree_t *self, int8_t *genotypes,
384384
int8_t *ancestral_state,
385385
tsk_size_t *num_transitions, tsk_state_transition_t **transitions);
386386

387+
int tsk_tree_kc_distance(tsk_tree_t *self, tsk_tree_t *other, double lambda, double *result);
387388

388389
/****************************************************************************/
389390
/* Diff iterator */

0 commit comments

Comments
 (0)