Skip to content

Commit 8a49681

Browse files
benjeffery, mergify-bot
authored and committed
Add tree sequence metadata and metadata_schema to C API
1 parent 1a1f315 commit 8a49681

File tree

11 files changed

+309
-9
lines changed

11 files changed

+309
-9
lines changed

c/CHANGELOG.rst

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,12 @@
11
---------------------
2-
[0.99.3] - 2019-XX-XX
2+
[0.99.3] - 2020-XX-XX
33
---------------------
44

55
In development.
66

77
**Breaking changes**
88

9-
- Change genotypes from unsigned to signed to accomodate missing data
9+
- Change genotypes from unsigned to signed to accommodate missing data
1010
(see :issue:`144` for discussion). This only affects users of the
1111
``tsk_vargen_t`` class. Genotypes are now stored as int8_t and int16_t
1212
types rather than the former unsigned types. The field names in the
@@ -34,6 +34,10 @@ In development.
3434

3535
**New features**
3636

37+
- Add ``metadata`` and ``metadata_schema`` fields to table collection, with accessors on
38+
tree sequence. These store arbitrary bytes and are optional in the file format.
39+
(:user:`benjeffery`, :pr:`641`)
40+
3741
- Add the ``TSK_KEEP_UNARY`` option to simplify (:user:`gtsambos`). See :issue:`1`
3842
and :pr:`143`.
3943

@@ -46,7 +50,7 @@ In development.
4650
off (:pr:`462`).
4751

4852
- Tables with metadata now have an optional ``metadata_schema`` field that can contain
49-
arbitary bytes. (:user:`benjeffery`, :pr:`493`)
53+
arbitrary bytes. (:user:`benjeffery`, :pr:`493`)
5054

5155
- Tables loaded from a file can now be edited in the same way as any other
5256
table collection (:user:`jeromekelleher`, :issue:`536`, :pr:`530`).

c/tests/test_file_format.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -613,7 +613,8 @@ test_metadata_schemas_optional(void)
613613
tsk_treeseq_t *ts = caterpillar_tree(5, 3, 3);
614614
tsk_table_collection_t t1, t2;
615615
const char *cols[] = {
616-
/* "metadata_schema", FIXME - add when table collection gets this */
616+
"metadata",
617+
"metadata_schema",
617618
"individuals/metadata_schema",
618619
"populations/metadata_schema",
619620
"nodes/metadata_schema",

c/tests/test_tables.c

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,95 @@ test_table_collection_simplify_errors(void)
6262
tsk_table_collection_free(&tables);
6363
}
6464

65+
/*
 * Checks the table collection's top-level metadata and metadata_schema:
 * both fields take part in tsk_table_collection_equals, are carried over by
 * tsk_table_collection_copy, and survive a dump/load round trip through
 * _tmp_file_name, first with the fields left unset and then with both set.
 */
static void
66+
test_table_collection_metadata(void)
67+
{
68+
int ret;
69+
tsk_table_collection_t tc1, tc2;
70+
71+
char example_metadata[100] = "An example of metadata with unicode 🎄🌳🌴🌲🎋";
72+
char example_metadata_schema[100]
73+
= "An example of metadata schema with unicode 🎄🌳🌴🌲🎋";
74+
/* Lengths are byte counts from strlen, so the terminating NUL is not stored. */
tsk_size_t example_metadata_length = (tsk_size_t) strlen(example_metadata);
75+
tsk_size_t example_metadata_schema_length
76+
= (tsk_size_t) strlen(example_metadata_schema);
77+
78+
// Test equality
79+
ret = tsk_table_collection_init(&tc1, 0);
80+
CU_ASSERT_EQUAL_FATAL(ret, 0);
81+
ret = tsk_table_collection_init(&tc2, 0);
82+
CU_ASSERT_EQUAL_FATAL(ret, 0);
83+
CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2));
84+
ret = tsk_table_collection_set_metadata(
85+
&tc1, example_metadata, example_metadata_length);
86+
CU_ASSERT_EQUAL_FATAL(ret, 0);
87+
/* Only tc1 has metadata at this point, so the collections must differ. */
CU_ASSERT_FALSE(tsk_table_collection_equals(&tc1, &tc2));
88+
ret = tsk_table_collection_set_metadata(
89+
&tc2, example_metadata, example_metadata_length);
90+
CU_ASSERT_EQUAL_FATAL(ret, 0);
91+
CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2));
92+
ret = tsk_table_collection_set_metadata_schema(
93+
&tc1, example_metadata_schema, example_metadata_schema_length);
94+
CU_ASSERT_EQUAL_FATAL(ret, 0);
95+
/* Only tc1 has a schema at this point, so the collections must differ. */
CU_ASSERT_FALSE(tsk_table_collection_equals(&tc1, &tc2));
96+
ret = tsk_table_collection_set_metadata_schema(
97+
&tc2, example_metadata_schema, example_metadata_schema_length);
98+
CU_ASSERT_EQUAL_FATAL(ret, 0);
99+
CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2));
100+
101+
// Test copy
102+
tsk_table_collection_free(&tc1);
103+
tsk_table_collection_free(&tc2);
104+
ret = tsk_table_collection_init(&tc1, 0);
105+
CU_ASSERT_EQUAL_FATAL(ret, 0);
106+
ret = tsk_table_collection_set_metadata(
107+
&tc1, example_metadata, example_metadata_length);
108+
CU_ASSERT_EQUAL_FATAL(ret, 0);
109+
ret = tsk_table_collection_copy(&tc1, &tc2, 0);
110+
CU_ASSERT_EQUAL_FATAL(ret, 0);
111+
CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2));
112+
113+
ret = tsk_table_collection_set_metadata_schema(
114+
&tc1, example_metadata_schema, example_metadata_schema_length);
115+
CU_ASSERT_EQUAL_FATAL(ret, 0);
116+
/* Release the previous copy before copying into tc2 a second time. */
tsk_table_collection_free(&tc2);
117+
ret = tsk_table_collection_copy(&tc1, &tc2, 0);
118+
CU_ASSERT_EQUAL_FATAL(ret, 0);
119+
CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2));
120+
121+
// Test dump and load with empty metadata and schema
122+
tsk_table_collection_free(&tc1);
123+
tsk_table_collection_free(&tc2);
124+
ret = tsk_table_collection_init(&tc1, 0);
125+
CU_ASSERT_EQUAL_FATAL(ret, 0);
126+
/* NOTE(review): presumably dump/load needs a valid sequence_length - confirm. */
tc1.sequence_length = 1.0;
127+
ret = tsk_table_collection_dump(&tc1, _tmp_file_name, 0);
128+
CU_ASSERT_EQUAL_FATAL(ret, 0);
129+
ret = tsk_table_collection_load(&tc2, _tmp_file_name, 0);
130+
CU_ASSERT_EQUAL_FATAL(ret, 0);
131+
CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2));
132+
133+
// Test dump and load with set metadata and schema
134+
tsk_table_collection_free(&tc1);
135+
tsk_table_collection_free(&tc2);
136+
ret = tsk_table_collection_init(&tc1, 0);
137+
CU_ASSERT_EQUAL_FATAL(ret, 0);
138+
tc1.sequence_length = 1.0;
139+
ret = tsk_table_collection_set_metadata(
140+
&tc1, example_metadata, example_metadata_length);
141+
CU_ASSERT_EQUAL_FATAL(ret, 0);
142+
ret = tsk_table_collection_set_metadata_schema(
143+
&tc1, example_metadata_schema, example_metadata_schema_length);
144+
CU_ASSERT_EQUAL_FATAL(ret, 0);
145+
ret = tsk_table_collection_dump(&tc1, _tmp_file_name, 0);
146+
CU_ASSERT_EQUAL_FATAL(ret, 0);
147+
ret = tsk_table_collection_load(&tc2, _tmp_file_name, 0);
148+
CU_ASSERT_EQUAL_FATAL(ret, 0);
149+
CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2));
150+
tsk_table_collection_free(&tc1);
151+
tsk_table_collection_free(&tc2);
152+
}
153+
65154
static void
66155
test_node_table(void)
67156
{
@@ -2830,6 +2919,7 @@ main(int argc, char **argv)
28302919
{ "test_provenance_table", test_provenance_table },
28312920
{ "test_table_collection_simplify_errors",
28322921
test_table_collection_simplify_errors },
2922+
{ "test_table_collection_metadata", test_table_collection_metadata },
28332923
{ "test_simplify_tables_drops_indexes", test_simplify_tables_drops_indexes },
28342924
{ "test_simplify_empty_tables", test_simplify_empty_tables },
28352925
{ "test_link_ancestors_no_edges", test_link_ancestors_no_edges },

c/tests/test_trees.c

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5607,6 +5607,49 @@ test_sample_counts_deprecated(void)
56075607
tsk_treeseq_free(&ts);
56085608
}
56095609

5610+
/*
 * Checks that metadata and metadata_schema set on a table collection are
 * readable through the tree sequence accessors (tsk_treeseq_get_metadata,
 * tsk_treeseq_get_metadata_schema and their *_length counterparts) after
 * the tree sequence is initialised from that collection.
 */
static void
5611+
test_tree_sequence_metadata(void)
5612+
{
5613+
int ret;
5614+
tsk_table_collection_t tc;
5615+
tsk_treeseq_t ts;
5616+
5617+
char example_metadata[100] = "An example of metadata with unicode 🎄🌳🌴🌲🎋";
5618+
char example_metadata_schema[100]
5619+
= "An example of metadata schema with unicode 🎄🌳🌴🌲🎋";
5620+
/* Lengths are byte counts from strlen, so the terminating NUL is not stored. */
tsk_size_t example_metadata_length = (tsk_size_t) strlen(example_metadata);
5621+
tsk_size_t example_metadata_schema_length
5622+
= (tsk_size_t) strlen(example_metadata_schema);
5623+
5624+
ret = tsk_table_collection_init(&tc, 0);
5625+
CU_ASSERT_EQUAL_FATAL(ret, 0);
5626+
/* NOTE(review): presumably tsk_treeseq_init needs a valid sequence_length and
 * built indexes on the (otherwise empty) collection - confirm. */
tc.sequence_length = 1.0;
5627+
ret = tsk_table_collection_build_index(&tc, 0);
5628+
CU_ASSERT_EQUAL_FATAL(ret, 0);
5629+
ret = tsk_table_collection_set_metadata(
5630+
&tc, example_metadata, example_metadata_length);
5631+
CU_ASSERT_EQUAL_FATAL(ret, 0);
5632+
ret = tsk_table_collection_set_metadata_schema(
5633+
&tc, example_metadata_schema, example_metadata_schema_length);
5634+
CU_ASSERT_EQUAL_FATAL(ret, 0);
5635+
5636+
ret = tsk_treeseq_init(&ts, &tc, 0);
5637+
CU_ASSERT_EQUAL_FATAL(ret, 0);
5638+
5639+
/* Compare lengths first, then the raw bytes with memcmp over those lengths. */
CU_ASSERT_EQUAL(tsk_treeseq_get_metadata_length(&ts), example_metadata_length);
5640+
CU_ASSERT_EQUAL(
5641+
tsk_treeseq_get_metadata_schema_length(&ts), example_metadata_schema_length);
5642+
CU_ASSERT_EQUAL(
5643+
memcmp(tsk_treeseq_get_metadata(&ts), example_metadata, example_metadata_length),
5644+
0);
5645+
CU_ASSERT_EQUAL(memcmp(tsk_treeseq_get_metadata_schema(&ts), example_metadata_schema,
5646+
example_metadata_schema_length),
5647+
0);
5648+
5649+
tsk_treeseq_free(&ts);
5650+
tsk_table_collection_free(&tc);
5651+
}
5652+
56105653
int
56115654
main(int argc, char **argv)
56125655
{
@@ -5747,6 +5790,7 @@ main(int argc, char **argv)
57475790
{ "test_empty_tree_sequence", test_empty_tree_sequence },
57485791
{ "test_zero_edges", test_zero_edges },
57495792
{ "test_sample_counts_deprecated", test_sample_counts_deprecated },
5793+
{ "test_tree_sequence_metadata", test_tree_sequence_metadata },
57505794

57515795
{ NULL, NULL },
57525796
};

c/tests/testlib.c

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -535,6 +535,8 @@ caterpillar_tree(tsk_size_t n, tsk_size_t num_sites, tsk_size_t num_mutations)
535535
const char *metadata[] = { "This", "is", "some", "metadata" };
536536
const int num_metadatas = sizeof(metadata) / sizeof(*metadata);
537537
const char *metadata_schema = "mock metadata schema";
538+
const char *ts_metadata = "This is a caterpillar tree";
539+
const char *ts_metadata_schema = "The metadata is an example";
538540
const char *prov_timestamp = "a timestamp, should be ISO8601";
539541
const char *prov_record = "Produced by caterpillar_tree for testing purposes";
540542

@@ -545,6 +547,10 @@ caterpillar_tree(tsk_size_t n, tsk_size_t num_sites, tsk_size_t num_mutations)
545547
CU_ASSERT_FATAL(num_sites > 0 && num_mutations < n - 1);
546548

547549
tables.sequence_length = 1.0;
550+
551+
tsk_table_collection_set_metadata(&tables, ts_metadata, strlen(ts_metadata));
552+
tsk_table_collection_set_metadata_schema(
553+
&tables, ts_metadata_schema, strlen(ts_metadata_schema));
548554
tsk_population_table_set_metadata_schema(
549555
&tables.populations, metadata_schema, strlen(metadata_schema));
550556
tsk_individual_table_set_metadata_schema(

c/tskit/core.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ to the API or ABI are introduced, i.e., internal refactors of bugfixes.
9696
#define TSK_FILE_FORMAT_NAME "tskit.trees"
9797
#define TSK_FILE_FORMAT_NAME_LENGTH 11
9898
#define TSK_FILE_FORMAT_VERSION_MAJOR 12
99-
#define TSK_FILE_FORMAT_VERSION_MINOR 1
99+
#define TSK_FILE_FORMAT_VERSION_MINOR 2
100100

101101
/**
102102
@defgroup GENERAL_ERROR_GROUP General errors.

c/tskit/tables.c

Lines changed: 94 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,12 @@ read_table_cols(kastore_t *store, read_table_col_t *read_cols, size_t num_cols)
9292
*read_cols[j].len_dest = (tsk_size_t) -1;
9393
}
9494
for (j = 0; j < num_cols; j++) {
95-
if (kastore_containss(store, read_cols[j].name)) {
95+
ret = kastore_containss(store, read_cols[j].name);
96+
if (ret < 0) {
97+
ret = tsk_set_kas_error(ret);
98+
goto out;
99+
}
100+
if (ret == 1) {
96101
ret = kastore_gets(
97102
store, read_cols[j].name, read_cols[j].array_dest, &len, &type);
98103
if (ret != 0) {
@@ -6681,6 +6686,12 @@ tsk_table_collection_print_state(tsk_table_collection_t *self, FILE *out)
66816686
{
66826687
fprintf(out, "Table collection state\n");
66836688
fprintf(out, "sequence_length = %f\n", self->sequence_length);
6689+
fprintf(out, "#metadata_schema#\n");
6690+
fprintf(out, "%.*s\n", self->metadata_schema_length, self->metadata_schema);
6691+
fprintf(out, "#end#metadata_schema\n");
6692+
fprintf(out, "#metadata#\n");
6693+
fprintf(out, "%.*s\n", self->metadata_length, self->metadata);
6694+
fprintf(out, "#end#metadata\n");
66846695
tsk_individual_table_print_state(&self->individuals, out);
66856696
tsk_node_table_print_state(&self->nodes, out);
66866697
tsk_edge_table_print_state(&self->edges, out);
@@ -6747,6 +6758,8 @@ tsk_table_collection_free(tsk_table_collection_t *self)
67476758
tsk_safe_free(self->indexes.edge_insertion_order);
67486759
tsk_safe_free(self->indexes.edge_removal_order);
67496760
tsk_safe_free(self->file_uuid);
6761+
tsk_safe_free(self->metadata);
6762+
tsk_safe_free(self->metadata_schema);
67506763
return 0;
67516764
}
67526765

@@ -6758,6 +6771,14 @@ bool
67586771
tsk_table_collection_equals(tsk_table_collection_t *self, tsk_table_collection_t *other)
67596772
{
67606773
bool ret = self->sequence_length == other->sequence_length
6774+
&& self->metadata_length == other->metadata_length
6775+
&& self->metadata_schema_length == other->metadata_schema_length
6776+
&& memcmp(self->metadata, other->metadata,
6777+
self->metadata_length * sizeof(char))
6778+
== 0
6779+
&& memcmp(self->metadata_schema, other->metadata_schema,
6780+
self->metadata_schema_length * sizeof(char))
6781+
== 0
67616782
&& tsk_individual_table_equals(&self->individuals, &other->individuals)
67626783
&& tsk_node_table_equals(&self->nodes, &other->nodes)
67636784
&& tsk_edge_table_equals(&self->edges, &other->edges)
@@ -6769,6 +6790,22 @@ tsk_table_collection_equals(tsk_table_collection_t *self, tsk_table_collection_t
67696790
return ret;
67706791
}
67716792

6793+
int
6794+
tsk_table_collection_set_metadata(
6795+
tsk_table_collection_t *self, const char *metadata, tsk_size_t metadata_length)
6796+
{
6797+
return replace_string(
6798+
&self->metadata, &self->metadata_length, metadata, metadata_length);
6799+
}
6800+
6801+
/*
 * Sets the table collection's top-level metadata schema.
 *
 * Hands self->metadata_schema and self->metadata_schema_length to
 * replace_string together with the caller's buffer and its length, and
 * returns replace_string's result unchanged. Callers in this change treat 0
 * as success and any other value as an error code.
 *
 * NOTE(review): replace_string is not visible here; presumably it copies
 * metadata_schema_length bytes so the caller keeps ownership of
 * `metadata_schema` - confirm.
 */
int
6802+
tsk_table_collection_set_metadata_schema(tsk_table_collection_t *self,
6803+
const char *metadata_schema, tsk_size_t metadata_schema_length)
6804+
{
6805+
return replace_string(&self->metadata_schema, &self->metadata_schema_length,
6806+
metadata_schema, metadata_schema_length);
6807+
}
6808+
67726809
static int
67736810
tsk_table_collection_set_index(tsk_table_collection_t *self,
67746811
tsk_id_t *edge_insertion_order, tsk_id_t *edge_removal_order)
@@ -6922,6 +6959,16 @@ tsk_table_collection_copy(
69226959
goto out;
69236960
}
69246961
}
6962+
ret = tsk_table_collection_set_metadata(dest, self->metadata, self->metadata_length);
6963+
if (ret != 0) {
6964+
goto out;
6965+
}
6966+
ret = tsk_table_collection_set_metadata_schema(
6967+
dest, self->metadata_schema, self->metadata_schema_length);
6968+
if (ret != 0) {
6969+
goto out;
6970+
}
6971+
69256972
out:
69266973
return ret;
69276974
}
@@ -6935,6 +6982,10 @@ tsk_table_collection_read_format_data(tsk_table_collection_t *self, kastore_t *s
69356982
int8_t *format_name, *uuid;
69366983
double *L;
69376984

6985+
char *metadata = NULL;
6986+
char *metadata_schema = NULL;
6987+
size_t metadata_length, metadata_schema_length;
6988+
69386989
ret = kastore_gets_int8(store, "format/name", &format_name, &len);
69396990
if (ret != 0) {
69406991
ret = tsk_set_kas_error(ret);
@@ -7000,6 +7051,45 @@ tsk_table_collection_read_format_data(tsk_table_collection_t *self, kastore_t *s
70007051
}
70017052
memcpy(self->file_uuid, uuid, TSK_UUID_SIZE);
70027053
self->file_uuid[TSK_UUID_SIZE] = '\0';
7054+
7055+
ret = kastore_containss(store, "metadata");
7056+
if (ret < 0) {
7057+
ret = tsk_set_kas_error(ret);
7058+
goto out;
7059+
}
7060+
if (ret == 1) {
7061+
ret = kastore_gets_int8(
7062+
store, "metadata", (int8_t **) &metadata, (size_t *) &metadata_length);
7063+
if (ret != 0) {
7064+
ret = tsk_set_kas_error(ret);
7065+
goto out;
7066+
}
7067+
ret = tsk_table_collection_set_metadata(
7068+
self, metadata, (tsk_size_t) metadata_length);
7069+
if (ret != 0) {
7070+
goto out;
7071+
}
7072+
}
7073+
7074+
ret = kastore_containss(store, "metadata_schema");
7075+
if (ret < 0) {
7076+
ret = tsk_set_kas_error(ret);
7077+
goto out;
7078+
}
7079+
if (ret == 1) {
7080+
ret = kastore_gets_int8(store, "metadata_schema", (int8_t **) &metadata_schema,
7081+
(size_t *) &metadata_schema_length);
7082+
if (ret != 0) {
7083+
ret = tsk_set_kas_error(ret);
7084+
goto out;
7085+
}
7086+
ret = tsk_table_collection_set_metadata_schema(
7087+
self, metadata_schema, (tsk_size_t) metadata_schema_length);
7088+
if (ret != 0) {
7089+
goto out;
7090+
}
7091+
}
7092+
70037093
out:
70047094
if ((ret ^ (1 << TSK_KAS_ERR_BIT)) == KAS_ERR_KEY_NOT_FOUND) {
70057095
ret = TSK_ERR_REQUIRED_COL_NOT_FOUND;
@@ -7146,6 +7236,9 @@ tsk_table_collection_write_format_data(tsk_table_collection_t *self, kastore_t *
71467236
{ "format/version", (void *) version, 2, KAS_UINT32 },
71477237
{ "sequence_length", (void *) &self->sequence_length, 1, KAS_FLOAT64 },
71487238
{ "uuid", (void *) uuid, TSK_UUID_SIZE, KAS_INT8 },
7239+
{ "metadata", (void *) self->metadata, self->metadata_length, KAS_INT8 },
7240+
{ "metadata_schema", (void *) self->metadata_schema,
7241+
self->metadata_schema_length, KAS_INT8 },
71497242
};
71507243

71517244
ret = tsk_generate_uuid(uuid, 0);

0 commit comments

Comments
 (0)