Skip to content

Commit 7bd4356

Browse files
Add reference as a sample, and include reference sequence
Closes #152 Closes #22
1 parent 8739cad commit 7bd4356

File tree

3 files changed

+27
-4
lines changed

3 files changed

+27
-4
lines changed

sc2ts/core.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -82,8 +82,8 @@ def get_problematic_sites():
8282
def get_reference_sequence():
8383
global __cached_reference
8484
if __cached_reference is None:
85-
reader = FastaReader(data_path / "reference.fasta")
86-
__cached_reference = reader[REFERENCE_GENBANK]
85+
reader = pyfaidx.Fasta(str(data_path / "reference.fasta"))
86+
__cached_reference = "X" + str(reader[REFERENCE_GENBANK])
8787
return __cached_reference
8888

8989

sc2ts/inference.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import datetime
55
import dataclasses
66
import collections
7+
import json
78
import pickle
89
import os
910
import sqlite3
@@ -220,6 +221,12 @@ def initial_ts():
220221
tables = tskit.TableCollection(L)
221222
tables.time_units = core.TIME_UNITS
222223
base_schema = tskit.MetadataSchema.permissive_json().schema
224+
tables.reference_sequence.metadata_schema = tskit.MetadataSchema(base_schema)
225+
tables.reference_sequence.metadata = {
226+
"genbank_id": core.REFERENCE_GENBANK,
227+
"notes": "X prepended to alignment to map from 1-based to 0-based coordinates"
228+
}
229+
tables.reference_sequence.data = reference
223230

224231
tables.metadata_schema = tskit.MetadataSchema(base_schema)
225232

@@ -235,10 +242,10 @@ def initial_ts():
235242
tables.sites.add_row(pos, reference[pos], metadata={"masked_samples": 0})
236243
# TODO should probably make the ultimate ancestor time something less
237244
# plausible or at least configurable. However, this will be removed
238-
# in later versions when we remove the dependence on tskit.
245+
# in later versions when we remove the dependence on tsinfer.
239246
tables.nodes.add_row(time=1, metadata={"strain": "Vestigial_ignore"})
240247
tables.nodes.add_row(
241-
time=0, metadata={"strain": core.REFERENCE_STRAIN, "date": core.REFERENCE_DATE}
248+
flags=tskit.NODE_IS_SAMPLE, time=0, metadata={"strain": core.REFERENCE_STRAIN, "date": core.REFERENCE_DATE}
242249
)
243250
tables.edges.add_row(0, L, 0, 1)
244251
return tables.tree_sequence()

tests/test_inference.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,22 @@
88
import util
99

1010

11+
class TestInitialTs:
12+
def test_reference_sequence(self):
13+
ts = sc2ts.initial_ts()
14+
assert ts.reference_sequence.metadata["genbank_id"] == "MN908947"
15+
assert ts.reference_sequence.data == sc2ts.core.get_reference_sequence()
16+
17+
def test_reference_sample(self):
18+
ts = sc2ts.initial_ts()
19+
assert ts.num_samples == 1
20+
node = ts.node(ts.samples()[0])
21+
assert node.time == 0
22+
assert node.metadata == {"date": "2019-12-26", "strain": "Wuhan/Hu-1/2019"}
23+
alignment = next(ts.alignments())
24+
assert alignment == sc2ts.core.get_reference_sequence()
25+
26+
1127
class TestAddMatchingResults:
1228
def add_matching_results(
1329
self,

0 commit comments

Comments
 (0)