Skip to content

Commit 2647e3e

Browse files
committed
Add a sample_id column to ancestors
Starts to address #604. Also changes the sampledata format
1 parent 64ee760 commit 2647e3e

File tree

3 files changed

+89
-16
lines changed

3 files changed

+89
-16
lines changed

docs/conf.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -193,6 +193,7 @@
193193
# Example configuration for intersphinx: refer to the Python standard library.
194194
intersphinx_mapping = {
195195
"https://docs.python.org/3/": None,
196+
"https://numpy.org/doc/stable/": None,
196197
"https://tskit.dev/tskit/docs/stable": None,
197198
"https://tskit.dev/msprime/docs/stable": None,
198199
"https://numcodecs.readthedocs.io/en/stable/": None,

tests/test_formats.py

Lines changed: 35 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2132,13 +2132,42 @@ def test_insert_proxy_1_sample(self):
21322132
sample_data, sample_ids=[i]
21332133
)
21342134
assert ancestors.num_ancestors + 1 == ancestors_extra.num_ancestors
2135-
inserted = -1
2135+
inserted = -1 # inserted one should always be the last
21362136
self.assert_ancestor_full_span(ancestors_extra, [inserted])
21372137
assert np.array_equal(
21382138
ancestors_extra.ancestors_haplotype[inserted],
21392139
sample_data.sites_genotypes[:, i][used_sites],
21402140
)
21412141

2142+
def test_insert_proxy_sample_ids(self):
2143+
ids = [1, 4]
2144+
sd, _ = self.get_example_data(10, 10, 40)
2145+
ancestors = tsinfer.generate_ancestors(sd).insert_proxy_samples(
2146+
sd, sample_ids=ids
2147+
)
2148+
inference_sites = np.isin(sd.sites_position[:], ancestors.sites_position[:])
2149+
anc_sample_ids = ancestors.ancestors_sample_id[:]
2150+
assert np.sum(anc_sample_ids != tskit.NULL) == len(ids)
2151+
for sample_id in ids:
2152+
assert np.sum(anc_sample_ids == sample_id) == 1
2153+
anc_sample = np.where(anc_sample_ids == sample_id)[0][0]
2154+
assert ancestors.ancestors_start[anc_sample] == 0
2155+
assert ancestors.ancestors_end[anc_sample] == ancestors.num_sites
2156+
assert len(ancestors.ancestors_focal_sites[anc_sample]) == 0
2157+
2158+
haplotype = next(sd.haplotypes([sample_id], sites=inference_sites))[1]
2159+
assert np.all(ancestors.ancestors_haplotype[anc_sample] == haplotype)
2160+
2161+
def test_insert_proxy_different_sample_data(self):
2162+
ids = [1, 4]
2163+
sd, _ = self.get_example_data(10, 10, 40)
2164+
ancestors = tsinfer.generate_ancestors(sd)
2165+
sd_copy, _ = self.get_example_data(10, 10, num_ancestors=40)
2166+
ancestors_extra = ancestors.insert_proxy_samples(
2167+
sd_copy, sample_ids=ids, require_same_sample_data=False
2168+
)
2169+
assert np.all(ancestors_extra.ancestors_sample_id[:] == tskit.NULL)
2170+
21422171
def test_insert_proxy_sample_provenance(self):
21432172
sample_data, _ = self.get_example_data(10, 10, 40)
21442173
ancestors = tsinfer.generate_ancestors(sample_data)
@@ -2181,6 +2210,8 @@ def test_insert_proxy_time_historical_samples(self):
21812210
assert np.array_equal(
21822211
ancestors_extra.ancestors_time[-1], historical_sample_time + epsilon
21832212
)
2213+
assert np.sum(ancestors_extra.ancestors_sample_id[:] != tskit.NULL) == 1
2214+
assert ancestors_extra.ancestors_sample_id[-1] == 9
21842215

21852216
# Test 2 proxies, one historical, specified in different ways / orders
21862217
s_ids = np.array([9, 0])
@@ -2202,6 +2233,9 @@ def test_insert_proxy_time_historical_samples(self):
22022233
ancestors_extra.ancestors_haplotype[-1], G[:, 0][used_sites]
22032234
)
22042235
assert np.array_equal(ancestors_extra.ancestors_time[-1], epsilon)
2236+
assert np.sum(ancestors_extra.ancestors_sample_id[:] != tskit.NULL) == 2
2237+
assert ancestors_extra.ancestors_sample_id[-2] == 9
2238+
assert ancestors_extra.ancestors_sample_id[-1] == 0
22052239

22062240
def test_insert_proxy_sample_epsilon(self):
22072241
sample_data, _ = self.get_example_data(10, 10, 40)

tsinfer/formats.py

Lines changed: 53 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -2021,6 +2021,10 @@ def haplotypes(self, samples=None, sites=None):
20212021
``None``, return haplotypes for all sample nodes, otherwise this may be a
20222022
numpy array (or array-like) object (converted to dtype=np.int32).
20232023
:param array sites: A numpy array of sites to use.
2024+
2025+
2026+
:return: An iterator returning sucessive instances of (sample_id, haplotype).
2027+
:rtype: iter(int, numpy.ndarray(dtype=int8))
20242028
"""
20252029
if samples is None:
20262030
samples = np.arange(self.num_samples)
@@ -2113,6 +2117,7 @@ class Ancestor:
21132117
time = attr.ib()
21142118
focal_sites = attr.ib()
21152119
haplotype = attr.ib()
2120+
sample_id = attr.ib()
21162121

21172122

21182123
class AncestorData(DataContainer):
@@ -2150,7 +2155,7 @@ class AncestorData(DataContainer):
21502155
"""
21512156

21522157
FORMAT_NAME = "tsinfer-ancestor-data"
2153-
FORMAT_VERSION = (3, 0)
2158+
FORMAT_VERSION = (3, 1)
21542159

21552160
def __init__(self, sample_data, **kwargs):
21562161
super().__init__(**kwargs)
@@ -2209,6 +2214,13 @@ def __init__(self, sample_data, **kwargs):
22092214
dtype="array:i1",
22102215
compressor=self._compressor,
22112216
)
2217+
self.data.create_dataset(
2218+
"ancestors/sample_id",
2219+
shape=(0,),
2220+
chunks=chunks,
2221+
compressor=self._compressor,
2222+
dtype=np.int32,
2223+
)
22122224

22132225
self._alloc_ancestor_writer()
22142226

@@ -2224,6 +2236,7 @@ def _alloc_ancestor_writer(self):
22242236
"time": self.ancestors_time,
22252237
"focal_sites": self.ancestors_focal_sites,
22262238
"haplotype": self.ancestors_haplotype,
2239+
"sample_id": self.ancestors_sample_id,
22272240
},
22282241
num_threads=self._num_flush_threads,
22292242
)
@@ -2245,6 +2258,7 @@ def __str__(self):
22452258
("ancestors/time", zarr_summary(self.ancestors_time)),
22462259
("ancestors/focal_sites", zarr_summary(self.ancestors_focal_sites)),
22472260
("ancestors/haplotype", zarr_summary(self.ancestors_haplotype)),
2261+
("ancestors/sample_id", zarr_summary(self.ancestors_sample_id)),
22482262
]
22492263
return super().__str__() + self._format_str(values)
22502264

@@ -2269,6 +2283,9 @@ def data_equal(self, other):
22692283
self.ancestors_focal_sites[:], other.ancestors_focal_sites[:]
22702284
)
22712285
and np_obj_equal(self.ancestors_haplotype[:], other.ancestors_haplotype[:])
2286+
and np.array_equal(
2287+
self.ancestors_sample_id[:], other.ancestors_sample_id[:]
2288+
)
22722289
)
22732290

22742291
@property
@@ -2311,6 +2328,10 @@ def ancestors_focal_sites(self):
23112328
def ancestors_haplotype(self):
23122329
return self.data["ancestors/haplotype"]
23132330

2331+
@property
2332+
def ancestors_sample_id(self):
2333+
return self.data["ancestors/sample_id"]
2334+
23142335
@property
23152336
def ancestors_length(self):
23162337
"""
@@ -2329,6 +2350,7 @@ def insert_proxy_samples(
23292350
*,
23302351
sample_ids=None,
23312352
epsilon=None,
2353+
map_ancestors=False,
23322354
allow_mutation=False,
23332355
require_same_sample_data=True,
23342356
**kwargs,
@@ -2341,7 +2363,8 @@ def insert_proxy_samples(
23412363
23422364
A *proxy sample ancestor* is an ancestor based upon a known sample. At
23432365
sites used in the full inference process, the haplotype of this ancestor
2344-
is identical to that of the sample on which it is based. The time of the
2366+
is identical to that of the sample on which it is based, and the
2367+
The time of the
23452368
ancestor is taken to be a fraction ``epsilon`` older than the sample on
23462369
which it is based.
23472370
@@ -2355,11 +2378,11 @@ def insert_proxy_samples(
23552378
23562379
.. note::
23572380
2358-
The proxy sample ancestors inserted here will correspond to extra nodes
2359-
in the inferred tree sequence. At sites which are not used in the full
2381+
The proxy sample ancestors inserted here will end up as extra nodes
2382+
in the inferred tree sequence, but at sites which are not used in the full
23602383
inference process (e.g. sites unique to a single historical sample),
2361-
these proxy sample ancestor nodes may have a different genotype from
2362-
their corresponding sample.
2384+
it is possible for these proxy sample ancestor nodes to have a different
2385+
genotype from their corresponding sample.
23632386
23642387
:param SampleData sample_data: The :class:`.SampleData` instance
23652388
from which to select the samples used to create extra ancestors.
@@ -2394,7 +2417,8 @@ def insert_proxy_samples(
23942417
to ensure that the encoding of alleles in ``sample_data`` matches the
23952418
encoding in the current :class:`AncestorData` instance (i.e. that in the
23962419
original :class:`.SampleData` instance on which the current ancestors
2397-
are based).
2420+
are based). Note that in this case, the sample_id is not recorded in the
2421+
returned object.
23982422
:param \\**kwargs: Further arguments passed to the constructor when creating
23992423
the new :class:`AncestorData` instance which will be returned.
24002424
@@ -2492,7 +2516,11 @@ def insert_proxy_samples(
24922516
time=proxy_time,
24932517
focal_sites=[],
24942518
haplotype=haplotype,
2519+
sample_id=sample_id
2520+
if sample_data.uuid == self.sample_data_uuid
2521+
else tskit.NULL,
24952522
)
2523+
24962524
# Add any ancestors remaining in the current instance
24972525
while ancestor is not None:
24982526
other.add_ancestor(**attr.asdict(ancestor, filter=exclude_id))
@@ -2574,7 +2602,6 @@ def truncate_ancestors(
25742602
start = self.ancestors_start[:]
25752603
end = self.ancestors_end[:]
25762604
time = self.ancestors_time[:]
2577-
focal_sites = self.ancestors_focal_sites[:]
25782605
haplotypes = self.ancestors_haplotype[:]
25792606
if upper_time_bound > np.max(time) or lower_time_bound > np.max(time):
25802607
raise ValueError("Time bounds cannot be greater than older ancestor")
@@ -2612,16 +2639,12 @@ def truncate_ancestors(
26122639
)
26132640
start[anc.id] = insert_pos_start
26142641
end[anc.id] = insert_pos_end
2615-
time[anc.id] = anc.time
2616-
focal_sites[anc.id] = anc.focal_sites
26172642
haplotypes[anc.id] = anc.haplotype[
26182643
insert_pos_start - anc.start : insert_pos_end - anc.start
26192644
]
26202645
# TODO - record truncation in ancestors' metadata when supported
26212646
truncated.ancestors_start[:] = start
26222647
truncated.ancestors_end[:] = end
2623-
truncated.ancestors_time[:] = time
2624-
truncated.ancestors_focal_sites[:] = focal_sites
26252648
truncated.ancestors_haplotype[:] = haplotypes
26262649
truncated.record_provenance(command="truncate_ancestors")
26272650
truncated.finalise()
@@ -2642,6 +2665,12 @@ def set_inference_sites(self, site_ids):
26422665
sites in the sample data file, and the IDs must be in increasing order.
26432666
26442667
This must be called before the first call to :meth:`.add_ancestor`.
2668+
2669+
.. note::
2670+
To obtain a list of which sites in a sample data or a tree sequence have
2671+
been placed into the ancestors file for use in inference, you can apply
2672+
:func:`numpy.isin` to the list of positions, e.g.
2673+
``np.isin(sample_data.sites_position[:], ancestors.sites_position[:])``
26452674
"""
26462675
self._check_build_mode()
26472676
position = self.sample_data.sites_position[:][site_ids]
@@ -2650,12 +2679,18 @@ def set_inference_sites(self, site_ids):
26502679
array[:] = position
26512680
self._num_alleles = self.sample_data.num_alleles(site_ids)
26522681

2653-
def add_ancestor(self, start, end, time, focal_sites, haplotype):
2682+
def add_ancestor(
2683+
self, start, end, time, focal_sites, haplotype, sample_id=tskit.NULL
2684+
):
26542685
"""
26552686
Adds an ancestor with the specified haplotype, with ancestral material over the
26562687
interval [start:end], that is associated with the specified timepoint and has new
2657-
mutations at the specified list of focal sites. Ancestors should be added in time
2658-
order, with the oldest first. The id of the added ancestor is returned.
2688+
mutations at the specified list of focal sites. If this ancestor is based on a
2689+
specific sample from the associated sample_data file (i.e. a historical sample)
2690+
then the ``sample_id`` in the sample data file can also be passed as a parameter.
2691+
2692+
The Ancestors should be added in time order, with the oldest first. The id of
2693+
the added ancestor is returned.
26592694
"""
26602695
self._check_build_mode()
26612696
haplotype = tskit.util.safe_np_int_cast(haplotype, dtype=np.int8, copy=True)
@@ -2685,6 +2720,7 @@ def add_ancestor(self, start, end, time, focal_sites, haplotype):
26852720
time=time,
26862721
focal_sites=focal_sites,
26872722
haplotype=haplotype,
2723+
sample_id=sample_id,
26882724
)
26892725

26902726
def finalise(self):
@@ -2706,6 +2742,7 @@ def ancestors(self):
27062742
end = self.ancestors_end[:]
27072743
time = self.ancestors_time[:]
27082744
focal_sites = self.ancestors_focal_sites[:]
2745+
sample_id = self.ancestors_sample_id[:]
27092746
for j, h in enumerate(chunk_iterator(self.ancestors_haplotype)):
27102747
yield Ancestor(
27112748
id=j,
@@ -2714,6 +2751,7 @@ def ancestors(self):
27142751
time=time[j],
27152752
focal_sites=focal_sites[j],
27162753
haplotype=h,
2754+
sample_id=sample_id[j],
27172755
)
27182756

27192757

0 commit comments

Comments
 (0)