@@ -2021,6 +2021,10 @@ def haplotypes(self, samples=None, sites=None):
2021
2021
``None``, return haplotypes for all sample nodes, otherwise this may be a
2022
2022
numpy array (or array-like) object (converted to dtype=np.int32).
2023
2023
:param array sites: A numpy array of sites to use.
2024
+
2025
+
2026
+ :return: An iterator returning successive instances of (sample_id, haplotype).
2027
+ :rtype: iter(int, numpy.ndarray(dtype=int8))
2024
2028
"""
2025
2029
if samples is None :
2026
2030
samples = np .arange (self .num_samples )
@@ -2113,6 +2117,7 @@ class Ancestor:
2113
2117
time = attr .ib ()
2114
2118
focal_sites = attr .ib ()
2115
2119
haplotype = attr .ib ()
2120
+ sample_id = attr .ib ()
2116
2121
2117
2122
2118
2123
class AncestorData (DataContainer ):
@@ -2150,7 +2155,7 @@ class AncestorData(DataContainer):
2150
2155
"""
2151
2156
2152
2157
FORMAT_NAME = "tsinfer-ancestor-data"
2153
- FORMAT_VERSION = (3 , 0 )
2158
+ FORMAT_VERSION = (3 , 1 )
2154
2159
2155
2160
def __init__ (self , sample_data , ** kwargs ):
2156
2161
super ().__init__ (** kwargs )
@@ -2209,6 +2214,13 @@ def __init__(self, sample_data, **kwargs):
2209
2214
dtype = "array:i1" ,
2210
2215
compressor = self ._compressor ,
2211
2216
)
2217
+ self .data .create_dataset (
2218
+ "ancestors/sample_id" ,
2219
+ shape = (0 ,),
2220
+ chunks = chunks ,
2221
+ compressor = self ._compressor ,
2222
+ dtype = np .int32 ,
2223
+ )
2212
2224
2213
2225
self ._alloc_ancestor_writer ()
2214
2226
@@ -2224,6 +2236,7 @@ def _alloc_ancestor_writer(self):
2224
2236
"time" : self .ancestors_time ,
2225
2237
"focal_sites" : self .ancestors_focal_sites ,
2226
2238
"haplotype" : self .ancestors_haplotype ,
2239
+ "sample_id" : self .ancestors_sample_id ,
2227
2240
},
2228
2241
num_threads = self ._num_flush_threads ,
2229
2242
)
@@ -2245,6 +2258,7 @@ def __str__(self):
2245
2258
("ancestors/time" , zarr_summary (self .ancestors_time )),
2246
2259
("ancestors/focal_sites" , zarr_summary (self .ancestors_focal_sites )),
2247
2260
("ancestors/haplotype" , zarr_summary (self .ancestors_haplotype )),
2261
+ ("ancestors/sample_id" , zarr_summary (self .ancestors_sample_id )),
2248
2262
]
2249
2263
return super ().__str__ () + self ._format_str (values )
2250
2264
@@ -2269,6 +2283,9 @@ def data_equal(self, other):
2269
2283
self .ancestors_focal_sites [:], other .ancestors_focal_sites [:]
2270
2284
)
2271
2285
and np_obj_equal (self .ancestors_haplotype [:], other .ancestors_haplotype [:])
2286
+ and np .array_equal (
2287
+ self .ancestors_sample_id [:], other .ancestors_sample_id [:]
2288
+ )
2272
2289
)
2273
2290
2274
2291
@property
@@ -2311,6 +2328,10 @@ def ancestors_focal_sites(self):
2311
2328
def ancestors_haplotype (self ):
2312
2329
return self .data ["ancestors/haplotype" ]
2313
2330
2331
+ @property
2332
+ def ancestors_sample_id (self ):
2333
+ return self .data ["ancestors/sample_id" ]
2334
+
2314
2335
@property
2315
2336
def ancestors_length (self ):
2316
2337
"""
@@ -2329,6 +2350,7 @@ def insert_proxy_samples(
2329
2350
* ,
2330
2351
sample_ids = None ,
2331
2352
epsilon = None ,
2353
+ map_ancestors = False ,
2332
2354
allow_mutation = False ,
2333
2355
require_same_sample_data = True ,
2334
2356
** kwargs ,
@@ -2341,7 +2363,8 @@ def insert_proxy_samples(
2341
2363
2342
2364
A *proxy sample ancestor* is an ancestor based upon a known sample. At
2343
2365
sites used in the full inference process, the haplotype of this ancestor
2344
- is identical to that of the sample on which it is based. The time of the
2366
+ is identical to that of the sample on which it is based.
2367
+ The time of the
2345
2368
ancestor is taken to be a fraction ``epsilon`` older than the sample on
2346
2369
which it is based.
2347
2370
@@ -2355,11 +2378,11 @@ def insert_proxy_samples(
2355
2378
2356
2379
.. note::
2357
2380
2358
- The proxy sample ancestors inserted here will correspond to extra nodes
2359
- in the inferred tree sequence. At sites which are not used in the full
2381
+ The proxy sample ancestors inserted here will end up as extra nodes
2382
+ in the inferred tree sequence, but at sites which are not used in the full
2360
2383
inference process (e.g. sites unique to a single historical sample),
2361
- these proxy sample ancestor nodes may have a different genotype from
2362
- their corresponding sample.
2384
+ it is possible for these proxy sample ancestor nodes to have a different
2385
+ genotype from their corresponding sample.
2363
2386
2364
2387
:param SampleData sample_data: The :class:`.SampleData` instance
2365
2388
from which to select the samples used to create extra ancestors.
@@ -2394,7 +2417,8 @@ def insert_proxy_samples(
2394
2417
to ensure that the encoding of alleles in ``sample_data`` matches the
2395
2418
encoding in the current :class:`AncestorData` instance (i.e. that in the
2396
2419
original :class:`.SampleData` instance on which the current ancestors
2397
- are based).
2420
+ are based). Note that in this case, the sample_id is not recorded in the
2421
+ returned object.
2398
2422
:param \\ **kwargs: Further arguments passed to the constructor when creating
2399
2423
the new :class:`AncestorData` instance which will be returned.
2400
2424
@@ -2492,7 +2516,11 @@ def insert_proxy_samples(
2492
2516
time = proxy_time ,
2493
2517
focal_sites = [],
2494
2518
haplotype = haplotype ,
2519
+ sample_id = sample_id
2520
+ if sample_data .uuid == self .sample_data_uuid
2521
+ else tskit .NULL ,
2495
2522
)
2523
+
2496
2524
# Add any ancestors remaining in the current instance
2497
2525
while ancestor is not None :
2498
2526
other .add_ancestor (** attr .asdict (ancestor , filter = exclude_id ))
@@ -2574,7 +2602,6 @@ def truncate_ancestors(
2574
2602
start = self .ancestors_start [:]
2575
2603
end = self .ancestors_end [:]
2576
2604
time = self .ancestors_time [:]
2577
- focal_sites = self .ancestors_focal_sites [:]
2578
2605
haplotypes = self .ancestors_haplotype [:]
2579
2606
if upper_time_bound > np .max (time ) or lower_time_bound > np .max (time ):
2580
2607
raise ValueError ("Time bounds cannot be greater than older ancestor" )
@@ -2612,16 +2639,12 @@ def truncate_ancestors(
2612
2639
)
2613
2640
start [anc .id ] = insert_pos_start
2614
2641
end [anc .id ] = insert_pos_end
2615
- time [anc .id ] = anc .time
2616
- focal_sites [anc .id ] = anc .focal_sites
2617
2642
haplotypes [anc .id ] = anc .haplotype [
2618
2643
insert_pos_start - anc .start : insert_pos_end - anc .start
2619
2644
]
2620
2645
# TODO - record truncation in ancestors' metadata when supported
2621
2646
truncated .ancestors_start [:] = start
2622
2647
truncated .ancestors_end [:] = end
2623
- truncated .ancestors_time [:] = time
2624
- truncated .ancestors_focal_sites [:] = focal_sites
2625
2648
truncated .ancestors_haplotype [:] = haplotypes
2626
2649
truncated .record_provenance (command = "truncate_ancestors" )
2627
2650
truncated .finalise ()
@@ -2642,6 +2665,12 @@ def set_inference_sites(self, site_ids):
2642
2665
sites in the sample data file, and the IDs must be in increasing order.
2643
2666
2644
2667
This must be called before the first call to :meth:`.add_ancestor`.
2668
+
2669
+ .. note::
2670
+ To obtain a list of which sites in a sample data or a tree sequence have
2671
+ been placed into the ancestors file for use in inference, you can apply
2672
+ :func:`numpy.isin` to the list of positions, e.g.
2673
+ ``np.isin(sample_data.sites_position[:], ancestors.sites_position[:])``
2645
2674
"""
2646
2675
self ._check_build_mode ()
2647
2676
position = self .sample_data .sites_position [:][site_ids ]
@@ -2650,12 +2679,18 @@ def set_inference_sites(self, site_ids):
2650
2679
array [:] = position
2651
2680
self ._num_alleles = self .sample_data .num_alleles (site_ids )
2652
2681
2653
- def add_ancestor (self , start , end , time , focal_sites , haplotype ):
2682
+ def add_ancestor (
2683
+ self , start , end , time , focal_sites , haplotype , sample_id = tskit .NULL
2684
+ ):
2654
2685
"""
2655
2686
Adds an ancestor with the specified haplotype, with ancestral material over the
2656
2687
interval [start:end], that is associated with the specified timepoint and has new
2657
- mutations at the specified list of focal sites. Ancestors should be added in time
2658
- order, with the oldest first. The id of the added ancestor is returned.
2688
+ mutations at the specified list of focal sites. If this ancestor is based on a
2689
+ specific sample from the associated sample_data file (i.e. a historical sample)
2690
+ then the ``sample_id`` in the sample data file can also be passed as a parameter.
2691
+
2692
+ Ancestors should be added in time order, with the oldest first. The id of
2693
+ the added ancestor is returned.
2659
2694
"""
2660
2695
self ._check_build_mode ()
2661
2696
haplotype = tskit .util .safe_np_int_cast (haplotype , dtype = np .int8 , copy = True )
@@ -2685,6 +2720,7 @@ def add_ancestor(self, start, end, time, focal_sites, haplotype):
2685
2720
time = time ,
2686
2721
focal_sites = focal_sites ,
2687
2722
haplotype = haplotype ,
2723
+ sample_id = sample_id ,
2688
2724
)
2689
2725
2690
2726
def finalise (self ):
@@ -2706,6 +2742,7 @@ def ancestors(self):
2706
2742
end = self .ancestors_end [:]
2707
2743
time = self .ancestors_time [:]
2708
2744
focal_sites = self .ancestors_focal_sites [:]
2745
+ sample_id = self .ancestors_sample_id [:]
2709
2746
for j , h in enumerate (chunk_iterator (self .ancestors_haplotype )):
2710
2747
yield Ancestor (
2711
2748
id = j ,
@@ -2714,6 +2751,7 @@ def ancestors(self):
2714
2751
time = time [j ],
2715
2752
focal_sites = focal_sites [j ],
2716
2753
haplotype = h ,
2754
+ sample_id = sample_id [j ],
2717
2755
)
2718
2756
2719
2757
0 commit comments