Skip to content

Commit bac1d2b

Browse files
authored
Merge pull request #599 from hyanwong/change-default-ts
Change time defaults for from_tree_sequence
2 parents fc606d7 + 4422e5e commit bac1d2b

File tree

4 files changed

+93
-27
lines changed

4 files changed

+93
-27
lines changed

CHANGELOG.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,10 @@
1616
- Oldest nodes in a standard inferred tree sequence are no longer set to frequencies ~2
1717
and ~3 (i.e. 2 or 3 times as old as all the other nodes), but are spaced above the
1818
others by the mean time between unique ancestor ages (:pr:`485`, :user:`hyanwong`)
19+
20+
- The ``tsinfer.SampleData.from_tree_sequence()`` function now defaults to setting
21+
``use_sites_time`` and ``use_individuals_time`` to ``False`` rather than ``True``
22+
(:pr:`599`, :user:`hyanwong`)
1923

2024
********************
2125
[0.2.1] - 2021-05-26

tests/test_formats.py

Lines changed: 48 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -366,7 +366,7 @@ def test_from_tree_sequence_simple(self):
366366
ts = tsutil.get_example_ts(10, 10, 1)
367367
sd1 = formats.SampleData(sequence_length=ts.sequence_length)
368368
self.verify_data_round_trip(ts, sd1)
369-
sd2 = formats.SampleData.from_tree_sequence(ts)
369+
sd2 = formats.SampleData.from_tree_sequence(ts, use_sites_time=True)
370370
assert sd1.data_equal(sd2)
371371

372372
def test_from_tree_sequence_variable_allele_number(self):
@@ -398,7 +398,7 @@ def test_from_tree_sequence_variable_allele_number(self):
398398
num_alleles = sd1.num_alleles()
399399
for var in ts.variants():
400400
assert len(var.alleles) == num_alleles[var.site.id]
401-
sd2 = formats.SampleData.from_tree_sequence(ts)
401+
sd2 = formats.SampleData.from_tree_sequence(ts, use_sites_time=True)
402402
assert sd1.data_equal(sd2)
403403

404404
def test_from_tree_sequence_with_metadata(self):
@@ -412,34 +412,63 @@ def test_from_tree_sequence_with_metadata(self):
412412
ts_no_individuals = tables.tree_sequence()
413413
sd1 = formats.SampleData(sequence_length=ts.sequence_length)
414414
self.verify_data_round_trip(ts_no_individuals, sd1)
415-
sd2 = formats.SampleData.from_tree_sequence(ts_no_individuals)
415+
sd2 = formats.SampleData.from_tree_sequence(
416+
ts_no_individuals, use_sites_time=True
417+
)
416418
assert sd1.data_equal(sd2)
417419

418420
def test_from_tree_sequence_with_metadata_and_individuals(self):
419421
ts = tsutil.get_example_individuals_ts_with_metadata(5, 3, 10)
420422
sd1 = formats.SampleData(sequence_length=ts.sequence_length)
421423
self.verify_data_round_trip(ts, sd1)
422-
sd2 = formats.SampleData.from_tree_sequence(ts)
424+
sd2 = formats.SampleData.from_tree_sequence(ts, use_sites_time=True)
423425
sd1.assert_data_equal(sd2)
424426

425-
def test_from_historical_tree_sequence(self):
427+
def test_from_historical_tree_sequence_with_times(self):
428+
n_indiv = 5
426429
ploidy = 2
427-
individual_times = np.arange(5)
430+
individual_times = np.arange(n_indiv)
428431
ts = tsutil.get_example_historical_sampled_ts(individual_times, ploidy, 10)
432+
# Test on a tree seq containing an individual with no nodes
433+
keep_samples = [u for i in ts.individuals() for u in i.nodes if i.id < n_indiv]
434+
ts = ts.simplify(samples=keep_samples, filter_individuals=False)
429435
sd1 = formats.SampleData(sequence_length=ts.sequence_length)
430436
self.verify_data_round_trip(ts, sd1)
431-
sd2 = formats.SampleData.from_tree_sequence(ts)
437+
sd2 = formats.SampleData.from_tree_sequence(
438+
ts, use_sites_time=True, use_individuals_time=True
439+
)
432440
assert sd1.data_equal(sd2)
441+
# Fails if use_individuals_time is not set
442+
sd2 = formats.SampleData.from_tree_sequence(ts, use_sites_time=True)
443+
assert not sd1.data_equal(sd2)
433444

434-
def test_from_tree_sequence_use_time(self):
445+
def test_from_tree_sequence_no_times(self):
446+
n_indiv = 5
435447
ploidy = 2
436-
individual_times = np.arange(5)
448+
individual_times = np.arange(n_indiv + 1)
437449
ts = tsutil.get_example_historical_sampled_ts(individual_times, ploidy, 10)
438-
sd1 = formats.SampleData.from_tree_sequence(ts, use_individuals_time=False)
450+
# Test on a tree seq containing an individual with no nodes
451+
keep_samples = [u for i in ts.individuals() for u in i.nodes if i.id < n_indiv]
452+
ts = ts.simplify(samples=keep_samples, filter_individuals=False)
453+
sd1 = formats.SampleData.from_tree_sequence(ts)
454+
assert sd1.num_individuals == n_indiv
439455
assert np.all(sd1.individuals_time[:] == 0)
440-
sd2 = formats.SampleData.from_tree_sequence(ts, use_sites_time=False)
441-
assert np.all(tskit.is_unknown_time(sd2.sites_time[:]))
442-
assert np.array_equal(sd2.individuals_time[:], individual_times)
456+
457+
def test_from_tree_sequence_time_incompatibilities(self):
458+
ploidy = 2
459+
individual_times = np.arange(5)
460+
ts = tsutil.get_example_historical_sampled_ts(individual_times, ploidy, 10)
461+
with pytest.raises(ValueError, match="Incompatible timescales"):
462+
_ = formats.SampleData.from_tree_sequence(ts, use_individuals_time=True)
463+
# Similar error if no individuals in the TS
464+
tables = ts.dump_tables()
465+
tables.individuals.clear()
466+
tables.nodes.individual = np.full(
467+
tables.nodes.num_rows, tskit.NULL, dtype=tables.nodes.individual.dtype
468+
)
469+
ts = tables.tree_sequence()
470+
with pytest.raises(ValueError, match="Incompatible timescales"):
471+
_ = formats.SampleData.from_tree_sequence(ts, use_individuals_time=True)
443472

444473
def test_chunk_size(self):
445474
ts = tsutil.get_example_ts(4, 2)
@@ -802,7 +831,7 @@ def test_sites(self):
802831
def test_sites_subset(self):
803832
ts = tsutil.get_example_ts(11, 15)
804833
assert ts.num_sites > 1
805-
input_file = formats.SampleData.from_tree_sequence(ts)
834+
input_file = formats.SampleData.from_tree_sequence(ts, use_sites_time=True)
806835
assert list(input_file.sites([])) == []
807836
index = np.arange(input_file.num_sites)
808837
site_list = list(input_file.sites())
@@ -1587,7 +1616,7 @@ def verify(self, sd1, sd2):
15871616
def test_merge_identical(self):
15881617
n = 10
15891618
ts = tsutil.get_example_ts(n, 10, 1)
1590-
sd1 = formats.SampleData.from_tree_sequence(ts)
1619+
sd1 = formats.SampleData.from_tree_sequence(ts, use_sites_time=True)
15911620
sd2 = sd1.merge(sd1)
15921621
assert sd2.num_sites == sd1.num_sites
15931622
assert sd2.num_samples == 2 * sd1.num_samples
@@ -1721,7 +1750,7 @@ class TestMinSiteTimes:
17211750

17221751
def test_no_historical(self):
17231752
ts = tsutil.get_example_ts(10, 10, 1)
1724-
sd1 = formats.SampleData.from_tree_sequence(ts)
1753+
sd1 = formats.SampleData.from_tree_sequence(ts, use_sites_time=True)
17251754
# No arguments and individuals_only=True should give array of zeros
17261755
bounds_individuals_only = sd1.min_site_times(individuals_only=True)
17271756
assert np.array_equal(bounds_individuals_only, np.zeros(sd1.num_sites))
@@ -1731,7 +1760,9 @@ def test_no_historical(self):
17311760
def test_simple_case(self):
17321761
individual_times = [0, 0, 0.5, 1]
17331762
ts = tsutil.get_example_historical_sampled_ts(individual_times, ploidy=1)
1734-
sd1 = formats.SampleData.from_tree_sequence(ts)
1763+
sd1 = formats.SampleData.from_tree_sequence(
1764+
ts, use_sites_time=True, use_individuals_time=True
1765+
)
17351766
time_bound_individuals_only = sd1.min_site_times(individuals_only=True)
17361767
# Because this is a haploid tree sequence we can use the
17371768
# individual and sample IDs interchangably.

tests/test_inference.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1116,7 +1116,11 @@ def test_from_historical_tree_sequence(self):
11161116
seq_len = 10
11171117
individual_times = np.arange(n_indiv)
11181118
ts = tsutil.get_example_historical_sampled_ts(individual_times, ploidy, seq_len)
1119-
ts_inferred = tsinfer.infer(tsinfer.SampleData.from_tree_sequence(ts))
1119+
ts_inferred = tsinfer.infer(
1120+
tsinfer.SampleData.from_tree_sequence(
1121+
ts, use_sites_time=True, use_individuals_time=True
1122+
)
1123+
)
11201124
assert ts.sequence_length == ts_inferred.sequence_length
11211125
assert ts.metadata_schema == ts_inferred.metadata_schema
11221126
assert ts.metadata == ts_inferred.metadata

tsinfer/formats.py

Lines changed: 36 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1419,8 +1419,8 @@ def min_site_times(self, individuals_only=False):
14191419
def from_tree_sequence(
14201420
cls,
14211421
ts,
1422-
use_sites_time=True,
1423-
use_individuals_time=True,
1422+
use_sites_time=None,
1423+
use_individuals_time=None,
14241424
**kwargs,
14251425
):
14261426
"""
@@ -1444,16 +1444,18 @@ def from_tree_sequence(
14441444
14451445
:param TreeSequence ts: The :class:`tskit.TreeSequence` from which to generate
14461446
samples.
1447-
:param bool use_sites_time: If True (default), the times of nodes in the tree
1447+
:param bool use_sites_time: If ``True``, the times of nodes in the tree
14481448
sequence are used to set a time for each site (which affects the relative
14491449
temporal order of ancestors during inference). Times for a site are only
14501450
used if there is a single mutation at that site, in which case the node
14511451
immediately below the mutation is taken as the origination time for the
1452-
variant. If False, the frequency of the variant is used as a proxy for the
1453-
relative variant time (see :meth:`.add_site`).
1454-
:param bool use_individuals_time: If True (default), set a time for individuals
1455-
that contain historical sample nodes. If False, all individuals are
1456-
set at time 0.
1452+
variant. If ``False``, the frequency of the variant is used as a proxy for
1453+
the relative variant time (see :meth:`.add_site`). Defaults to ``False``.
1454+
:param bool use_individuals_time: If ``True``, use the time of the sample nodes
1455+
in the tree sequence as the time of the individuals associated with
1456+
those nodes in the sample data file. This is likely only to be meaningful if
1457+
``use_sites_time`` is also ``True``. If ``False``, all individuals are set
1458+
to time 0. Defaults to ``False``.
14571459
:param \\**kwargs: Further arguments passed to the :class:`SampleData`
14581460
constructor.
14591461
:return: A :class:`.SampleData` object.
@@ -1468,6 +1470,11 @@ def encode_metadata(metadata, schema):
14681470
metadata = None
14691471
return metadata
14701472

1473+
if use_sites_time is None:
1474+
use_sites_time = False
1475+
if use_individuals_time is None:
1476+
use_individuals_time = False
1477+
14711478
tables = ts.tables
14721479
self = cls(sequence_length=ts.sequence_length, **kwargs)
14731480

@@ -1488,6 +1495,7 @@ def encode_metadata(metadata, schema):
14881495
for individual in ts.individuals():
14891496
nodes = individual.nodes
14901497
if len(nodes) > 0:
1498+
time = 0
14911499
first_node = ts.node(nodes[0])
14921500
for u in nodes[1:]:
14931501
if ts.node(u).time != first_node.time:
@@ -1502,18 +1510,37 @@ def encode_metadata(metadata, schema):
15021510
"population".format(individual.id)
15031511
)
15041512
metadata = encode_metadata(individual.metadata, schema)
1513+
if use_individuals_time:
1514+
time = first_node.time
1515+
if time != 0 and not use_sites_time:
1516+
raise ValueError(
1517+
"Incompatible timescales: site frequencies used for times "
1518+
f"(use_sites_time=False), but node {first_node.id} in "
1519+
f"individual {individual.id} has a nonzero time and "
1520+
"use_individuals_time=True. Please set site times manually."
1521+
)
15051522
self.add_individual(
15061523
location=individual.location,
15071524
metadata=metadata,
15081525
population=first_node.population,
15091526
flags=individual.flags,
1510-
time=first_node.time if use_individuals_time else 0,
1527+
time=time,
15111528
ploidy=len(nodes),
15121529
)
15131530
for u in ts.samples():
15141531
node = ts.node(u)
15151532
if node.individual == tskit.NULL:
15161533
# The sample node has no individual: create a haploid individual for it
1534+
time = 0
1535+
if use_individuals_time:
1536+
time = node.time
1537+
if time != 0 and not use_sites_time:
1538+
raise ValueError(
1539+
"Incompatible timescales: site frequencies used for times "
1540+
f"(use_sites_time=False), but node {node.id} "
1541+
"has a nonzero time and use_individuals_time=True. "
1542+
"Please set site times manually."
1543+
)
15171544
self.add_individual(
15181545
population=node.population,
15191546
time=node.time if use_individuals_time else 0,

0 commit comments

Comments
 (0)