Skip to content

Commit 0f88d11

Browse files
authored
Merge pull request #275 from hyanwong/provenance-saving
Provenance saving
2 parents b199fe4 + b7ec883 commit 0f88d11

File tree

7 files changed

+220
-55
lines changed

7 files changed

+220
-55
lines changed

CHANGELOG.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,14 @@
3333
probabilities (scaled so that they sum to one) rather than standardized
3434
"probabilities" whose maximum value is one.
3535

36+
- The population size is saved in provenance metadata when possible (e.g. if it is
37+
a single number)
38+
39+
- ``preprocess_ts`` always records provenance as being from the ``preprocess_ts``
40+
command, even if no gaps are removed. The command now has a (rarely used)
41+
`delete_intervals` parameter, which is normally filled out and saved in provenance
42+
(as it was before). If no gap deletion is done, the param is saved as ``[]``
43+
3644
--------------------
3745
[0.1.5] - 2022-06-07
3846
--------------------

tests/test_functions.py

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1781,11 +1781,28 @@ def test_no_intervals(self):
17811781
assert ts.tables.edges == self.verify(ts, remove_telomeres=False).tables.edges
17821782
assert ts.tables.edges == self.verify(ts, minimum_gap=0.05).tables.edges
17831783

1784+
def test_passed_intervals(self):
1785+
# Mostly we would not pass in user-defined intervals: this is mainly for testing
1786+
ts = utility_functions.single_tree_ts_n3() # No sites!
1787+
ts = tsdate.preprocess_ts(
1788+
ts, delete_intervals=[(0, 0.1), (0.5, ts.sequence_length)]
1789+
)
1790+
assert ts.num_edges > 1
1791+
assert np.allclose(ts.edges_left, 0.1)
1792+
assert np.allclose(ts.edges_right, 0.5)
1793+
1794+
def test_bad_delete_intervals(self):
1795+
ts = utility_functions.two_tree_mutation_ts()
1796+
with pytest.raises(ValueError, match="specify both"):
1797+
tsdate.preprocess_ts(ts, delete_intervals=[(0, 0.1)], minimum_gap=0.05)
1798+
with pytest.raises(ValueError, match="specify both"):
1799+
tsdate.preprocess_ts(ts, delete_intervals=[(0, 0.1)], remove_telomeres=True)
1800+
17841801
def test_delete_interval(self):
17851802
ts = utility_functions.ts_w_data_desert(40, 60, 100)
17861803
trimmed = self.verify(ts, minimum_gap=20, remove_telomeres=False)
1787-
lefts = trimmed.tables.edges.left
1788-
rights = trimmed.tables.edges.right
1804+
lefts = trimmed.edges_left
1805+
rights = trimmed.edges_right
17891806
assert not np.any(np.logical_and(lefts > 41, lefts < 59))
17901807
assert not np.any(np.logical_and(rights > 41, rights < 59))
17911808

tests/test_provenance.py

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
# MIT License
2+
#
3+
# Copyright (c) 2021-23 Tskit Developers
4+
#
5+
# Permission is hereby granted, free of charge, to any person obtaining a copy
6+
# of this software and associated documentation files (the "Software"), to deal
7+
# in the Software without restriction, including without limitation the rights
8+
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
# copies of the Software, and to permit persons to whom the Software is
10+
# furnished to do so, subject to the following conditions:
11+
#
12+
# The above copyright notice and this permission notice shall be included in
13+
# all
14+
# copies or substantial portions of the Software.
15+
#
16+
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17+
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18+
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19+
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20+
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21+
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22+
# SOFTWARE.
23+
"""
24+
Test cases for saving provenances in tsdate.
25+
"""
26+
import json
27+
28+
import numpy as np
29+
import pytest
30+
import utility_functions
31+
32+
import tsdate
33+
from tsdate import provenance
34+
35+
36+
class TestProvenance:
37+
def test_bad_get_dict(self):
38+
with pytest.raises(ValueError, match="cannot be None"):
39+
provenance.get_provenance_dict(None)
40+
41+
def test_date_cmd_recorded(self):
42+
ts = utility_functions.single_tree_ts_n2()
43+
dated_ts = tsdate.date(ts, population_size=1, mutation_rate=None)
44+
rec = json.loads(dated_ts.provenance(-1).record)
45+
assert rec["software"]["name"] == "tsdate"
46+
assert rec["parameters"]["command"] == "date"
47+
48+
def test_date_params_recorded(self):
49+
ts = utility_functions.single_tree_ts_n2()
50+
mu = 0.123
51+
Ne = 9
52+
dated_ts = tsdate.date(ts, population_size=Ne, mutation_rate=mu)
53+
rec = json.loads(dated_ts.provenance(-1).record)
54+
assert np.isclose(rec["parameters"]["mutation_rate"], mu)
55+
assert np.isclose(rec["parameters"]["population_size"], Ne)
56+
57+
@pytest.mark.skip(
58+
reason="Not implemented yet: https://github.com/tskit-dev/tsdate/issues/274"
59+
)
60+
def test_date_popsizehist_recorded(self):
61+
ts = utility_functions.single_tree_ts_n2()
62+
mu = 0.123
63+
popsize = tsdate.demography.PopulationSizeHistory(
64+
np.array([1, 1]), np.array([1])
65+
)
66+
dated_ts = tsdate.date(ts, population_size=popsize, mutation_rate=mu)
67+
rec = json.loads(dated_ts.provenance(-1).record)
68+
assert np.isclose(rec["parameters"]["mutation_rate"], mu)
69+
assert "population_size" in rec["parameters"]
70+
# TODO: check that the population size history is recorded correctly
71+
# https://github.com/tskit-dev/tsdate/issues/274
72+
assert np.isclose(
73+
# This is wrong - left in for now to show the sort of thing we want
74+
(np.array(rec["parameters"]["population_size"])),
75+
popsize,
76+
)
77+
78+
def test_preprocess_cmd_recorded(self):
79+
ts = utility_functions.ts_w_data_desert(40, 60, 100)
80+
preprocessed_ts = tsdate.preprocess_ts(ts)
81+
rec = json.loads(preprocessed_ts.provenance(-1).record)
82+
assert rec["software"]["name"] == "tsdate"
83+
assert rec["parameters"]["command"] == "preprocess_ts"
84+
85+
def test_preprocess_defaults_recorded(self):
86+
ts = utility_functions.ts_w_data_desert(40, 60, 100)
87+
preprocessed_ts = tsdate.preprocess_ts(ts)
88+
rec = json.loads(preprocessed_ts.provenance(-1).record)
89+
assert rec["parameters"]["remove_telomeres"]
90+
assert rec["parameters"]["minimum_gap"] == 1000000
91+
assert rec["parameters"]["delete_intervals"] == []
92+
93+
def test_preprocess_interval_recorded(self):
94+
ts = utility_functions.ts_w_data_desert(40, 60, 100)
95+
preprocessed_ts = tsdate.preprocess_ts(
96+
ts, minimum_gap=20, remove_telomeres=False
97+
)
98+
rec = json.loads(preprocessed_ts.provenance(-1).record)
99+
print(rec)
100+
assert rec["parameters"]["minimum_gap"] == 20
101+
assert rec["parameters"]["remove_telomeres"] is not None
102+
assert not rec["parameters"]["remove_telomeres"]
103+
deleted_intervals = rec["parameters"]["delete_intervals"]
104+
assert len(deleted_intervals) == 1
105+
assert deleted_intervals[0][0] < deleted_intervals[0][1]
106+
assert 40 < deleted_intervals[0][0] < 60
107+
assert 40 < deleted_intervals[0][1] < 60

tsdate/core.py

Lines changed: 11 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1215,11 +1215,7 @@ def date(
12151215
Ignoring outside root provides greater stability when dating tree sequences
12161216
inferred from real data. Default: False
12171217
:param bool progress: Whether to display a progress bar. Default: False
1218-
:param float Ne: the estimated (diploid) effective population size used to
1219-
construct the (default) conditional coalescent prior. This is used when
1220-
``priors`` is ``None``. Conversely, if ``priors`` is not ``None``, no
1221-
``population_size`` value should be given. (Deprecated, use the
1222-
``population_size`` argument instead).
1218+
:param float Ne: Deprecated, use the``population_size`` argument instead.
12231219
:return: A copy of the input tree sequence but with altered node times, or (if
12241220
``return_posteriors`` is True) a tuple of that tree sequence plus a dictionary
12251221
of posterior probabilities from the "inside_outside" estimation ``method``.
@@ -1262,13 +1258,19 @@ def date(
12621258
# Remove any times associated with mutations
12631259
tables.mutations.time = np.full(tree_sequence.num_mutations, tskit.UNKNOWN_TIME)
12641260
tables.sort()
1265-
# TODO: record population_size provenance, or record that it is omitted
1266-
provenance.record_provenance(
1267-
tables,
1268-
"date",
1261+
params = dict(
12691262
mutation_rate=mutation_rate,
12701263
recombination_rate=recombination_rate,
12711264
progress=progress,
1265+
)
1266+
if isinstance(population_size, (int, float)):
1267+
params["population_size"] = population_size
1268+
else:
1269+
params["population_size"] = "TODO: PopulationSizeHistory object"
1270+
provenance.record_provenance(
1271+
tables,
1272+
"date",
1273+
**params,
12721274
**kwargs,
12731275
)
12741276
if return_posteriors:

tsdate/demography.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,3 +245,7 @@ def to_gamma(self, shape=1, rate=1):
245245
# """
246246
# Create a `PopulationSizeHistory` instance from a `demes` format YAML
247247
# """
248+
249+
# TODO:
250+
# output a PopulationSizeHistory as a json object, so we can store it
251+
# in the provenance. See https://github.com/tskit-dev/tsdate/issues/274

tsdate/provenance.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,13 +63,13 @@ def get_environment():
6363
return env
6464

6565

66-
def get_provenance_dict(command=None, **kwargs):
66+
def get_provenance_dict(command, **kwargs):
6767
"""
6868
Returns a dictionary encoding an execution of tsdate conforming to the
6969
tskit provenance schema.
7070
"""
7171
if command is None:
72-
raise ValueError("Command must be provided")
72+
raise ValueError("`command` cannot be None")
7373
parameters = dict(kwargs)
7474
parameters["command"] = command
7575
document = {

tsdate/util.py

Lines changed: 69 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,12 @@ def reduce_to_contemporaneous(ts):
6464
def preprocess_ts(
6565
tree_sequence,
6666
*,
67-
minimum_gap=1000000,
68-
remove_telomeres=True,
67+
minimum_gap=None,
68+
remove_telomeres=None,
6969
filter_populations=False,
7070
filter_individuals=False,
7171
filter_sites=False,
72+
delete_intervals=None,
7273
**kwargs,
7374
):
7475
"""
@@ -81,9 +82,10 @@ def preprocess_ts(
8182
:param TreeSequence tree_sequence: The input :class`tskit.TreeSequence`
8283
to be preprocessed.
8384
:param float minimum_gap: The minimum gap between sites to remove from the tree
84-
sequence. Default: "1000000"
85+
sequence. Default: ``None`` treated as ``1000000``
8586
:param bool remove_telomeres: Should all material before the first site and after the
86-
last site be removed, regardless of the length. Default: "True"
87+
last site be removed, regardless of the length. Default: ``None`` treated as
88+
``True``
8789
:param bool filter_populations: parameter passed to the ``tskit.simplify``
8890
command. Unlike calling that command directly, this defaults to ``False``, such
8991
that all populations in the tree sequence are kept.
@@ -93,71 +95,96 @@ def preprocess_ts(
9395
:param bool filter_sites: parameter passed to the ``tskit.simplify``
9496
command. Unlike calling that command directly, this defaults to ``False``, such
9597
that all sites in the tree sequence are kept
98+
:param array_like delete_intervals: A list (start, end) pairs describing the
99+
genomic intervals (gaps) to delete. This is usually left as ``None``
100+
(the default) in which case ``minimum_gap`` and ``remove_telomeres`` are used
101+
to determine the gaps to remove, and the calculated intervals are recorded in
102+
the provenance of the resulting tree sequence.
96103
:param \\**kwargs: All further keyword arguments are passed to the ``tskit.simplify``
97104
command.
98105
99106
:return: A tree sequence with gaps removed.
100107
:rtype: tskit.TreeSequence
101108
"""
109+
102110
logger.info("Beginning preprocessing")
103111
logger.info(f"Minimum_gap: {minimum_gap} and remove_telomeres: {remove_telomeres}")
104-
if tree_sequence.num_sites < 1:
105-
raise ValueError("Invalid tree sequence: no sites present")
112+
if delete_intervals is not None and (
113+
minimum_gap is not None or remove_telomeres is not None
114+
):
115+
raise ValueError(
116+
"Cannot specify both delete_intervals and minimum_gap/remove_telomeres"
117+
)
106118

107119
tables = tree_sequence.dump_tables()
108120
sites = tables.sites.position[:]
109-
delete_intervals = []
110-
if remove_telomeres:
111-
first_site = sites[0] - 1
112-
if first_site > 0:
113-
delete_intervals.append([0, first_site])
114-
logger.info(
115-
"REMOVING TELOMERE: Snip topology "
116-
"from 0 to first site at {}.".format(first_site)
117-
)
118-
last_site = sites[-1] + 1
119-
sequence_length = tables.sequence_length
120-
if last_site < sequence_length:
121-
delete_intervals.append([last_site, sequence_length])
122-
logger.info(
123-
"REMOVING TELOMERE: Snip topology "
124-
"from {} to end of sequence at {}.".format(last_site, sequence_length)
125-
)
126-
gaps = sites[1:] - sites[:-1]
127-
threshold_gaps = np.where(gaps >= minimum_gap)[0]
128-
for gap in threshold_gaps:
129-
gap_start = sites[gap] + 1
130-
gap_end = sites[gap + 1] - 1
131-
if gap_end > gap_start:
132-
logger.info(
133-
"Gap Size is {}. Snip topology "
134-
"from {} to {}.".format(gap_end - gap_start, gap_start, gap_end)
135-
)
136-
delete_intervals.append([gap_start, gap_end])
137-
delete_intervals = sorted(delete_intervals, key=lambda x: x[0])
121+
if delete_intervals is None:
122+
if minimum_gap is None:
123+
minimum_gap = 1000000
124+
if remove_telomeres is None:
125+
remove_telomeres = True
126+
127+
if tree_sequence.num_sites < 1:
128+
raise ValueError("Invalid tree sequence: no sites present")
129+
delete_intervals = []
130+
if remove_telomeres:
131+
first_site = sites[0] - 1
132+
if first_site > 0:
133+
delete_intervals.append([0, first_site])
134+
logger.info(
135+
"REMOVING TELOMERE: Snip topology "
136+
"from 0 to first site at {}.".format(first_site)
137+
)
138+
last_site = sites[-1] + 1
139+
sequence_length = tables.sequence_length
140+
if last_site < sequence_length:
141+
delete_intervals.append([last_site, sequence_length])
142+
logger.info(
143+
"REMOVING TELOMERE: Snip topology "
144+
"from {} to end of sequence at {}.".format(
145+
last_site, sequence_length
146+
)
147+
)
148+
gaps = sites[1:] - sites[:-1]
149+
threshold_gaps = np.where(gaps >= minimum_gap)[0]
150+
for gap in threshold_gaps:
151+
gap_start = sites[gap] + 1
152+
gap_end = sites[gap + 1] - 1
153+
if gap_end > gap_start:
154+
logger.info(
155+
"Gap Size is {}. Snip topology "
156+
"from {} to {}.".format(gap_end - gap_start, gap_start, gap_end)
157+
)
158+
delete_intervals.append([gap_start, gap_end])
159+
delete_intervals = sorted(delete_intervals, key=lambda x: x[0])
138160
if len(delete_intervals) > 0:
139161
tables.delete_intervals(delete_intervals, simplify=False)
140162
tables.simplify(
141163
filter_populations=filter_populations,
142164
filter_individuals=filter_individuals,
143165
filter_sites=filter_sites,
166+
record_provenance=False,
144167
**kwargs,
145168
)
146-
provenance.record_provenance(
147-
tables,
148-
"preprocess_ts",
149-
minimum_gap=minimum_gap,
150-
remove_telomeres=remove_telomeres,
151-
delete_intervals=delete_intervals,
152-
)
153169
else:
154170
logger.info("No gaps to remove")
155171
tables.simplify(
156172
filter_populations=filter_populations,
157173
filter_individuals=filter_individuals,
158174
filter_sites=filter_sites,
175+
record_provenance=False,
159176
**kwargs,
160177
)
178+
provenance.record_provenance(
179+
tables,
180+
"preprocess_ts",
181+
minimum_gap=minimum_gap,
182+
remove_telomeres=remove_telomeres,
183+
filter_populations=filter_populations,
184+
filter_individuals=filter_individuals,
185+
filter_sites=filter_sites,
186+
delete_intervals=delete_intervals,
187+
)
161188
return tables.tree_sequence()
162189

163190

0 commit comments

Comments
 (0)