Rework build-prior and inside / outside logic to allow historical samples

hyanwong · hyanwong · commit 8e28db2d2145 · 2022-09-08T09:07:21.000+01:00
And speed up time constraint algorithms while also allowing nodes to be out of time order
diff --git a/requirements.txt b/requirements.txt
@@ -1,4 +1,4 @@
-tskit>=0.4.0
+tskit>=0.5.2
 tsinfer>=0.2.0
 flake8
 numpy
diff --git a/tests/test_functions.py b/tests/test_functions.py
@@ -742,14 +742,16 @@ def test_logsumexp(self):
         assert np.allclose(LogLikelihoods.logsumexp(log_lls), np.log(ll_sum))
 
     def test_zeros_logsumexp(self):
-        lls = np.log(np.concatenate([np.zeros(100), np.random.rand(1000)]))
-        assert np.allclose(LogLikelihoods.logsumexp(lls), self.naive_logsumexp(lls))
+        with np.errstate(divide="ignore"):
+            lls = np.log(np.concatenate([np.zeros(100), np.random.rand(1000)]))
+            assert np.allclose(LogLikelihoods.logsumexp(lls), self.naive_logsumexp(lls))
 
     def test_logsumexp_underflow(self):
         # underflow in the naive case, but not in the LogLikelihoods implementation
-        lls = np.array([-1000, -1001])
-        assert self.naive_logsumexp(lls) == -np.inf
-        assert LogLikelihoods.logsumexp(lls) != -np.inf
+        with np.errstate(divide="ignore"):
+            lls = np.array([-1000, -1001])
+            assert self.naive_logsumexp(lls) == -np.inf
+            assert LogLikelihoods.logsumexp(lls) != -np.inf
 
     def test_log_tri_functions(self):
         ts = utility_functions.two_tree_mutation_ts()
@@ -1047,7 +1049,7 @@ def test_dangling_fails(self):
         print(ts.draw_text())
         print("Samples:", ts.samples())
         Ne = 0.5
-        with pytest.raises(ValueError, match="simplified"):
+        with pytest.raises(ValueError, match="simplify"):
             tsdate.build_prior_grid(ts, Ne, timepoints=np.array([0, 1.2, 2]))
         # mut_rate = 1
         # eps = 1e-6
@@ -1420,7 +1422,7 @@ def test_date_input(self):
 
     def test_sample_as_parent_fails(self):
         ts = utility_functions.single_tree_ts_n3_sample_as_parent()
-        with pytest.raises(NotImplementedError):
+        with pytest.raises(ValueError, match="samples at non-zero times"):
             tsdate.date(ts, mutation_rate=None, Ne=1)
 
     def test_recombination_not_implemented(self):
@@ -1531,18 +1533,7 @@ def test_constrain_ages_topo(self):
         ts = utility_functions.two_tree_ts()
         post_mn = np.array([0.0, 0.0, 0.0, 2.0, 1.0, 3.0])
         eps = 1e-6
-        nodes_to_date = np.array([3, 4, 5])
-        constrained_ages = constrain_ages_topo(ts, post_mn, eps, nodes_to_date)
-        assert np.array_equal(
-            np.array([0.0, 0.0, 0.0, 2.0, 2.000001, 3.0]), constrained_ages
-        )
-
-    def test_constrain_ages_topo_no_nodes_to_date(self):
-        ts = utility_functions.two_tree_ts()
-        post_mn = np.array([0.0, 0.0, 0.0, 2.0, 1.0, 3.0])
-        eps = 1e-6
-        nodes_to_date = None
-        constrained_ages = constrain_ages_topo(ts, post_mn, eps, nodes_to_date)
+        constrained_ages = constrain_ages_topo(ts, post_mn, eps)
         assert np.array_equal(
             np.array([0.0, 0.0, 0.0, 2.0, 2.000001, 3.0]), constrained_ages
         )
diff --git a/tests/test_inference.py b/tests/test_inference.py
@@ -61,7 +61,7 @@ def test_bad_Ne(self):
 
     def test_dangling_failure(self):
         ts = utility_functions.single_tree_ts_n2_dangling()
-        with pytest.raises(ValueError, match="simplified"):
+        with pytest.raises(ValueError, match="simplify"):
             tsdate.date(ts, mutation_rate=None, Ne=1)
 
     def test_unary_failure(self):
@@ -271,16 +271,29 @@ def test_fails_multi_root(self):
         with pytest.raises(ValueError):
             tsdate.date(multiroot_ts, Ne=1, mutation_rate=2, priors=good_priors)
 
-    def test_non_contemporaneous(self):
+    def test_non_contemporaneous_warn(self):
         samples = [
             msprime.Sample(population=0, time=0),
             msprime.Sample(population=0, time=0),
             msprime.Sample(population=0, time=0),
             msprime.Sample(population=0, time=1.0),
         ]
         ts = msprime.simulate(samples=samples, Ne=1, mutation_rate=2, random_seed=12)
-        with pytest.raises(NotImplementedError):
+        with pytest.raises(ValueError, match="samples at non-zero times"):
             tsdate.date(ts, Ne=1, mutation_rate=2)
+        with pytest.raises(ValueError, match="samples at non-zero times"):
+            tsdate.build_prior_grid(ts, Ne=1)
+
+    def test_non_contemporaneous(self):
+        samples = [
+            msprime.Sample(population=0, time=0),
+            msprime.Sample(population=0, time=0),
+            msprime.Sample(population=0, time=0),
+            msprime.Sample(population=0, time=1.0),
+        ]
+        ts = msprime.simulate(samples=samples, Ne=1, mutation_rate=2, random_seed=12)
+        priors = tsdate.build_prior_grid(ts, Ne=1, allow_historical_samples=True)
+        tsdate.date(ts, priors=priors, mutation_rate=2)
 
     def test_no_mutation_times(self):
         ts = msprime.simulate(20, Ne=1, mutation_rate=1, random_seed=12)
diff --git a/tsdate/base.py b/tsdate/base.py
@@ -95,6 +95,12 @@ def __init__(
         ] = (-np.arange(num_nodes - self.num_nonfixed) - 1)
         self.probability_space = LIN
 
+    def fixed_node_ids(self):
+        return np.where(self.row_lookup < 0)[0]
+
+    def nonfixed_node_ids(self):
+        return np.where(self.row_lookup >= 0)[0]
+
     def force_probability_space(self, probability_space):
         """
         probability_space can be "logarithmic" or "linear": this function will force
@@ -119,7 +125,7 @@ def force_probability_space(self, probability_space):
             if self.probability_space == LOG:
                 pass
             elif self.probability_space == LIN:
-                with np.errstate(divide="ignore"):
+                with np.errstate(divide="ignore", invalid="ignore"):
                     self.grid_data = np.log(self.grid_data)
                     self.fixed_data = np.log(self.fixed_data)
                 self.probability_space = LOG
@@ -140,6 +146,9 @@ def normalize(self):
         else:
             raise RuntimeError("Probability space is not", LIN, "or", LOG)
 
+    def is_fixed(self, node_id):
+        return self.row_lookup[node_id] < 0
+
     def __getitem__(self, node_id):
         index = self.row_lookup[node_id]
         if index < 0:
@@ -207,8 +216,7 @@ def fill_fixed(orig, fixed_data):
             new_obj.fixed_data = fill_fixed(
                 self, grid_data if fixed_data is None else fixed_data
             )
-        if probability_space is None:
-            new_obj.probability_space = self.probability_space
-        else:
-            new_obj.probability_space = probability_space
+        new_obj.probability_space = self.probability_space
+        if probability_space is not None:
+            new_obj.force_probability_space(probability_space)
         return new_obj
diff --git a/tsdate/core.py b/tsdate/core.py
@@ -402,7 +402,7 @@ def get_fixed(self, arr, edge):
         return arr * liks
 
     def scale_geometric(self, fraction, value):
-        return value**fraction
+        return value ** fraction
 
 
 class LogLikelihoods(Likelihoods):
@@ -647,11 +647,22 @@ def inside_pass(self, *, normalize=True, cache_inside=False, progress=None):
         inside = self.priors.clone_with_new_data(  # store inside matrix values
             grid_data=np.nan, fixed_data=self.lik.identity_constant
         )
+        # It is possible that a simple node is non-fixed, in which case we want to
+        # provide an inside array that reflects the prior distribution
+        nonfixed_samples = np.intersect1d(inside.nonfixed_node_ids(), self.ts.samples())
+        for u in nonfixed_samples:
+            # this is in the same probability space as the prior, so we should be
+            # OK just to copy the prior values straight in. It's unclear to me (Yan)
+            # how/if they should be normalised, however
+            inside[u][:] = self.priors[u]
+
         if cache_inside:
             g_i = np.full(
                 (self.ts.num_edges, self.lik.grid_size), self.lik.identity_constant
             )
         norm = np.full(self.ts.num_nodes, np.nan)
+        to_visit = np.zeros(self.ts.num_nodes, dtype=bool)
+        to_visit[inside.nonfixed_node_ids()] = True
         # Iterate through the nodes via groupby on parent node
         for parent, edges in tqdm(
             self.edges_by_parent_asc(),
@@ -686,16 +697,22 @@ def inside_pass(self, *, normalize=True, cache_inside=False, progress=None):
                             "dangling nodes: please simplify it"
                         )
                     daughter_val = self.lik.scale_geometric(
-                        spanfrac, self.lik.make_lower_tri(inside[edge.child])
+                        spanfrac, self.lik.make_lower_tri(inside_values)
                     )
                     edge_lik = self.lik.get_inside(daughter_val, edge)
                 val = self.lik.combine(val, edge_lik)
                 if np.all(val == 0):
                     raise ValueError
                 if cache_inside:
                     g_i[edge.id] = edge_lik
-            norm[parent] = np.max(val) if normalize else 1
+            norm[parent] = np.max(val) if normalize else self.lik.identity_constant
             inside[parent] = self.lik.reduce(val, norm[parent])
+            to_visit[parent] = False
+
+        # There may be nodes that are not parents but are also not fixed (e.g.
+        # undated sample nodes). These need an identity normalization constant
+        for unfixed_unvisited in np.where(to_visit)[0]:
+            norm[unfixed_unvisited] = self.lik.identity_constant
 
         if cache_inside:
             self.g_i = self.lik.reduce(g_i, norm[self.ts.tables.edges.child, None])
@@ -913,34 +930,32 @@ def posterior_mean_var(ts, posterior, *, fixed_node_set=None):
     return ts, mn_post, vr_post
 
 
-def constrain_ages_topo(ts, post_mn, eps, nodes_to_date=None, progress=False):
+def constrain_ages_topo(ts, node_times, eps, progress=False):
     """
-    If predicted node times violate topology, restrict node ages so that they
-    must be older than all their children.
+    If node_times violate topology, return increased node_times so that each node is
+    guaranteed to be older than any of its their children.
     """
-    new_mn_post = np.copy(post_mn)
-    if nodes_to_date is None:
-        nodes_to_date = np.arange(ts.num_nodes, dtype=np.uint64)
-        nodes_to_date = nodes_to_date[~np.isin(nodes_to_date, ts.samples())]
-
-    tables = ts.tables
-    parents = tables.edges.parent
-    nd_children = tables.edges.child[np.argsort(parents)]
-    parents = sorted(parents)
-    parents_unique = np.unique(parents, return_index=True)
-    parent_indices = parents_unique[1][np.isin(parents_unique[0], nodes_to_date)]
-    for index, nd in tqdm(
-        enumerate(sorted(nodes_to_date)), desc="Constrain Ages", disable=not progress
+    edges_parent = ts.edges_parent
+    edges_child = ts.edges_child
+
+    new_node_times = np.copy(node_times)
+    # Traverse through the ARG, ensuring children come before parents.
+    # This can be done by iterating over groups of edges with the same parent
+    new_parent_edge_idx = np.where(np.diff(edges_parent) != 0)[0] + 1
+    for edges_start, edges_end in tqdm(
+        zip(
+            itertools.chain([0], new_parent_edge_idx),
+            itertools.chain(new_parent_edge_idx, [len(edges_parent)]),
+        ),
+        desc="Constrain Ages",
+        disable=not progress,
     ):
-        if index + 1 != len(nodes_to_date):
-            children_index = np.arange(parent_indices[index], parent_indices[index + 1])
-        else:
-            children_index = np.arange(parent_indices[index], ts.num_edges)
-        children = nd_children[children_index]
-        time = np.max(new_mn_post[children])
-        if new_mn_post[nd] <= time:
-            new_mn_post[nd] = time + eps
-    return new_mn_post
+        parent = edges_parent[edges_start]
+        child_ids = edges_child[edges_start:edges_end]  # May contain dups
+        oldest_child_time = np.max(new_node_times[child_ids])
+        if oldest_child_time >= new_node_times[parent]:
+            new_node_times[parent] = oldest_child_time + eps
+    return new_node_times
 
 
 def date(
@@ -1031,7 +1046,7 @@ def date(
         progress=progress,
         **kwargs
     )
-    constrained = constrain_ages_topo(tree_sequence, dates, eps, nds, progress)
+    constrained = constrain_ages_topo(tree_sequence, dates, eps, progress)
     tables = tree_sequence.dump_tables()
     tables.time_units = time_units
     tables.nodes.time = constrained
@@ -1080,8 +1095,6 @@ def get_dates(
 
     :return: tuple(mn_post, posterior, timepoints, eps, nodes_to_date)
     """
-    fixed_nodes = set(tree_sequence.samples())
-
     # Default to not creating approximate priors unless ts has > 1000 samples
     approx_priors = False
     if tree_sequence.num_samples > 1000:
@@ -1109,6 +1122,8 @@ def get_dates(
             )
         priors = priors
 
+    fixed_nodes = set(priors.fixed_node_ids())
+
     if probability_space != base.LOG:
         liklhd = Likelihoods(
             tree_sequence,
diff --git a/tsdate/prior.py b/tsdate/prior.py

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-tskit>=0.4.0`
	`1`	`+tskit>=0.5.2`
`2`	`2`	`tsinfer>=0.2.0`
`3`	`3`	`flake8`
`4`	`4`	`numpy`