Allow ancient samples

awohns · hyanwong · commit b87780747256 · 2022-09-08T09:07:05.000+01:00
diff --git a/tsdate/core.py b/tsdate/core.py
@@ -151,7 +151,7 @@ def _lik(muts, span, dt, mutation_rate, normalize=True):
         """
         ll = scipy.stats.poisson.pmf(muts, dt * mutation_rate * span)
         if normalize:
-            return ll / np.max(ll)
+            return ll / np.nanmax(ll)
         else:
             return ll
 
@@ -258,15 +258,28 @@ def get_mut_lik_fixed_node(self, edge):
 
         mutations_on_edge = self.mut_edges[edge.id]
         child_time = self.ts.node(edge.child).time
-        assert child_time == 0
-        # Temporary hack - we should really take a more precise likelihood
-        return self._lik(
-            mutations_on_edge,
-            edge.span,
-            self.timediff,
-            self.mut_rate,
-            normalize=self.normalize,
-        )
+        if child_time == 0:
+            return self._lik(
+                mutations_on_edge,
+                edge.span,
+                self.timediff,
+                self.mut_rate,
+                normalize=self.normalize,
+            )
+        else:
+            timediff = self.timepoints - child_time + 1e-8
+            # Temporary hack - we should really take a more precise likelihood
+            likelihood = self._lik(
+                mutations_on_edge,
+                edge.span,
+                timediff,
+                self.mut_rate,
+                normalize=self.normalize,
+            )
+            # Prevent child from being older than parent
+            likelihood[timediff < 0] = 0
+
+            return likelihood
 
     def get_mut_lik_lower_tri(self, edge):
         """
@@ -429,7 +442,7 @@ def _lik(muts, span, dt, mutation_rate, normalize=True):
         """
         ll = scipy.stats.poisson.logpmf(muts, dt * mutation_rate * span)
         if normalize:
-            return ll - np.max(ll)
+            return ll - np.nanmax(ll)
         else:
             return ll
 
@@ -677,10 +690,13 @@ def inside_pass(self, *, normalize=True, cache_inside=False, progress=None):
                     )
                     edge_lik = self.lik.get_inside(daughter_val, edge)
                 val = self.lik.combine(val, edge_lik)
+                if np.all(val == 0):
+                    raise ValueError
                 if cache_inside:
                     g_i[edge.id] = edge_lik
             norm[parent] = np.max(val) if normalize else 1
             inside[parent] = self.lik.reduce(val, norm[parent])
+
         if cache_inside:
             self.g_i = self.lik.reduce(g_i, norm[self.ts.tables.edges.child, None])
         # Keep the results in this object
@@ -1064,10 +1080,6 @@ def get_dates(
 
     :return: tuple(mn_post, posterior, timepoints, eps, nodes_to_date)
     """
-    # Stuff yet to be implemented. These can be deleted once fixed
-    for sample in tree_sequence.samples():
-        if tree_sequence.node(sample).time != 0:
-            raise NotImplementedError("Samples must all be at time 0")
     fixed_nodes = set(tree_sequence.samples())
 
     # Default to not creating approximate priors unless ts has > 1000 samples
diff --git a/tsdate/prior.py b/tsdate/prior.py
@@ -419,10 +419,10 @@ def __init__(self, tree_sequence, *, progress=False, allow_unary=False):
 
         self.ts = tree_sequence
         self.sample_node_set = set(self.ts.samples())
-        if np.any(self.ts.tables.nodes.time[self.ts.samples()] != 0):
-            raise ValueError(
-                "The SpansBySamples class needs a tree seq with all samples at time 0"
-            )
+        #if np.any(self.ts.tables.nodes.time[self.ts.samples()] != 0):
+        #    raise ValueError(
+        #        "The SpansBySamples class needs a tree seq with all samples at time 0"
+        #    )
         self.progress = progress
 
         # We will store the spans in here, and normalize them at the end
@@ -996,6 +996,59 @@ def fill_priors(node_parameters, timepoints, ts, Ne, *, prior_distr, progress=Fa
     return prior_times
 
 
+def truncate_priors(ts, sample_times, priors, nodes_to_date=None, progress=False):
+    """
+    Zero out prior mass at timepoints younger than each node's oldest child.
+
+    :param ts: tree sequence; its node count, samples and edge table are read.
+    :param sample_times: minimum time of every node, indexed by node id.
+        NOTE(review): presumably 0 for non-sample nodes -- confirm with caller.
+    :param priors: prior grid; ``grid_data`` is truncated, renormalised per row
+        and overwritten in place.
+    :param nodes_to_date: ids of nodes to constrain (default: all non-samples).
+    :param progress: whether to display a progress bar.
+    :return: tuple(constrained_min_times, constrained_max_times, priors)
+    :raises ValueError: on too-old sample times or an unknown probability space.
+    NOTE(review): visits nodes in id order -- assumes children precede parents.
+    """
+    timepoints = priors.timepoints
+    if np.max(sample_times) >= np.max(timepoints):
+        raise ValueError("Sample times cannot be larger than the oldest timepoint")
+    space = priors.probability_space
+    if space not in ("linear", "logarithmic"):
+        raise ValueError(f"Unknown probability space: {space}")
+    zero_value = 0 if space == "linear" else -np.inf
+    grid_data = np.copy(priors.grid_data[:])
+    constrained_min_times = np.copy(sample_times)
+    constrained_max_times = np.full(sample_times.shape[0], np.inf)
+    if nodes_to_date is None:
+        nodes_to_date = np.arange(ts.num_nodes)
+        nodes_to_date = nodes_to_date[~np.isin(nodes_to_date, ts.samples())]
+    nodes_to_date = np.array(sorted(nodes_to_date), dtype=np.int64)
+
+    # Sort edges by parent so that each node's children form a contiguous run, then
+    # find each run by binary search; undated parents cannot leak into another run.
+    edges = ts.tables.edges
+    order = np.argsort(edges.parent, kind="stable")
+    sorted_parents, sorted_children = edges.parent[order], edges.child[order]
+    run_start = np.searchsorted(sorted_parents, nodes_to_date, side="left")
+    run_end = np.searchsorted(sorted_parents, nodes_to_date, side="right")
+    runs = zip(nodes_to_date, run_start, run_end)
+    for nd, lo, hi in tqdm(runs, desc="Constrain Ages", disable=not progress):
+        if lo == hi:
+            continue  # no child edges, so nothing constrains this node
+        # The constrained time of the node is at least the age of its oldest child
+        oldest = np.max(constrained_min_times[sorted_children[lo:hi]])
+        constrained_min_times[nd] = max(constrained_min_times[nd], oldest)
+        nearest_time = np.argmin(np.abs(timepoints - oldest))
+        grid_data[priors.row_lookup[int(nd)], :nearest_time] = zero_value
+    assert np.all(constrained_min_times < constrained_max_times)
+
+    # Renormalise every row by its maximum, taken over all columns except column 0
+    peak = grid_data[:, 1:].max(axis=1)[:, np.newaxis]
+    priors.grid_data[:] = grid_data / peak if space == "linear" else grid_data - peak
+    return constrained_min_times, constrained_max_times, priors
+
 def build_grid(
     tree_sequence,
     Ne,
@@ -1007,7 +1060,7 @@ def build_grid(
     eps=1e-6,
     # Parameters below undocumented
     progress=False,
-    allow_unary=False,
+    sample_times=None
 ):
     """
     Using the conditional coalescent, calculate the prior distribution for the age of
@@ -1038,6 +1091,8 @@ def build_grid(
         inference and a discretised time grid
     :rtype:  base.NodeGridValues Object
     """
+    #tree_sequence = tree_sequence.simplify(tree_sequence.samples())
+
     if Ne <= 0:
         raise ValueError("Parameter 'Ne' must be greater than 0")
     if approximate_priors:
@@ -1049,19 +1104,13 @@ def build_grid(
                 "Can't set approx_prior_size if approximate_prior is False"
             )
 
-    contmpr_ts, node_map = util.reduce_to_contemporaneous(tree_sequence)
-    if contmpr_ts.num_nodes != tree_sequence.num_nodes:
-        raise ValueError(
-            "Passed tree sequence is not simplified and/or contains "
-            "noncontemporaneous samples"
-        )
-    span_data = SpansBySamples(contmpr_ts, progress=progress, allow_unary=allow_unary)
+    span_data = SpansBySamples(tree_sequence, progress=progress)
 
     base_priors = ConditionalCoalescentTimes(
         approx_prior_size, Ne, prior_distribution, progress=progress
     )
 
-    base_priors.add(contmpr_ts.num_samples, approximate_priors)
+    base_priors.add(tree_sequence.num_samples, approximate_priors)
     for total_fixed in span_data.total_fixed_at_0_counts:
         # For missing data: trees vary in total fixed node count => have different priors
         if total_fixed > 0:
@@ -1085,9 +1134,7 @@ def build_grid(
     else:
         raise ValueError("time_slices must be an integer or a numpy array of floats")
 
-    prior_params_contmpr = base_priors.get_mixture_prior_params(span_data)
-    # Map the nodes in the prior params back to the node ids in the original ts
-    prior_params = prior_params_contmpr[node_map, :]
+    prior_params = base_priors.get_mixture_prior_params(span_data)
     # Set all fixed nodes (i.e. samples) to have 0 variance
     priors = fill_priors(
         prior_params,
@@ -1097,4 +1144,7 @@ def build_grid(
         prior_distr=prior_distribution,
         progress=progress,
     )
+    if np.any(tree_sequence.tables.nodes.time[tree_sequence.samples()] != 0):
+        if False:
+            priors = truncate_priors(tree_sequence, sample_times, priors, eps, progress=progress)
     return priors