Allow nonfixed sample nodes

hyanwong · hyanwong · commit 252529cf434d · 2022-09-08T09:02:57.000+01:00
diff --git a/tsdate/prior.py b/tsdate/prior.py
@@ -952,29 +952,55 @@ def gamma_cdf(t_set, alpha, beta):
     return np.insert(t_set, 0, 0)
 
 
-def fill_priors(node_parameters, timepoints, ts, Ne, *, prior_distr, progress=False):
+def fill_priors(
+    node_parameters,
+    timepoints,
+    ts,
+    Ne,
+    *,
+    prior_distr,
+    node_var_override=None,
+    progress=False,
+):
     """
     Take the alpha and beta values from the node_parameters array, which contains
-    one row for each node in the TS (including fixed nodes)
-    and fill out a NodeGridValues object with the prior values from the
-    gamma or lognormal distribution with those parameters.
+    one row for each node in the TS (including fixed nodes, although alpha and beta
+    are ignored for these nodes) and fill out a NodeGridValues object with the prior
+    values from the gamma or lognormal distribution with those parameters.
+
+    For a description of `node_var_override`, see the parameter description in
+    the `build_grid` function.
 
     TODO - what if there is an internal fixed node? Should we truncate
     """
     if prior_distr == "lognorm":
         cdf_func = scipy.stats.lognorm.cdf
-        main_param = np.sqrt(node_parameters[:, PriorParams.field_index("beta")])
+        shape_param = np.sqrt(node_parameters[:, PriorParams.field_index("beta")])
         scale_param = np.exp(node_parameters[:, PriorParams.field_index("alpha")])
+
+        def shape_scale_from_mean_var(mean, var):
+            a, b = lognorm_approx(mean, var)
+            return np.sqrt(b), np.exp(a)
+
     elif prior_distr == "gamma":
         cdf_func = scipy.stats.gamma.cdf
-        main_param = node_parameters[:, PriorParams.field_index("alpha")]
-        scale_param = 1 / node_parameters[:, PriorParams.field_index("beta")]
+        shape_param = node_parameters[:, PriorParams.field_index("alpha")]
+        scale_param = 1.0 / node_parameters[:, PriorParams.field_index("beta")]
+
+        def shape_scale_from_mean_var(mean, var):
+            a, b = gamma_approx(mean, var)
+            return a, 1.0 / b
+
     else:
         raise ValueError("prior distribution must be lognorm or gamma")
-
+    if node_var_override is None:
+        node_var_override = {}
     datable_nodes = np.ones(ts.num_nodes, dtype=bool)
     datable_nodes[ts.samples()] = False
+    # Mark all nodes in node_var_override as datable
+    datable_nodes[list(node_var_override.keys())] = True
     datable_nodes = np.where(datable_nodes)[0]
+
     prior_times = base.NodeGridValues(
         ts.num_nodes,
         datable_nodes[np.argsort(ts.tables.nodes.time[datable_nodes])].astype(np.int32),
@@ -985,8 +1011,16 @@ def fill_priors(node_parameters, timepoints, ts, Ne, *, prior_distr, progress=Fa
     for node in tqdm(
         datable_nodes, desc="Assign Prior to Each Node", disable=not progress
     ):
+        if node in node_var_override:
+            shape, scale = shape_scale_from_mean_var(
+                mean=ts.node(node).time,
+                var=node_var_override[node],
+            )
+        else:
+            shape = shape_param[node]
+            scale = scale_param[node]
         with np.errstate(divide="ignore", invalid="ignore"):
-            prior_node = cdf_func(timepoints, main_param[node], scale=scale_param[node])
+            prior_node = cdf_func(timepoints, shape, scale=scale)
         # force age to be less than max value
         prior_node = np.divide(prior_node, np.max(prior_node))
         # prior in each epoch
@@ -999,7 +1033,7 @@ def fill_priors(node_parameters, timepoints, ts, Ne, *, prior_distr, progress=Fa
 def _truncate_priors(ts, priors, progress=False):
     """
     Truncate priors for the nodes listed in truncate_nodes (or all nonfixed nodes
-    if truncate_nodes in None) so they conform to the age of fixed nodes in the tree
+    if truncate_nodes is None) so they conform to the age of fixed nodes in the tree
     sequence
     """
     tables = ts.tables
@@ -1065,6 +1099,7 @@ def build_grid(
     prior_distribution="lognorm",
     allow_historical_samples=None,
     truncate_priors=None,
+    node_var_override=None,
     eps=1e-6,
     # Parameters below undocumented
     progress=False,
@@ -1100,6 +1135,13 @@ def build_grid(
         priors of their direct ancestor nodes so that the probability of being younger
         than the oldest descendant sample is zero. If the tree sequence is trustworthy
         this should give better restults. Default: `True`
+    :param dict node_var_override: is a dict mapping node IDs to a variance value.
+        Any nodes listed here will be treated as non-fixed nodes whose prior is not
+        calculated from the conditional coalescent but instead are allocated a prior
+        whose mean is the node time in the tree sequence and whose variance is the
+        value in this dictionary. This allows sample nodes to be treated as nonfixed
+        nodes, and therefore dated. If ``None`` (default) then all sample nodes are
+        treated as occurring at a fixed time (as if this were an empty dict).
     :param float eps: Specify minimum distance separating points in the time grid. Also
         specifies the error factor in time difference calculations. Default: 1e-6
     :return: A prior object to pass to tsdate.date() containing prior values for
@@ -1160,16 +1202,18 @@ def build_grid(
         tree_sequence,
         Ne,
         prior_distr=prior_distribution,
+        node_var_override=node_var_override,
         progress=progress,
     )
-    if np.any(tree_sequence.tables.nodes.time[tree_sequence.samples()] != 0):
+    tables = tree_sequence.tables
+    if np.any(tables.nodes.time[tree_sequence.samples()] > 0):
         if not allow_historical_samples:
             raise ValueError(
                 "There are samples at non-zero times, invalidating the conditional "
                 "coalescent prior. You can set allow_historical_samples=True to carry "
                 "on regardless, calculating a prior as if all samples were "
                 "contemporaneous (reasonable if you only have a few ancient samples)"
             )
-        if truncate_priors:
+        if np.any(tables.nodes.time[priors.fixed_node_ids()] > 0) and truncate_priors:
             priors = _truncate_priors(tree_sequence, priors, progress=progress)
     return priors