Merge pull request #244 from hyanwong/fix-posteriors

hyanwong · web-flow · commit 4e5de920603e · 2023-01-16T22:11:01.000Z
Change "normalize" to "standardize"
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -8,11 +8,15 @@
   individuals, populations, or sites, aiming to change the tree sequence tables as
   little as possible.
 
+- Not strictly breaking, as it is not in the published API, but the
+  ``outside_normalize`` flag in ``get_dates`` and the internal ``normalize``
+  terminology now use ``standardize``, as the maximum (not the sum) is one.
+
 **Bugfixes**
 
 - The returned posteriors when ``return_posteriors=True`` now return actual
-  probabilities (scaled so that they sum to one) rather than normalised
-  probabilites whose maximum value is one.
+  probabilities (scaled so that they sum to one) rather than standardized
+  "probabilities" whose maximum value is one.
 
 --------------------
 [0.1.5] - 2022-06-07
diff --git a/tests/test_functions.py b/tests/test_functions.py
@@ -570,17 +570,17 @@ def test_one_tree_n2_intervals(self):
 
 
 class TestLikelihoodClass:
-    def poisson(self, param, x, normalize=True):
+    def poisson(self, param, x, standardize=True):
         ll = np.exp(-param) * param**x / scipy.special.factorial(x)
-        if normalize:
+        if standardize:
             return ll / np.max(ll)
         else:
             return ll
 
-    def log_poisson(self, param, x, normalize=True):
+    def log_poisson(self, param, x, standardize=True):
         with np.errstate(divide="ignore"):
             ll = np.log(np.exp(-param) * param**x / scipy.special.factorial(x))
-        if normalize:
+        if standardize:
             return ll - np.max(ll)
         else:
             return ll
@@ -669,8 +669,8 @@ def test_precalc_lik_upper_multithread(self):
             (Likelihoods, self.poisson),
             (LogLikelihoods, self.log_poisson),
         ]:
-            for normalize in (True, False):
-                lik = L(ts, grid, mut_rate, eps, normalize=normalize)
+            for standardize in (True, False):
+                lik = L(ts, grid, mut_rate, eps, standardize=standardize)
                 dt = grid
                 for num_threads in (None, 1, 2):
                     n_internal_edges = 0
@@ -691,7 +691,7 @@ def test_precalc_lik_upper_multithread(self):
                             expected_lik_dt = pois(
                                 dt * (mut_rate * span),
                                 num_muts,
-                                normalize=normalize,
+                                standardize=standardize,
                             )
                             upper_tri = lik.get_mut_lik_upper_tri(edge)
 
@@ -946,7 +946,7 @@ def test_nonmatching_prior_vs_lik_fixednodes(self):
 
 
 class TestInsideAlgorithm:
-    def run_inside_algorithm(self, ts, prior_distr, normalize=True, **kwargs):
+    def run_inside_algorithm(self, ts, prior_distr, standardize=True, **kwargs):
         Ne = 0.5
         priors = tsdate.build_prior_grid(
             ts,
@@ -961,7 +961,7 @@ def run_inside_algorithm(self, ts, prior_distr, normalize=True, **kwargs):
         lls = Likelihoods(ts, priors.timepoints, mut_rate, eps=eps)
         lls.precalculate_mutation_likelihoods()
         algo = InOutAlgorithms(priors, lls)
-        algo.inside_pass(normalize=normalize)
+        algo.inside_pass(standardize=standardize)
         return algo, priors
 
     def test_one_tree_n2(self):
@@ -989,7 +989,7 @@ def test_polytomy_tree(self):
 
     def test_two_tree_ts(self):
         ts = utility_functions.two_tree_ts()
-        algo, priors = self.run_inside_algorithm(ts, "gamma", normalize=False)
+        algo, priors = self.run_inside_algorithm(ts, "gamma", standardize=False)
         mut_rate = 0.5
         # priors[3][1] * Ll_(0->3)(1.2 - 0 + eps) ** 2
         node3_t1 = (
@@ -1098,7 +1098,7 @@ def test_dangling_fails(self):
 
 class TestOutsideAlgorithm:
     def run_outside_algorithm(
-        self, ts, prior_distr="lognorm", normalize=False, ignore_oldest_root=False
+        self, ts, prior_distr="lognorm", standardize=False, ignore_oldest_root=False
     ):
         span_data = SpansBySamples(ts)
         Ne = 0.5
@@ -1113,7 +1113,9 @@ def run_outside_algorithm(
         lls.precalculate_mutation_likelihoods()
         algo = InOutAlgorithms(prior_vals, lls)
         algo.inside_pass()
-        algo.outside_pass(normalize=normalize, ignore_oldest_root=ignore_oldest_root)
+        algo.outside_pass(
+            standardize=standardize, ignore_oldest_root=ignore_oldest_root
+        )
         return algo
 
     def test_one_tree_n2(self):
@@ -1157,17 +1159,17 @@ def test_outside_before_inside_fails(self):
         with pytest.raises(RuntimeError):
             algo.outside_pass()
 
-    def test_normalize_outside(self):
+    def test_standardize_outside(self):
         ts = msprime.simulate(
             50, Ne=10000, mutation_rate=1e-8, recombination_rate=1e-8, random_seed=12
         )
-        normalize = self.run_outside_algorithm(ts, normalize=True)
-        no_normalize = self.run_outside_algorithm(ts, normalize=False)
+        standardize = self.run_outside_algorithm(ts, standardize=True)
+        no_standardize = self.run_outside_algorithm(ts, standardize=False)
         assert np.allclose(
-            normalize.outside.grid_data[:],
+            standardize.outside.grid_data[:],
             (
-                no_normalize.outside.grid_data[:]
-                / np.max(no_normalize.outside.grid_data[:], axis=1)[:, np.newaxis]
+                no_standardize.outside.grid_data[:]
+                / np.max(no_standardize.outside.grid_data[:], axis=1)[:, np.newaxis]
             ),
         )
 
@@ -1213,7 +1215,7 @@ def find_posterior(self, ts, prior_distr):
         lls.precalculate_mutation_likelihoods()
         algo = InOutAlgorithms(prior_vals, lls)
         algo.inside_pass()
-        posterior = algo.outside_pass(normalize=False)
+        posterior = algo.outside_pass(standardize=False)
         assert np.array_equal(
             np.sum(algo.inside.grid_data * algo.outside.grid_data, axis=1),
             np.sum(algo.inside.grid_data * algo.outside.grid_data, axis=1),
@@ -1278,11 +1280,11 @@ def test_gil_tree(self):
             prior_vals.grid_data[1] = [0, 0.05, 0.1, 0.2, 0.45, 0.1, 0.1]
             mut_rate = 1
             eps = 0.01
-            lls = Likelihoods(ts, grid, mut_rate, eps=eps, normalize=False)
+            lls = Likelihoods(ts, grid, mut_rate, eps=eps, standardize=False)
             lls.precalculate_mutation_likelihoods()
             algo = InOutAlgorithms(prior_vals, lls)
-            algo.inside_pass(normalize=False, cache_inside=cache_inside)
-            algo.outside_pass(normalize=False)
+            algo.inside_pass(standardize=False, cache_inside=cache_inside)
+            algo.outside_pass(standardize=False)
             assert np.allclose(
                 np.sum(algo.inside.grid_data * algo.outside.grid_data, axis=1),
                 [7.44449e-05, 7.44449e-05],
diff --git a/tsdate/base.py b/tsdate/base.py
@@ -1,6 +1,7 @@
 # MIT License
 #
-# Copyright (c) 2020 University of Oxford
+# Copyright (c) 2021-23 Tskit Developers
+# Copyright (c) 2020-21 University of Oxford
 #
 # Permission is hereby granted, free of charge, to any person obtaining a copy
 # of this software and associated documentation files (the "Software"), to deal
@@ -127,9 +128,9 @@ def force_probability_space(self, probability_space):
         else:
             logging.warning("Cannot force", *descr)
 
-    def normalize(self):
+    def standardize(self):
         """
-        normalize grid data so the max is one (in linear space) or zero
+        Standardize grid data so the max for each row is one (in linear space) or zero
         (in logarithmic space)
 
         TODO - is it clear why we omit the first element of the
diff --git a/tsdate/core.py b/tsdate/core.py
@@ -50,6 +50,9 @@ class Likelihoods:
     A class to store and process likelihoods. Likelihoods for edges are stored as a
     flattened lower triangular matrix of all the possible delta t's. This class also
     provides methods for accessing this lower triangular matrix, multiplying it, etc.
+
+    If ``standardize`` is true, the likelihoods are rescaled so that their
+    maximum is one (in linear space) or zero (in log space).
     """
 
     probability_space = base.LIN
@@ -65,7 +68,7 @@ def __init__(
         *,
         eps=0,
         fixed_node_set=None,
-        normalize=True,
+        standardize=True,
         progress=False,
     ):
         self.ts = ts
@@ -75,7 +78,7 @@ def __init__(
         )
         self.mut_rate = mutation_rate
         self.rec_rate = recombination_rate
-        self.normalize = normalize
+        self.standardize = standardize
         self.grid_size = len(timepoints)
         self.tri_size = self.grid_size * (self.grid_size + 1) / 2
         self.ll_mut = {}
@@ -145,25 +148,25 @@ def get_mut_edges(ts):
         return mut_edges
 
     @staticmethod
-    def _lik(muts, span, dt, mutation_rate, normalize=True):
+    def _lik(muts, span, dt, mutation_rate, standardize=True):
         """
         The likelihood of an edge given a number of mutations, as set of time deltas (dt)
         and a span. This is a static function to allow parallelization
         """
         ll = scipy.stats.poisson.pmf(muts, dt * mutation_rate * span)
-        if normalize:
+        if standardize:
             return ll / np.max(ll)
         else:
             return ll
 
     @staticmethod
-    def _lik_wrapper(muts_span, dt, mutation_rate, normalize=True):
+    def _lik_wrapper(muts_span, dt, mutation_rate, standardize=True):
         """
         A wrapper to allow this _lik to be called by pool.imap_unordered, returning the
         mutation and span values
         """
         return muts_span, Likelihoods._lik(
-            muts_span[0], muts_span[1], dt, mutation_rate, normalize=normalize
+            muts_span[0], muts_span[1], dt, mutation_rate, standardize=standardize
         )
 
     def precalculate_mutation_likelihoods(self, num_threads=None, unique_method=0):
@@ -206,7 +209,7 @@ def precalculate_mutation_likelihoods(self, num_threads=None, unique_method=0):
                 self._lik_wrapper,
                 dt=self.timediff_lower_tri,
                 mutation_rate=self.mut_rate,
-                normalize=self.normalize,
+                standardize=self.standardize,
             )
             if num_threads == 1:
                 # Useful for testing
@@ -240,7 +243,7 @@ def precalculate_mutation_likelihoods(self, num_threads=None, unique_method=0):
                     span,
                     dt=self.timediff_lower_tri,
                     mutation_rate=self.mut_rate,
-                    normalize=self.normalize,
+                    standardize=self.standardize,
                 )
 
     def get_mut_lik_fixed_node(self, edge):
@@ -266,7 +269,7 @@ def get_mut_lik_fixed_node(self, edge):
             edge.span,
             self.timediff,
             self.mut_rate,
-            normalize=self.normalize,
+            standardize=self.standardize,
         )
 
     def get_mut_lik_lower_tri(self, edge):
@@ -423,24 +426,24 @@ def logsumexp(X):
         return np.log(r) + alpha
 
     @staticmethod
-    def _lik(muts, span, dt, mutation_rate, normalize=True):
+    def _lik(muts, span, dt, mutation_rate, standardize=True):
         """
         The likelihood of an edge given a number of mutations, as set of time deltas (dt)
         and a span. This is a static function to allow parallelization
         """
         ll = scipy.stats.poisson.logpmf(muts, dt * mutation_rate * span)
-        if normalize:
+        if standardize:
             return ll - np.max(ll)
         else:
             return ll
 
     @staticmethod
-    def _lik_wrapper(muts_span, dt, mutation_rate, normalize=True):
+    def _lik_wrapper(muts_span, dt, mutation_rate, standardize=True):
         """
         Needs redefining to refer to the LogLikelihoods class
         """
         return muts_span, LogLikelihoods._lik(
-            muts_span[0], muts_span[1], dt, mutation_rate, normalize=normalize
+            muts_span[0], muts_span[1], dt, mutation_rate, standardize=standardize
         )
 
     def rowsum_lower_tri(self, input_array):
@@ -626,7 +629,7 @@ def edges_by_child_then_parent_desc(self):
 
     # === MAIN ALGORITHMS ===
 
-    def inside_pass(self, *, normalize=True, cache_inside=False, progress=None):
+    def inside_pass(self, *, standardize=True, cache_inside=False, progress=None):
         """
         Use dynamic programming to find approximate posterior to sample from
         """
@@ -639,7 +642,7 @@ def inside_pass(self, *, normalize=True, cache_inside=False, progress=None):
             g_i = np.full(
                 (self.ts.num_edges, self.lik.grid_size), self.lik.identity_constant
             )
-        norm = np.full(self.ts.num_nodes, np.nan)
+        denominator = np.full(self.ts.num_nodes, np.nan)
         # Iterate through the nodes via groupby on parent node
         for parent, edges in tqdm(
             self.edges_by_parent_asc(),
@@ -680,18 +683,22 @@ def inside_pass(self, *, normalize=True, cache_inside=False, progress=None):
                 val = self.lik.combine(val, edge_lik)
                 if cache_inside:
                     g_i[edge.id] = edge_lik
-            norm[parent] = np.max(val) if normalize else 1
-            inside[parent] = self.lik.reduce(val, norm[parent])
+            denominator[parent] = (
+                np.max(val) if standardize else self.lik.identity_constant
+            )
+            inside[parent] = self.lik.reduce(val, denominator[parent])
         if cache_inside:
-            self.g_i = self.lik.reduce(g_i, norm[self.ts.tables.edges.child, None])
+            self.g_i = self.lik.reduce(
+                g_i, denominator[self.ts.tables.edges.child, None]
+            )
         # Keep the results in this object
         self.inside = inside
-        self.norm = norm
+        self.denominator = denominator
 
     def outside_pass(
         self,
         *,
-        normalize=False,
+        standardize=False,
         ignore_oldest_root=False,
         progress=None,
     ):
@@ -700,8 +707,8 @@ def outside_pass(
         posterior values. These are *not* probabilities, as they do not sum to one:
         to convert to probabilities, call posterior.to_probabilities()
 
-        Normalising *during* the outside process may be necessary if there is overflow,
-        but means that we cannot  check the total functional value at each node
+        Standardizing *during* the outside process may be necessary if there is
+        overflow, but means that we cannot check the total functional value at each node.
 
         Ignoring the oldest root may also be necessary when the oldest root node
         causes numerical stability issues.
@@ -750,7 +757,7 @@ def outside_pass(
                         spanfrac, self.lik.make_lower_tri(self.inside[edge.child])
                     )
                     edge_lik = self.lik.get_inside(daughter_val, edge)
-                    cur_g_i = self.lik.reduce(edge_lik, self.norm[child])
+                    cur_g_i = self.lik.reduce(edge_lik, self.denominator[child])
                     inside_div_gi = self.lik.reduce(
                         self.inside[edge.parent], cur_g_i, div_0_null=True
                     )
@@ -760,15 +767,15 @@ def outside_pass(
                         self.lik.combine(outside[edge.parent], inside_div_gi)
                     ),
                 )
-                if normalize:
+                if standardize:
                     parent_val = self.lik.reduce(parent_val, np.max(parent_val))
                 edge_lik = self.lik.get_outside(parent_val, edge)
                 val = self.lik.combine(val, edge_lik)
 
             # vv[0] = 0  # Seems a hack: internal nodes should be allowed at time 0
-            assert self.norm[edge.child] > self.lik.null_constant
-            outside[child] = self.lik.reduce(val, self.norm[child])
-            if normalize:
+            assert self.denominator[edge.child] > self.lik.null_constant
+            outside[child] = self.lik.reduce(val, self.denominator[child])
+            if standardize:
                 outside[child] = self.lik.reduce(val, np.max(val))
         self.outside = outside
         posterior = outside.clone_with_new_data(
@@ -1054,7 +1061,7 @@ def get_dates(
     eps=1e-6,
     num_threads=None,
     method="inside_outside",
-    outside_normalize=True,
+    outside_standardize=True,
     ignore_oldest_root=False,
     progress=False,
     cache_inside=False,
@@ -1134,10 +1141,10 @@ def get_dates(
     posterior = None
     if method == "inside_outside":
         posterior = dynamic_prog.outside_pass(
-            normalize=outside_normalize, ignore_oldest_root=ignore_oldest_root
+            standardize=outside_standardize, ignore_oldest_root=ignore_oldest_root
         )
         # Turn the posterior into probabilities
-        posterior.normalize()  # Just to make sure there are no floating point issues
+        posterior.standardize()  # Just to make sure there are no floating point issues
         posterior.force_probability_space(base.LIN)
         posterior.to_probabilities()
         tree_sequence, mn_post, _ = posterior_mean_var(
diff --git a/tsdate/prior.py b/tsdate/prior.py