[core] use MCLE to properly compute Watterson estimator; fixes #28

terhorst · terhorst · commit c3ff1d92a397 · 2017-05-31T12:13:42.000-07:00
diff --git a/smcpp/analysis.py b/smcpp/analysis.py
@@ -88,8 +88,8 @@ def _load_data(self, files):
             assert c.data.shape[1] == 1 + 3 * len(c.n)
             logger.debug("Contig(pid=%r, fn=%r, n=%r, a=%r)", c.pid, c.fn, c.n, c.a)
         logger.info("%d population%s", self.npop, "" if self.npop == 1 else "s")
-        self._esfs = estimation_tools.empirical_sfs(self._contigs)
-        logger.debug("Empirical CSFS:\n%s", self._esfs)
+        self._watterson = estimation_tools.watterson_estimator(self._contigs)
+        logger.debug("Watterson estimator: %f", self._watterson)
 
     def _validate_data(self):
         for c in self._contigs:
@@ -170,14 +170,11 @@ def _normalize_data(self, length_cutoff, filter):
         self._contigs = new_contigs
 
     def _calculate_t1_tK(self, args):
-        n = 2 + max(self._ns)
-        e_sum = np.sum([e.sum() for e in self._esfs.values()])
-        e_0 = np.sum([e[0, 0] for e in self._esfs.values()])
-        p = (e_sum.astype('float') - e_0) / e_sum
-        Ne = p / self._theta / (2. * (1. / np.arange(1, n)).sum())
+        Ne = self._watterson / self._theta
         logger.debug("Ne: %f", Ne)
         Ne *= 2 * self._N0
         if args.t1 is None:
+            n = 2 + max(self._ns)
             # distribution of first coalescent in sample ~ 1 - exp(-nC2 t / (Ne / N0)) ~= .1
             args.t1 = 200 + np.log(.9) / (-n * (n - 1) / 2) * Ne
         if args.t1 <= 0:
@@ -190,6 +187,9 @@ def _calculate_t1_tK(self, args):
             logger.error("--tK should be >0")
             sys.exit(1)
         logger.debug("setting tK=%f", args.tK)
+        if args.tK <= args.t1:
+            logger.error("tK <= t1? Possible weirdness in data")
+            sys.exit(1)
 
 
     def _init_inference_manager(self, polarization_error):
diff --git a/smcpp/estimation_tools.py b/smcpp/estimation_tools.py
@@ -177,28 +177,29 @@ def f(t):
     return np.array(ret) * 2 * model.N0  # return in generations
 
 
-def empirical_sfs(contigs):
+def watterson_estimator(contigs):
     with mp_pool() as p:
-        esfss = list(map(_esfs_helper, contigs))
-    # Some contigs might be of a smaller sample size. Restrict to those
-    # that are "full dimensional"
-    d = {}
-    for e in esfss:
-        d.setdefault(e.shape, []).append(e)
-    return {k: np.sum(d[k], axis=0) for k in d}
+        num = denom = 0
+        for S, sample_sizes, spans in map(_watterson_helper, contigs):
+            num += S
+            non_missing = sample_sizes > 0
+            ss = sample_sizes[non_missing]
+            sp = spans[non_missing]
+            denom += (sp * (np.log(ss) + 0.5 / ss + 0.57721)).sum()
+    return num / denom
 
 
-def _esfs_helper(contig):
+def _watterson_helper(contig):
     c = contig
     shp = [x + 1 for na in zip(c.a, c.n) for x in na]
     ret = np.zeros(shp, dtype=int)
-    nmiss = np.where(np.all(c.data[:, 1::3] >= 0, axis=1) & 
-                     np.all(c.data[:, 3::3] == c.n, axis=1))
-    for row in c.data[nmiss]:
-        coord = tuple([x for ab in zip(row[1::3], row[2::3]) for x in ab])
-        ret[coord] += row[0]
-    return ret
-
+    spans = c.data[:, 0]
+    seg = (np.any(c.data[:, 1::3] >= 1, axis=1) |
+           np.any(c.data[:, 2::3] >  0, axis=1))
+    S = spans[seg].sum()
+    sample_sizes = (c.data[:, 3::3].sum(axis=1) +
+                    np.maximum(0, c.data[:, 1::3]).sum(axis=1))
+    return (S, sample_sizes, spans)
 
 def _load_data_helper(fn):
     try: