Skip to content

Commit c3ff1d9

Browse files
committed
[core] use MCLE to properly compute watterson estimator; fixes #28
1 parent dfad592 commit c3ff1d9

File tree

2 files changed

+24
-23
lines changed

2 files changed

+24
-23
lines changed

smcpp/analysis.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -88,8 +88,8 @@ def _load_data(self, files):
8888
assert c.data.shape[1] == 1 + 3 * len(c.n)
8989
logger.debug("Contig(pid=%r, fn=%r, n=%r, a=%r)", c.pid, c.fn, c.n, c.a)
9090
logger.info("%d population%s", self.npop, "" if self.npop == 1 else "s")
91-
self._esfs = estimation_tools.empirical_sfs(self._contigs)
92-
logger.debug("Empirical CSFS:\n%s", self._esfs)
91+
self._watterson = estimation_tools.watterson_estimator(self._contigs)
92+
logger.debug("Watterson estimator: %f", self._watterson)
9393

9494
def _validate_data(self):
9595
for c in self._contigs:
@@ -170,14 +170,11 @@ def _normalize_data(self, length_cutoff, filter):
170170
self._contigs = new_contigs
171171

172172
def _calculate_t1_tK(self, args):
173-
n = 2 + max(self._ns)
174-
e_sum = np.sum([e.sum() for e in self._esfs.values()])
175-
e_0 = np.sum([e[0, 0] for e in self._esfs.values()])
176-
p = (e_sum.astype('float') - e_0) / e_sum
177-
Ne = p / self._theta / (2. * (1. / np.arange(1, n)).sum())
173+
Ne = self._watterson / self._theta
178174
logger.debug("Ne: %f", Ne)
179175
Ne *= 2 * self._N0
180176
if args.t1 is None:
177+
n = 2 + max(self._ns)
181178
# distribution of first coalescent in sample ~ 1 - exp(-nC2 t / (Ne / N0)) ~= .1
182179
args.t1 = 200 + np.log(.9) / (-n * (n - 1) / 2) * Ne
183180
if args.t1 <= 0:
@@ -190,6 +187,9 @@ def _calculate_t1_tK(self, args):
190187
logger.error("--tK should be >0")
191188
sys.exit(1)
192189
logger.debug("setting tK=%f", args.tK)
190+
if args.tK <= args.t1:
191+
logger.error("tK <= t1? Possible weirdness in data")
192+
sys.exit(1)
193193

194194

195195
def _init_inference_manager(self, polarization_error):

smcpp/estimation_tools.py

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -177,28 +177,29 @@ def f(t):
177177
return np.array(ret) * 2 * model.N0 # return in generations
178178

179179

180-
def empirical_sfs(contigs):
180+
def watterson_estimator(contigs):
181181
with mp_pool() as p:
182-
esfss = list(map(_esfs_helper, contigs))
183-
# Some contigs might be of a smaller sample size. Restrict to those
184-
# that are "full dimensional"
185-
d = {}
186-
for e in esfss:
187-
d.setdefault(e.shape, []).append(e)
188-
return {k: np.sum(d[k], axis=0) for k in d}
182+
num = denom = 0
183+
for S, sample_sizes, spans in map(_watterson_helper, contigs):
184+
num += S
185+
non_missing = sample_sizes > 0
186+
ss = sample_sizes[non_missing]
187+
sp = spans[non_missing]
188+
denom += (sp * (np.log(ss) + 0.5 / ss + 0.57721)).sum()
189+
return num / denom
189190

190191

191-
def _esfs_helper(contig):
192+
def _watterson_helper(contig):
192193
c = contig
193194
shp = [x + 1 for na in zip(c.a, c.n) for x in na]
194195
ret = np.zeros(shp, dtype=int)
195-
nmiss = np.where(np.all(c.data[:, 1::3] >= 0, axis=1) &
196-
np.all(c.data[:, 3::3] == c.n, axis=1))
197-
for row in c.data[nmiss]:
198-
coord = tuple([x for ab in zip(row[1::3], row[2::3]) for x in ab])
199-
ret[coord] += row[0]
200-
return ret
201-
196+
spans = c.data[:, 0]
197+
seg = (np.any(c.data[:, 1::3] >= 1, axis=1) |
198+
np.any(c.data[:, 2::3] > 0, axis=1))
199+
S = spans[seg].sum()
200+
sample_sizes = (c.data[:, 3::3].sum(axis=1) +
201+
np.maximum(0, c.data[:, 1::3]).sum(axis=1))
202+
return (S, sample_sizes, spans)
202203

203204
def _load_data_helper(fn):
204205
try:

0 commit comments

Comments
 (0)