Skip to content

Commit cbf38af

Browse files
Changed defaults for pop_size to assume we have seen one in a billion of the population already. Works for all available examples.
1 parent d87d1ca commit cbf38af

File tree

1 file changed

+29
-14
lines changed

1 file changed

+29
-14
lines changed

src/preseq.cpp

Lines changed: 29 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -309,6 +309,9 @@ extrap_bootstrap(const bool VERBOSE, const bool allow_defects,
309309

310310
for (size_t iter = 0; (iter < max_iter && bootstrap_estimates.size() < n_bootstraps); ++iter) {
311311

312+
if (VERBOSE && iter > 0 && iter % 72 == 0)
313+
cerr << endl; // bootstrap success progress only 72 char wide
314+
312315
vector<double> yield_vector;
313316
vector<double> hist;
314317
resample_hist(rng, orig_hist_distinct_counts, distinct_orig_hist, hist);
@@ -1578,8 +1581,8 @@ pop_size(const int argc, const char **argv) {
15781581
string outfile;
15791582

15801583
size_t orig_max_terms = 100;
1581-
double max_extrap = 1.0e18;
1582-
double step_size;
1584+
double max_extrap = 0.0;
1585+
double step_size = 0.0;
15831586
// TL: desired number of steps for extrap
15841587
size_t n_desired_steps = 50;
15851588
size_t n_bootstraps = 100;
@@ -1600,23 +1603,23 @@ pop_size(const int argc, const char **argv) {
16001603
size_t MAX_SEGMENT_LENGTH = 5000;
16011604
#endif
16021605

1603-
string description = "Extrapolate the complexity of a library. This is the approach \
1604-
described in Daley & Smith (2013). The method applies rational \
1605-
function approximation via continued fractions with the \
1606-
original goal of estimating the number of distinct reads that a \
1607-
sequencing library would yield upon deeper sequencing. This \
1608-
method has been used for many different purposes since then.";
1606+
const string description =
1607+
"Estimate the total population size using the approach described in "
1608+
"Daley & Smith (2013), extrapolating to very long range. Default "
1609+
"parameters assume that the initial sample represents at least"
1610+
"1e-9 of the population, which is sufficient for every example "
1611+
"application we have seen.";
16091612

16101613
/********** GET COMMAND LINE ARGUMENTS FOR LC EXTRAP ***********/
16111614

16121615
OptionParser opt_parse(strip_path(argv[1]), description, "<input-file>");
16131616
opt_parse.add_opt("output", 'o', "yield output file (default: stdout)",
16141617
false , outfile);
16151618
opt_parse.add_opt("extrap",'e',"maximum extrapolation", false, max_extrap);
1616-
opt_parse.add_opt("desired_steps",'s',"desired number of steps", false, n_desired_steps);
1619+
opt_parse.add_opt("steps",'s',"number of steps", false, n_desired_steps);
16171620
opt_parse.add_opt("boots",'n',"number of bootstraps", false, n_bootstraps);
16181621
opt_parse.add_opt("cval", 'c', "level for confidence intervals", false, c_level);
1619-
opt_parse.add_opt("terms",'x',"maximum terms in estimator", false, orig_max_terms);
1622+
opt_parse.add_opt("terms", 'x', "maximum terms in estimator", false, orig_max_terms);
16201623
opt_parse.add_opt("verbose", 'v', "print more info", false, VERBOSE);
16211624
#ifdef HAVE_HTSLIB
16221625
opt_parse.add_opt("bam", 'B', "input is in BAM format",
@@ -1722,6 +1725,11 @@ pop_size(const int argc, const char **argv) {
17221725
orig_max_terms = min(orig_max_terms, first_zero - 1);
17231726
orig_max_terms = orig_max_terms - (orig_max_terms % 2 == 1);
17241727

1728+
if (max_extrap < 1.0)
1729+
max_extrap = 1000000000*distinct_reads;
1730+
if (step_size < 1.0)
1731+
step_size = (max_extrap - distinct_reads)/n_desired_steps;
1732+
17251733
const size_t distinct_counts =
17261734
std::count_if(begin(counts_hist), end(counts_hist),
17271735
[](const double x) {return x > 0.0;});
@@ -1761,9 +1769,6 @@ pop_size(const int argc, const char **argv) {
17611769
cerr << "[ESTIMATING YIELD CURVE]" << endl;
17621770

17631771
// TL: determine step size based on initial counts.
1764-
1765-
step_size = (max_extrap - distinct_reads) / n_desired_steps;
1766-
17671772
//
17681773
vector<double> yield_estimates;
17691774

@@ -1819,11 +1824,21 @@ pop_size(const int argc, const char **argv) {
18191824
out.setf(std::ios_base::fixed, std::ios_base::floatfield);
18201825
out.precision(1);
18211826

1827+
const size_t n_ests = yield_estimates.size() - 1;
1828+
if (n_ests < 2)
1829+
throw runtime_error("problem with number of estimates in pop_size");
1830+
1831+
const bool converged =
1832+
(yield_estimates[n_ests] - yield_estimates[n_ests-1] < 1.0);
1833+
18221834
out << "pop_size_estimate" << '\t'
18231835
<< "lower_ci" << '\t' << "upper_ci" << endl;
18241836
out << yield_estimates.back() << '\t'
18251837
<< yield_lower_ci_lognorm.back() << '\t'
1826-
<< yield_upper_ci_lognorm.back() << endl;
1838+
<< yield_upper_ci_lognorm.back();
1839+
if (!converged)
1840+
out << "\tnot_converged";
1841+
out << endl;
18271842
}
18281843
}
18291844
catch (runtime_error &e) {

0 commit comments

Comments
 (0)