@@ -309,6 +309,9 @@ extrap_bootstrap(const bool VERBOSE, const bool allow_defects,
309309
310310 for (size_t iter = 0 ; (iter < max_iter && bootstrap_estimates.size () < n_bootstraps); ++iter) {
311311
312+ if (VERBOSE && iter > 0 && iter % 72 == 0 )
313+ cerr << endl; // bootstrap success progress only 72 char wide
314+
312315 vector<double > yield_vector;
313316 vector<double > hist;
314317 resample_hist (rng, orig_hist_distinct_counts, distinct_orig_hist, hist);
@@ -1578,8 +1581,8 @@ pop_size(const int argc, const char **argv) {
15781581 string outfile;
15791582
15801583 size_t orig_max_terms = 100 ;
1581- double max_extrap = 1.0e18 ;
1582- double step_size;
1584+ double max_extrap = 0.0 ;
1585+ double step_size = 0.0 ;
15831586 // TL: desired number of steps for extrap
15841587 size_t n_desired_steps = 50 ;
15851588 size_t n_bootstraps = 100 ;
@@ -1600,23 +1603,23 @@ pop_size(const int argc, const char **argv) {
16001603 size_t MAX_SEGMENT_LENGTH = 5000 ;
16011604#endif
16021605
1603- string description = " Extrapolate the complexity of a library. This is the approach \
1604- described in Daley & Smith (2013). The method applies rational \
1605- function approximation via continued fractions with the \
1606- original goal of estimating the number of distinct reads that a \
1607- sequencing library would yield upon deeper sequencing. This \
1608- method has been used for many different purposes since then ." ;
1606+ const string description =
1607+ " Estimate the total population size using the approach described in "
1608+ " Daley & Smith (2013), extrapolating to very long range. Default "
1609+ " parameters assume that the initial sample represents at least "
1610+ " 1e-9 of the population, which is sufficient for every example "
1611+ " application we have seen ." ;
16091612
16101613 /* ********* GET COMMAND LINE ARGUMENTS FOR LC EXTRAP ***********/
16111614
16121615 OptionParser opt_parse (strip_path (argv[1 ]), description, " <input-file>" );
16131616 opt_parse.add_opt (" output" , ' o' , " yield output file (default: stdout)" ,
16141617 false , outfile);
16151618 opt_parse.add_opt (" extrap" ,' e' ," maximum extrapolation" , false , max_extrap);
1616- opt_parse.add_opt (" desired_steps " ,' s' ," desired number of steps" , false , n_desired_steps);
1619+ opt_parse.add_opt (" steps " ,' s' ," number of steps" , false , n_desired_steps);
16171620 opt_parse.add_opt (" boots" ,' n' ," number of bootstraps" , false , n_bootstraps);
16181621 opt_parse.add_opt (" cval" , ' c' , " level for confidence intervals" , false , c_level);
1619- opt_parse.add_opt (" terms" ,' x' ," maximum terms in estimator" , false , orig_max_terms);
1622+ opt_parse.add_opt (" terms" , ' x' , " maximum terms in estimator" , false , orig_max_terms);
16201623 opt_parse.add_opt (" verbose" , ' v' , " print more info" , false , VERBOSE);
16211624#ifdef HAVE_HTSLIB
16221625 opt_parse.add_opt (" bam" , ' B' , " input is in BAM format" ,
@@ -1722,6 +1725,11 @@ pop_size(const int argc, const char **argv) {
17221725 orig_max_terms = min (orig_max_terms, first_zero - 1 );
17231726 orig_max_terms = orig_max_terms - (orig_max_terms % 2 == 1 );
17241727
1728+ if (max_extrap < 1.0 )
1729+ max_extrap = 1000000000 *distinct_reads;
1730+ if (step_size < 1.0 )
1731+ step_size = (max_extrap - distinct_reads)/n_desired_steps;
1732+
17251733 const size_t distinct_counts =
17261734 std::count_if (begin (counts_hist), end (counts_hist),
17271735 [](const double x) {return x > 0.0 ;});
@@ -1761,9 +1769,6 @@ pop_size(const int argc, const char **argv) {
17611769 cerr << " [ESTIMATING YIELD CURVE]" << endl;
17621770
17631771 // TL: determine step size based on initial counts.
1764-
1765- step_size = (max_extrap - distinct_reads) / n_desired_steps;
1766-
17671772 //
17681773 vector<double > yield_estimates;
17691774
@@ -1819,11 +1824,21 @@ pop_size(const int argc, const char **argv) {
18191824 out.setf (std::ios_base::fixed, std::ios_base::floatfield);
18201825 out.precision (1 );
18211826
1827+ const size_t n_ests = yield_estimates.size () - 1 ;
1828+ if (n_ests < 2 )
1829+ throw runtime_error (" problem with number of estimates in pop_size" );
1830+
1831+ const bool converged =
1832+ (yield_estimates[n_ests] - yield_estimates[n_ests-1 ] < 1.0 );
1833+
18221834 out << " pop_size_estimate" << ' \t '
18231835 << " lower_ci" << ' \t ' << " upper_ci" << endl;
18241836 out << yield_estimates.back () << ' \t '
18251837 << yield_lower_ci_lognorm.back () << ' \t '
1826- << yield_upper_ci_lognorm.back () << endl;
1838+ << yield_upper_ci_lognorm.back ();
1839+ if (!converged)
1840+ out << " \t not_converged" ;
1841+ out << endl;
18271842 }
18281843 }
18291844 catch (runtime_error &e) {
0 commit comments