@@ -527,7 +527,7 @@ write_predicted_coverage_curve(const string &outfile,
 
 
 static int
-lc_extrap(const bool pop_size, const int argc, const char **argv) {
+lc_extrap(const int argc, const char **argv) {
 
   try {
 
@@ -546,12 +546,6 @@ lc_extrap(const bool pop_size, const int argc, const char **argv) {
     double c_level = 0.95;
     unsigned long int seed = 408;
 
-    if (pop_size) {
-      // ADS: extrapolate far, without too many steps...
-      max_extrap = 1.0e20;
-      step_size = 1.0e18;
-    }
-
     /* FLAGS */
     bool VERBOSE = false;
     bool VALS_INPUT = false;
@@ -565,20 +559,12 @@ lc_extrap(const bool pop_size, const int argc, const char **argv) {
     size_t MAX_SEGMENT_LENGTH = 5000;
 #endif
 
-    string description;
-    if (!pop_size) {
-      description =
-        "Extrapolate the complexity of a library. This is the approach \
+    string description = "Extrapolate the complexity of a library. This is the approach \
 described in Daley & Smith (2013). The method applies rational \
 function approximation via continued fractions with the \
 original goal of estimating the number of distinct reads that a \
 sequencing library would yield upon deeper sequencing. This \
 method has been used for many different purposes since then.";
-    }
-    else {
-      description =
-        "Determine the estimate of the number of unique classes in a library.";
-    }
 
     /********* GET COMMAND LINE ARGUMENTS FOR LC EXTRAP ***********/
 
@@ -779,28 +765,9 @@ lc_extrap(const bool pop_size, const int argc, const char **argv) {
       if (VERBOSE)
         cerr << "[WRITING OUTPUT]" << endl;
 
-      if (!pop_size) {
-        write_predicted_complexity_curve(outfile, c_level, step_size,
-                                         yield_estimates, yield_lower_ci_lognorm,
-                                         yield_upper_ci_lognorm);
-      }
-      else {
-        std::ofstream of;
-        if (!outfile.empty()) of.open(outfile.c_str());
-        std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf());
-
-        out.setf(std::ios_base::fixed, std::ios_base::floatfield);
-        out.precision(1);
-
-        out << "pop_size_estimate" << '\t'
-            << "lower_ci" << '\t' << "upper_ci" << endl;
-        out << yield_estimates[yield_estimates.size() - 2] << '\t'
-            << yield_lower_ci_lognorm[yield_estimates.size() - 2] << '\t'
-            << yield_upper_ci_lognorm[yield_estimates.size() - 2] << endl;
-        out << yield_estimates.back() << '\t'
-            << yield_lower_ci_lognorm.back() << '\t'
-            << yield_upper_ci_lognorm.back() << endl;
-      }
+      write_predicted_complexity_curve(outfile, c_level, step_size,
+                                       yield_estimates, yield_lower_ci_lognorm,
+                                       yield_upper_ci_lognorm);
     }
   }
   catch (runtime_error &e) {
@@ -1605,6 +1572,278 @@ bound_pop(const int argc, const char **argv) {
   return EXIT_SUCCESS;
 }
 
+static int
+pop_size(const int argc, const char **argv) {
+
+  try {
+
+    static const size_t min_required_counts = 4;
+    static const string min_required_counts_error_message =
+      "max count before zero is less than min required count (" +
+      to_string(min_required_counts) + ") duplicates removed";
+
+    string outfile;
+
+    size_t orig_max_terms = 100;
+    double max_extrap = 1.0e18;
+    double step_size;
+    // TL: desired number of steps for extrap
+    size_t n_desired_steps = 50;
+    size_t n_bootstraps = 100;
+    int diagonal = 0;
+    double c_level = 0.95;
+    unsigned long int seed = 408;
+
+    /* FLAGS */
+    bool VERBOSE = false;
+    bool VALS_INPUT = false;
+    bool PAIRED_END = false;
+    bool HIST_INPUT = false;
+    bool SINGLE_ESTIMATE = false;
+    bool allow_defects = false;
+
+#ifdef HAVE_HTSLIB
+    bool BAM_FORMAT_INPUT = false;
+    size_t MAX_SEGMENT_LENGTH = 5000;
+#endif
+
+    string description = "Extrapolate the complexity of a library. This is the approach \
+described in Daley & Smith (2013). The method applies rational \
+function approximation via continued fractions with the \
+original goal of estimating the number of distinct reads that a \
+sequencing library would yield upon deeper sequencing. This \
+method has been used for many different purposes since then.";
+
+    /********* GET COMMAND LINE ARGUMENTS FOR LC EXTRAP ***********/
+
+    OptionParser opt_parse(strip_path(argv[1]), description, "<input-file>");
+    opt_parse.add_opt("output", 'o', "yield output file (default: stdout)",
+                      false, outfile);
+    opt_parse.add_opt("extrap", 'e', "maximum extrapolation", false, max_extrap);
+    opt_parse.add_opt("desired_steps", 's', "desired number of steps", false, n_desired_steps);
+    opt_parse.add_opt("boots", 'n', "number of bootstraps", false, n_bootstraps);
+    opt_parse.add_opt("cval", 'c', "level for confidence intervals", false, c_level);
+    opt_parse.add_opt("terms", 'x', "maximum terms in estimator", false, orig_max_terms);
+    opt_parse.add_opt("verbose", 'v', "print more info", false, VERBOSE);
+#ifdef HAVE_HTSLIB
+    opt_parse.add_opt("bam", 'B', "input is in BAM format",
+                      false, BAM_FORMAT_INPUT);
+    opt_parse.add_opt("seg_len", 'l', "maximum segment length when merging "
+                      "paired end bam reads",
+                      false, MAX_SEGMENT_LENGTH);
+#endif
+    opt_parse.add_opt("pe", 'P', "input is paired end read file",
+                      false, PAIRED_END);
+    opt_parse.add_opt("vals", 'V',
+                      "input is a text file containing only the observed counts",
+                      false, VALS_INPUT);
+    opt_parse.add_opt("hist", 'H',
+                      "input is a text file containing the observed histogram",
+                      false, HIST_INPUT);
+    opt_parse.add_opt("quick", 'Q',
+                      "quick mode (no bootstraps) for confidence intervals",
+                      false, SINGLE_ESTIMATE);
+    opt_parse.add_opt("defects", 'D', "no testing for defects", false, allow_defects);
+    opt_parse.add_opt("seed", 'r', "seed for random number generator",
+                      false, seed);
+    opt_parse.set_show_defaults();
+    vector<string> leftover_args;
+    // ADS: suspect bug below; "-about" isn't working.
+    opt_parse.parse(argc-1, argv+1, leftover_args);
+    if (argc == 2 || opt_parse.help_requested()) {
+      cerr << opt_parse.help_message() << endl;
+      return EXIT_SUCCESS;
+    }
+    if (opt_parse.about_requested()) {
+      cerr << opt_parse.about_message() << endl;
+      return EXIT_SUCCESS;
+    }
+    if (opt_parse.option_missing()) {
+      cerr << opt_parse.option_missing_message() << endl;
+      return EXIT_SUCCESS;
+    }
+    if (leftover_args.empty()) {
+      cerr << opt_parse.help_message() << endl;
+      return EXIT_SUCCESS;
+    }
+    const string input_file_name = leftover_args.front();
+    /******************************************************************/
+
+    vector<double> counts_hist;
+    size_t n_reads = 0;
+
+    /*********** loading input ***************************************/
+    if (HIST_INPUT) {
+      if (VERBOSE)
+        cerr << "HIST_INPUT" << endl;
+      n_reads = load_histogram(input_file_name, counts_hist);
+    }
+    else if (VALS_INPUT) {
+      if (VERBOSE)
+        cerr << "VALS_INPUT" << endl;
+      n_reads = load_counts(input_file_name, counts_hist);
+    }
+#ifdef HAVE_HTSLIB
+    else if (BAM_FORMAT_INPUT && PAIRED_END) {
+      if (VERBOSE)
+        cerr << "PAIRED_END_BAM_INPUT" << endl;
+      const size_t MAX_READS_TO_HOLD = 5000000;
+      size_t n_paired = 0;
+      size_t n_mates = 0;
+      n_reads = load_counts_BAM_pe(VERBOSE, input_file_name,
+                                   MAX_SEGMENT_LENGTH,
+                                   MAX_READS_TO_HOLD, n_paired,
+                                   n_mates, counts_hist);
+      if (VERBOSE) {
+        cerr << "MERGED PAIRED END READS = " << n_paired << endl;
+        cerr << "MATES PROCESSED = " << n_mates << endl;
+      }
+    }
+    else if (BAM_FORMAT_INPUT) {
+      if (VERBOSE)
+        cerr << "BAM_INPUT" << endl;
+      n_reads = load_counts_BAM_se(input_file_name, counts_hist);
+    }
+#endif
+    else if (PAIRED_END) {
+      if (VERBOSE)
+        cerr << "PAIRED_END_BED_INPUT" << endl;
+      n_reads = load_counts_BED_pe(input_file_name, counts_hist);
+    }
+    else { // default is single end bed file
+      if (VERBOSE)
+        cerr << "BED_INPUT" << endl;
+      n_reads = load_counts_BED_se(input_file_name, counts_hist);
+    }
+    /*********** done loading input **********************************/
+
+    const size_t max_observed_count = counts_hist.size() - 1;
+    const double distinct_reads =
+      accumulate(begin(counts_hist), end(counts_hist), 0.0);
+
+    // ENSURE THAT THE MAX TERMS ARE ACCEPTABLE
+    size_t first_zero = 1;
+    while (first_zero < counts_hist.size() && counts_hist[first_zero] > 0)
+      ++first_zero;
+
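+    // limit the terms to the initial run of nonzero counts (first_zero - 1)
+    // and drop one term if needed so an even number of terms is used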
+    orig_max_terms = min(orig_max_terms, first_zero - 1);
+    orig_max_terms = orig_max_terms - (orig_max_terms % 2 == 1);
+
+    const size_t distinct_counts =
+      std::count_if(begin(counts_hist), end(counts_hist),
+                    [](const double x) {return x > 0.0;});
+
+    if (VERBOSE)
+      cerr << "TOTAL READS = " << n_reads << endl
+           << "DISTINCT READS = " << distinct_reads << endl
+           << "DISTINCT COUNTS = " << distinct_counts << endl
+           << "MAX COUNT = " << max_observed_count << endl
+           << "COUNTS OF 1 = " << counts_hist[1] << endl
+           << "MAX TERMS = " << orig_max_terms << endl;
+
+    if (VERBOSE) {
+      // OUTPUT THE ORIGINAL HISTOGRAM
+      cerr << "OBSERVED COUNTS (" << counts_hist.size() << ")" << endl;
+      for (size_t i = 0; i < counts_hist.size(); i++)
+        if (counts_hist[i] > 0)
+          cerr << i << '\t' << static_cast<size_t>(counts_hist[i]) << endl;
+      cerr << endl;
+    }
+
+    // check to make sure library is not overly saturated
+    const double two_fold_extrap = GoodToulmin2xExtrap(counts_hist);
+    if (two_fold_extrap < 0.0)
+      throw runtime_error("Saturation expected at double initial sample size."
+                          " Unable to extrapolate");
+
+    // const size_t total_reads = get_counts_from_hist(counts_hist);
+
+    // assert(total_reads == n_reads); // ADS: why commented out?
+
+    // check that min required count is satisfied
+    if (orig_max_terms < min_required_counts)
+      throw runtime_error(min_required_counts_error_message);
+
+    if (VERBOSE)
+      cerr << "[ESTIMATING YIELD CURVE]" << endl;
+
+    // TL: determine step size based on initial counts.
+
+    step_size = (max_extrap - distinct_reads) / n_desired_steps;
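+    // with max_extrap defaulting to 1e18, this spreads n_desired_steps
+    // extrapolation points between the observed distinct count and
+    // max_extrap; the last point, far beyond any realistic sequencing
+    // depth, is what gets reported below as the population size estimate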
+
+    //
+    vector<double> yield_estimates;
+
+    if (SINGLE_ESTIMATE) {
+
+      const bool single_estimate_success =
+        extrap_single_estimate(VERBOSE, allow_defects, counts_hist, orig_max_terms,
+                               diagonal, step_size, max_extrap, yield_estimates);
+      // IF FAILURE, EXIT
+      if (!single_estimate_success)
+        throw runtime_error("single estimate failed, run "
+                            "full mode for estimates");
+
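+      // send output to the requested file, or to stdout if none was given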
+      std::ofstream of;
+      if (!outfile.empty()) of.open(outfile.c_str());
+      std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf());
+
+      out << "TOTAL_READS\tEXPECTED_DISTINCT" << endl;
+      out.setf(std::ios_base::fixed, std::ios_base::floatfield);
+      out.precision(1);
+
+      out << 0 << '\t' << 0 << endl;
+      for (size_t i = 0; i < yield_estimates.size(); ++i)
+        out << (i + 1)*step_size << '\t' << yield_estimates[i] << endl;
+    }
+    else {
+      if (VERBOSE)
+        cerr << "[BOOTSTRAPPING HISTOGRAM]" << endl;
+
+      const size_t max_iter = 100*n_bootstraps;
+
+      vector<vector<double> > bootstrap_estimates;
+      extrap_bootstrap(VERBOSE, allow_defects, seed, counts_hist, n_bootstraps,
+                       orig_max_terms, diagonal, step_size, max_extrap,
+                       max_iter, bootstrap_estimates);
+
+      if (VERBOSE)
+        cerr << "[COMPUTING CONFIDENCE INTERVALS]" << endl;
+      // yield ci
+      vector<double> yield_upper_ci_lognorm, yield_lower_ci_lognorm;
+
+      vector_median_and_ci(bootstrap_estimates, c_level, yield_estimates,
+                           yield_lower_ci_lognorm, yield_upper_ci_lognorm);
+
+      /////////////////////////////////////////////////////////////////////
+      if (VERBOSE)
+        cerr << "[WRITING OUTPUT]" << endl;
+
+      std::ofstream of;
+      if (!outfile.empty()) of.open(outfile.c_str());
+      std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf());
+
+      out.setf(std::ios_base::fixed, std::ios_base::floatfield);
+      out.precision(1);
+
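+      // the last yield estimate corresponds to the largest extrapolation
+      // point (max_extrap) and is reported as the population size estimate,
+      // together with its bootstrap confidence interval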
+      out << "pop_size_estimate" << '\t'
+          << "lower_ci" << '\t' << "upper_ci" << endl;
+      out << yield_estimates.back() << '\t'
+          << yield_lower_ci_lognorm.back() << '\t'
+          << yield_upper_ci_lognorm.back() << endl;
+    }
+  }
+  catch (runtime_error &e) {
+    cerr << "ERROR:\t" << e.what() << endl;
+    return EXIT_FAILURE;
+  }
+  catch (std::bad_alloc &ba) {
+    cerr << "ERROR: could not allocate memory" << endl;
+    return EXIT_FAILURE;
+  }
+  return EXIT_SUCCESS;
+}
+
 int
 main(const int argc, const char **argv) {
 
@@ -1624,7 +1863,7 @@ main(const int argc, const char **argv) {
 
   else if (strcmp(argv[1], "lc_extrap") == 0) {
 
-    return lc_extrap(0, argc, argv);
+    return lc_extrap(argc, argv);
 
   }
   else if (strcmp(argv[1], "c_curve") == 0) {
@@ -1644,7 +1883,7 @@ main(const int argc, const char **argv) {
   }
   else if (strcmp(argv[1], "pop_size") == 0) {
 
-    return lc_extrap(1, argc, argv);
+    return pop_size(argc, argv);
 
   }
   else {