@@ -521,7 +521,7 @@ write_predicted_coverage_curve(const string &outfile,
521521
522522
523523static int
524- lc_extrap (const int argc, const char **argv) {
524+ lc_extrap (const bool pop_size, const int argc, const char **argv) {
525525
526526 try {
527527
@@ -552,16 +552,24 @@ lc_extrap(const int argc, const char **argv) {
552552 bool BAM_FORMAT_INPUT = false ;
553553 size_t MAX_SEGMENT_LENGTH = 5000 ;
554554#endif
555-
556- const string description =
557- " Extrapolate the complexity of a library. This is the approach \
558- described in Daley & Smith (2013). The method applies rational \
559- function approximation via continued fractions with the \
560- original goal of estimating the number of distinct reads that a \
561- sequencing library would yield upon deeper sequencing. This \
562- method has been used for many different purposes since then." ;
555+
556+ string description;
557+ if (!pop_size) {
558+ description =
559+ " Extrapolate the complexity of a library. This is the approach \
560+ described in Daley & Smith (2013). The method applies rational \
561+ function approximation via continued fractions with the \
562+ original goal of estimating the number of distinct reads that a \
563+ sequencing library would yield upon deeper sequencing. This \
564+ method has been used for many different purposes since then." ;
565+ }
566+ else {
567+ description =
568+ " Determine the estimate of the number of unique classes in a library." ;
569+ }
563570
564571 /* ********* GET COMMAND LINE ARGUMENTS FOR LC EXTRAP ***********/
572+
565573 OptionParser opt_parse (strip_path (argv[1 ]), description, " <input-file>" );
566574 opt_parse.add_opt (" output" , ' o' , " yield output file (default: stdout)" ,
567575 false , outfile);
@@ -759,9 +767,25 @@ lc_extrap(const int argc, const char **argv) {
759767 if (VERBOSE)
760768 cerr << " [WRITING OUTPUT]" << endl;
761769
762- write_predicted_complexity_curve (outfile, c_level, step_size,
770+ if (!pop_size) {
771+ write_predicted_complexity_curve (outfile, c_level, step_size,
763772 yield_estimates, yield_lower_ci_lognorm,
764773 yield_upper_ci_lognorm);
774+ }
775+ else {
776+ std::ofstream of;
777+ if (!outfile.empty ()) of.open (outfile.c_str ());
778+ std::ostream out (outfile.empty () ? std::cout.rdbuf () : of.rdbuf ());
779+
780+ out.setf (std::ios_base::fixed, std::ios_base::floatfield);
781+ out.precision (1 );
782+
783+ out << " pop_size_estimate" << ' \t '
784+ << " lower_ci" << ' \t ' << " upper_ci" << endl;
785+ out << yield_estimates.back () << ' \t '
786+ << yield_lower_ci_lognorm.back () << ' \t '
787+ << yield_upper_ci_lognorm.back () << endl;
788+ }
765789 }
766790 }
767791 catch (runtime_error &e) {
@@ -988,10 +1012,12 @@ gc_extrap(const int argc, const char **argv) {
9881012
9891013 if (VERBOSE)
9901014 cerr << " [WRITING OUTPUT]" << endl;
1015+
9911016 write_predicted_coverage_curve (outfile, c_level, base_step_size,
9921017 bin_size, coverage_estimates,
9931018 coverage_lower_ci_lognorm,
9941019 coverage_upper_ci_lognorm);
1020+
9951021 }
9961022 }
9971023 catch (runtime_error &e) {
@@ -1564,261 +1590,6 @@ bound_pop(const int argc, const char **argv) {
15641590 return EXIT_SUCCESS;
15651591}
15661592
1567- static int
1568- pop_size (const int argc, const char **argv) {
1569-
1570- try {
1571-
1572- static const size_t min_required_counts = 4 ;
1573- static const string min_required_counts_error_message =
1574- " max count before zero is less than min required count (" +
1575- to_string (min_required_counts) + " ) duplicates removed" ;
1576-
1577- string outfile;
1578-
1579- size_t orig_max_terms = 100 ;
1580- double max_extrap = 1.0e10 ;
1581- double step_size = 1e6 ;
1582- size_t n_bootstraps = 100 ;
1583- int diagonal = 0 ;
1584- double c_level = 0.95 ;
1585- unsigned long int seed = 408 ;
1586-
1587- /* FLAGS */
1588- bool VERBOSE = false ;
1589- bool VALS_INPUT = false ;
1590- bool PAIRED_END = false ;
1591- bool HIST_INPUT = false ;
1592- bool SINGLE_ESTIMATE = false ;
1593- bool allow_defects = false ;
1594-
1595- #ifdef HAVE_HTSLIB
1596- bool BAM_FORMAT_INPUT = false ;
1597- size_t MAX_SEGMENT_LENGTH = 5000 ;
1598- #endif
1599-
1600- const string description =
1601- " Extrapolate the complexity of a library. This is the approach \
1602- described in Daley & Smith (2013). The method applies rational \
1603- function approximation via continued fractions with the \
1604- original goal of estimating the number of distinct reads that a \
1605- sequencing library would yield upon deeper sequencing. This \
1606- method has been used for many different purposes since then." ;
1607-
1608- /* ********* GET COMMAND LINE ARGUMENTS FOR LC EXTRAP ***********/
1609- OptionParser opt_parse (strip_path (argv[1 ]), description, " <input-file>" );
1610- opt_parse.add_opt (" output" , ' o' , " yield output file (default: stdout)" ,
1611- false , outfile);
1612- opt_parse.add_opt (" extrap" ,' e' ," maximum extrapolation" , false , max_extrap);
1613- opt_parse.add_opt (" step" ,' s' ," extrapolation step size" , false , step_size);
1614- opt_parse.add_opt (" boots" ,' n' ," number of bootstraps" , false , n_bootstraps);
1615- opt_parse.add_opt (" cval" , ' c' , " level for confidence intervals" , false , c_level);
1616- opt_parse.add_opt (" terms" ,' x' ," maximum terms in estimator" , false , orig_max_terms);
1617- opt_parse.add_opt (" verbose" , ' v' , " print more info" , false , VERBOSE);
1618- #ifdef HAVE_HTSLIB
1619- opt_parse.add_opt (" bam" , ' B' , " input is in BAM format" ,
1620- false , BAM_FORMAT_INPUT);
1621- opt_parse.add_opt (" seg_len" , ' l' , " maximum segment length when merging "
1622- " paired end bam reads" ,
1623- false , MAX_SEGMENT_LENGTH);
1624- #endif
1625- opt_parse.add_opt (" pe" , ' P' , " input is paired end read file" ,
1626- false , PAIRED_END);
1627- opt_parse.add_opt (" vals" , ' V' ,
1628- " input is a text file containing only the observed counts" ,
1629- false , VALS_INPUT);
1630- opt_parse.add_opt (" hist" , ' H' ,
1631- " input is a text file containing the observed histogram" ,
1632- false , HIST_INPUT);
1633- opt_parse.add_opt (" quick" , ' Q' ,
1634- " quick mode (no bootstraps) for confidence intervals" ,
1635- false , SINGLE_ESTIMATE);
1636- opt_parse.add_opt (" defects" , ' D' , " no testing for defects" , false , allow_defects);
1637- opt_parse.add_opt (" seed" , ' r' , " seed for random number generator" ,
1638- false , seed);
1639- opt_parse.set_show_defaults ();
1640- vector<string> leftover_args;
1641- // ADS: suspect bug below; "-about" isn't working.
1642- opt_parse.parse (argc-1 , argv+1 , leftover_args);
1643- if (argc == 2 || opt_parse.help_requested ()) {
1644- cerr << opt_parse.help_message () << endl;
1645- return EXIT_SUCCESS;
1646- }
1647- if (opt_parse.about_requested ()) {
1648- cerr << opt_parse.about_message () << endl;
1649- return EXIT_SUCCESS;
1650- }
1651- if (opt_parse.option_missing ()) {
1652- cerr << opt_parse.option_missing_message () << endl;
1653- return EXIT_SUCCESS;
1654- }
1655- if (leftover_args.empty ()) {
1656- cerr << opt_parse.help_message () << endl;
1657- return EXIT_SUCCESS;
1658- }
1659- const string input_file_name = leftover_args.front ();
1660- /* *****************************************************************/
1661-
1662- vector<double > counts_hist;
1663- size_t n_reads = 0 ;
1664-
1665- /* *********** loading input ***************************************/
1666- if (HIST_INPUT) {
1667- if (VERBOSE)
1668- cerr << " HIST_INPUT" << endl;
1669- n_reads = load_histogram (input_file_name, counts_hist);
1670- }
1671- else if (VALS_INPUT) {
1672- if (VERBOSE)
1673- cerr << " VALS_INPUT" << endl;
1674- n_reads = load_counts (input_file_name, counts_hist);
1675- }
1676- #ifdef HAVE_HTSLIB
1677- else if (BAM_FORMAT_INPUT && PAIRED_END) {
1678- if (VERBOSE)
1679- cerr << " PAIRED_END_BAM_INPUT" << endl;
1680- const size_t MAX_READS_TO_HOLD = 5000000 ;
1681- size_t n_paired = 0 ;
1682- size_t n_mates = 0 ;
1683- n_reads = load_counts_BAM_pe (VERBOSE, input_file_name,
1684- MAX_SEGMENT_LENGTH,
1685- MAX_READS_TO_HOLD, n_paired,
1686- n_mates, counts_hist);
1687- if (VERBOSE) {
1688- cerr << " MERGED PAIRED END READS = " << n_paired << endl;
1689- cerr << " MATES PROCESSED = " << n_mates << endl;
1690- }
1691- }
1692- else if (BAM_FORMAT_INPUT) {
1693- if (VERBOSE)
1694- cerr << " BAM_INPUT" << endl;
1695- n_reads = load_counts_BAM_se (input_file_name, counts_hist);
1696- }
1697- #endif
1698- else if (PAIRED_END) {
1699- if (VERBOSE)
1700- cerr << " PAIRED_END_BED_INPUT" << endl;
1701- n_reads = load_counts_BED_pe (input_file_name, counts_hist);
1702- }
1703- else { // default is single end bed file
1704- if (VERBOSE)
1705- cerr << " BED_INPUT" << endl;
1706- n_reads = load_counts_BED_se (input_file_name, counts_hist);
1707- }
1708- /* *********** done loading input **********************************/
1709-
1710- const size_t max_observed_count = counts_hist.size () - 1 ;
1711- const double distinct_reads =
1712- accumulate (begin (counts_hist), end (counts_hist), 0.0 );
1713-
1714- // ENSURE THAT THE MAX TERMS ARE ACCEPTABLE
1715- size_t first_zero = 1 ;
1716- while (first_zero < counts_hist.size () && counts_hist[first_zero] > 0 )
1717- ++first_zero;
1718-
1719- orig_max_terms = min (orig_max_terms, first_zero - 1 );
1720- orig_max_terms = orig_max_terms - (orig_max_terms % 2 == 1 );
1721-
1722- const size_t distinct_counts =
1723- std::count_if (begin (counts_hist), end (counts_hist),
1724- [](const double x) {return x > 0.0 ;});
1725-
1726- if (VERBOSE)
1727- cerr << " TOTAL READS = " << n_reads << endl
1728- << " DISTINCT READS = " << distinct_reads << endl
1729- << " DISTINCT COUNTS = " << distinct_counts << endl
1730- << " MAX COUNT = " << max_observed_count << endl
1731- << " COUNTS OF 1 = " << counts_hist[1 ] << endl
1732- << " MAX TERMS = " << orig_max_terms << endl;
1733-
1734- if (VERBOSE) {
1735- // OUTPUT THE ORIGINAL HISTOGRAM
1736- cerr << " OBSERVED COUNTS (" << counts_hist.size () << " )" << endl;
1737- for (size_t i = 0 ; i < counts_hist.size (); i++)
1738- if (counts_hist[i] > 0 )
1739- cerr << i << ' \t ' << static_cast <size_t >(counts_hist[i]) << endl;
1740- cerr << endl;
1741- }
1742- // check to make sure library is not overly saturated
1743- const double two_fold_extrap = GoodToulmin2xExtrap (counts_hist);
1744- if (two_fold_extrap < 0.0 )
1745- throw runtime_error (" Saturation expected at double initial sample size."
1746- " Unable to extrapolate" );
1747-
1748- // const size_t total_reads = get_counts_from_hist(counts_hist);
1749-
1750- // assert(total_reads == n_reads); // ADS: why commented out?
1751-
1752- // check that min required count is satisfied
1753- if (orig_max_terms < min_required_counts)
1754- throw runtime_error (min_required_counts_error_message);
1755-
1756- if (VERBOSE)
1757- cerr << " [ESTIMATING YIELD CURVE]" << endl;
1758- vector<double > yield_estimates;
1759-
1760- if (SINGLE_ESTIMATE) {
1761-
1762- const bool single_estimate_success =
1763- extrap_single_estimate (VERBOSE, allow_defects, counts_hist, orig_max_terms,
1764- diagonal, step_size, max_extrap, yield_estimates);
1765- // IF FAILURE, EXIT
1766- if (!single_estimate_success)
1767- throw runtime_error (" single estimate failed, run "
1768- " full mode for estimates" );
1769-
1770- std::ofstream of;
1771- if (!outfile.empty ()) of.open (outfile.c_str ());
1772- std::ostream out (outfile.empty () ? std::cout.rdbuf () : of.rdbuf ());
1773-
1774- out << " TOTAL_READS\t EXPECTED_DISTINCT" << endl;
1775- out.setf (std::ios_base::fixed, std::ios_base::floatfield);
1776- out.precision (1 );
1777-
1778- out << 0 << ' \t ' << 0 << endl;
1779- for (size_t i = 0 ; i < yield_estimates.size (); ++i)
1780- out << (i + 1 )*step_size << ' \t ' << yield_estimates[i] << endl;
1781- }
1782- else {
1783- if (VERBOSE)
1784- cerr << " [BOOTSTRAPPING HISTOGRAM]" << endl;
1785-
1786- const size_t max_iter = 100 *n_bootstraps;
1787-
1788- vector<vector <double > > bootstrap_estimates;
1789- extrap_bootstrap (VERBOSE, allow_defects, seed, counts_hist, n_bootstraps,
1790- orig_max_terms, diagonal, step_size, max_extrap,
1791- max_iter, bootstrap_estimates);
1792-
1793- if (VERBOSE)
1794- cerr << " [COMPUTING CONFIDENCE INTERVALS]" << endl;
1795- // yield ci
1796- vector<double > yield_upper_ci_lognorm, yield_lower_ci_lognorm;
1797-
1798- vector_median_and_ci (bootstrap_estimates, c_level, yield_estimates,
1799- yield_lower_ci_lognorm, yield_upper_ci_lognorm);
1800-
1801- // ///////////////////////////////////////////////////////////////////
1802- if (VERBOSE)
1803- cerr << " [WRITING OUTPUT]" << endl;
1804-
1805- write_predicted_complexity_curve (outfile, c_level, step_size,
1806- yield_estimates, yield_lower_ci_lognorm,
1807- yield_upper_ci_lognorm);
1808- }
1809- }
1810- catch (runtime_error &e) {
1811- cerr << " ERROR:\t " << e.what () << endl;
1812- return EXIT_FAILURE;
1813- }
1814- catch (std::bad_alloc &ba) {
1815- cerr << " ERROR: could not allocate memory" << endl;
1816- return EXIT_FAILURE;
1817- }
1818- return EXIT_SUCCESS;
1819- }
1820-
1821-
18221593int
18231594main (const int argc, const char **argv) {
18241595
@@ -1831,14 +1602,14 @@ main(const int argc, const char **argv) {
18311602 " gc_extrap predict genome coverage low input\n "
18321603 " sequencing experiments\n "
18331604 " bound_pop lower bound on population size\n "
1834- " pop_size estimate number of unique species\n " ;
1605+ " pop_size estimate number of unique species\n " ;
18351606
18361607 if (argc < 2 )
18371608 cerr << usage_message << endl;
18381609
18391610 else if (strcmp (argv[1 ], " lc_extrap" ) == 0 ) {
18401611
1841- return lc_extrap (argc, argv);
1612+ return lc_extrap (0 , argc, argv);
18421613
18431614 }
18441615 else if (strcmp (argv[1 ], " c_curve" ) == 0 ) {
@@ -1858,7 +1629,7 @@ main(const int argc, const char **argv) {
18581629 }
18591630 else if (strcmp (argv[1 ], " pop_size" ) == 0 ) {
18601631
1861- return pop_size ( argc, argv);
1632+ return lc_extrap ( 1 , argc, argv);
18621633
18631634 }
18641635 else {
0 commit comments