Skip to content

Commit d201df8

Browse files
committed
Merged the pop_size module into lc_extrap. A flag is passed into lc_extrap to indicate the module type, and the corresponding default description message and output format are modified accordingly.
1 parent 7a0206e commit d201df8

File tree

1 file changed

+39
-268
lines changed

1 file changed

+39
-268
lines changed

src/preseq.cpp

Lines changed: 39 additions & 268 deletions
Original file line numberDiff line numberDiff line change
@@ -521,7 +521,7 @@ write_predicted_coverage_curve(const string &outfile,
521521

522522

523523
static int
524-
lc_extrap(const int argc, const char **argv) {
524+
lc_extrap(const bool pop_size, const int argc, const char **argv) {
525525

526526
try {
527527

@@ -552,16 +552,24 @@ lc_extrap(const int argc, const char **argv) {
552552
bool BAM_FORMAT_INPUT = false;
553553
size_t MAX_SEGMENT_LENGTH = 5000;
554554
#endif
555-
556-
const string description =
557-
"Extrapolate the complexity of a library. This is the approach \
558-
described in Daley & Smith (2013). The method applies rational \
559-
function approximation via continued fractions with the \
560-
original goal of estimating the number of distinct reads that a \
561-
sequencing library would yield upon deeper sequencing. This \
562-
method has been used for many different purposes since then.";
555+
556+
string description;
557+
if (!pop_size) {
558+
description =
559+
"Extrapolate the complexity of a library. This is the approach \
560+
described in Daley & Smith (2013). The method applies rational \
561+
function approximation via continued fractions with the \
562+
original goal of estimating the number of distinct reads that a \
563+
sequencing library would yield upon deeper sequencing. This \
564+
method has been used for many different purposes since then.";
565+
}
566+
else {
567+
description =
568+
"Determine the estimate of the number of unique classes in a library.";
569+
}
563570

564571
/********** GET COMMAND LINE ARGUMENTS FOR LC EXTRAP ***********/
572+
565573
OptionParser opt_parse(strip_path(argv[1]), description, "<input-file>");
566574
opt_parse.add_opt("output", 'o', "yield output file (default: stdout)",
567575
false , outfile);
@@ -759,9 +767,25 @@ lc_extrap(const int argc, const char **argv) {
759767
if (VERBOSE)
760768
cerr << "[WRITING OUTPUT]" << endl;
761769

762-
write_predicted_complexity_curve(outfile, c_level, step_size,
770+
if (!pop_size) {
771+
write_predicted_complexity_curve(outfile, c_level, step_size,
763772
yield_estimates, yield_lower_ci_lognorm,
764773
yield_upper_ci_lognorm);
774+
}
775+
else {
776+
std::ofstream of;
777+
if (!outfile.empty()) of.open(outfile.c_str());
778+
std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf());
779+
780+
out.setf(std::ios_base::fixed, std::ios_base::floatfield);
781+
out.precision(1);
782+
783+
out << "pop_size_estimate" << '\t'
784+
<< "lower_ci" << '\t' << "upper_ci" << endl;
785+
out << yield_estimates.back() << '\t'
786+
<< yield_lower_ci_lognorm.back() << '\t'
787+
<< yield_upper_ci_lognorm.back() << endl;
788+
}
765789
}
766790
}
767791
catch (runtime_error &e) {
@@ -988,10 +1012,12 @@ gc_extrap(const int argc, const char **argv) {
9881012

9891013
if (VERBOSE)
9901014
cerr << "[WRITING OUTPUT]" << endl;
1015+
9911016
write_predicted_coverage_curve(outfile, c_level, base_step_size,
9921017
bin_size, coverage_estimates,
9931018
coverage_lower_ci_lognorm,
9941019
coverage_upper_ci_lognorm);
1020+
9951021
}
9961022
}
9971023
catch (runtime_error &e) {
@@ -1564,261 +1590,6 @@ bound_pop(const int argc, const char **argv) {
15641590
return EXIT_SUCCESS;
15651591
}
15661592

1567-
static int
1568-
pop_size(const int argc, const char **argv) {
1569-
1570-
try {
1571-
1572-
static const size_t min_required_counts = 4;
1573-
static const string min_required_counts_error_message =
1574-
"max count before zero is less than min required count (" +
1575-
to_string(min_required_counts) + ") duplicates removed";
1576-
1577-
string outfile;
1578-
1579-
size_t orig_max_terms = 100;
1580-
double max_extrap = 1.0e10;
1581-
double step_size = 1e6;
1582-
size_t n_bootstraps = 100;
1583-
int diagonal = 0;
1584-
double c_level = 0.95;
1585-
unsigned long int seed = 408;
1586-
1587-
/* FLAGS */
1588-
bool VERBOSE = false;
1589-
bool VALS_INPUT = false;
1590-
bool PAIRED_END = false;
1591-
bool HIST_INPUT = false;
1592-
bool SINGLE_ESTIMATE = false;
1593-
bool allow_defects = false;
1594-
1595-
#ifdef HAVE_HTSLIB
1596-
bool BAM_FORMAT_INPUT = false;
1597-
size_t MAX_SEGMENT_LENGTH = 5000;
1598-
#endif
1599-
1600-
const string description =
1601-
"Extrapolate the complexity of a library. This is the approach \
1602-
described in Daley & Smith (2013). The method applies rational \
1603-
function approximation via continued fractions with the \
1604-
original goal of estimating the number of distinct reads that a \
1605-
sequencing library would yield upon deeper sequencing. This \
1606-
method has been used for many different purposes since then.";
1607-
1608-
/********** GET COMMAND LINE ARGUMENTS FOR LC EXTRAP ***********/
1609-
OptionParser opt_parse(strip_path(argv[1]), description, "<input-file>");
1610-
opt_parse.add_opt("output", 'o', "yield output file (default: stdout)",
1611-
false , outfile);
1612-
opt_parse.add_opt("extrap",'e',"maximum extrapolation", false, max_extrap);
1613-
opt_parse.add_opt("step",'s',"extrapolation step size", false, step_size);
1614-
opt_parse.add_opt("boots",'n',"number of bootstraps", false, n_bootstraps);
1615-
opt_parse.add_opt("cval", 'c', "level for confidence intervals", false, c_level);
1616-
opt_parse.add_opt("terms",'x',"maximum terms in estimator", false, orig_max_terms);
1617-
opt_parse.add_opt("verbose", 'v', "print more info", false, VERBOSE);
1618-
#ifdef HAVE_HTSLIB
1619-
opt_parse.add_opt("bam", 'B', "input is in BAM format",
1620-
false, BAM_FORMAT_INPUT);
1621-
opt_parse.add_opt("seg_len", 'l', "maximum segment length when merging "
1622-
"paired end bam reads",
1623-
false, MAX_SEGMENT_LENGTH);
1624-
#endif
1625-
opt_parse.add_opt("pe", 'P', "input is paired end read file",
1626-
false, PAIRED_END);
1627-
opt_parse.add_opt("vals", 'V',
1628-
"input is a text file containing only the observed counts",
1629-
false, VALS_INPUT);
1630-
opt_parse.add_opt("hist", 'H',
1631-
"input is a text file containing the observed histogram",
1632-
false, HIST_INPUT);
1633-
opt_parse.add_opt("quick", 'Q',
1634-
"quick mode (no bootstraps) for confidence intervals",
1635-
false, SINGLE_ESTIMATE);
1636-
opt_parse.add_opt("defects", 'D', "no testing for defects", false, allow_defects);
1637-
opt_parse.add_opt("seed", 'r', "seed for random number generator",
1638-
false, seed);
1639-
opt_parse.set_show_defaults();
1640-
vector<string> leftover_args;
1641-
// ADS: suspect bug below; "-about" isn't working.
1642-
opt_parse.parse(argc-1, argv+1, leftover_args);
1643-
if (argc == 2 || opt_parse.help_requested()) {
1644-
cerr << opt_parse.help_message() << endl;
1645-
return EXIT_SUCCESS;
1646-
}
1647-
if (opt_parse.about_requested()) {
1648-
cerr << opt_parse.about_message() << endl;
1649-
return EXIT_SUCCESS;
1650-
}
1651-
if (opt_parse.option_missing()) {
1652-
cerr << opt_parse.option_missing_message() << endl;
1653-
return EXIT_SUCCESS;
1654-
}
1655-
if (leftover_args.empty()) {
1656-
cerr << opt_parse.help_message() << endl;
1657-
return EXIT_SUCCESS;
1658-
}
1659-
const string input_file_name = leftover_args.front();
1660-
/******************************************************************/
1661-
1662-
vector<double> counts_hist;
1663-
size_t n_reads = 0;
1664-
1665-
/************ loading input ***************************************/
1666-
if (HIST_INPUT) {
1667-
if (VERBOSE)
1668-
cerr << "HIST_INPUT" << endl;
1669-
n_reads = load_histogram(input_file_name, counts_hist);
1670-
}
1671-
else if (VALS_INPUT) {
1672-
if (VERBOSE)
1673-
cerr << "VALS_INPUT" << endl;
1674-
n_reads = load_counts(input_file_name, counts_hist);
1675-
}
1676-
#ifdef HAVE_HTSLIB
1677-
else if (BAM_FORMAT_INPUT && PAIRED_END) {
1678-
if (VERBOSE)
1679-
cerr << "PAIRED_END_BAM_INPUT" << endl;
1680-
const size_t MAX_READS_TO_HOLD = 5000000;
1681-
size_t n_paired = 0;
1682-
size_t n_mates = 0;
1683-
n_reads = load_counts_BAM_pe(VERBOSE, input_file_name,
1684-
MAX_SEGMENT_LENGTH,
1685-
MAX_READS_TO_HOLD, n_paired,
1686-
n_mates, counts_hist);
1687-
if (VERBOSE) {
1688-
cerr << "MERGED PAIRED END READS = " << n_paired << endl;
1689-
cerr << "MATES PROCESSED = " << n_mates << endl;
1690-
}
1691-
}
1692-
else if (BAM_FORMAT_INPUT) {
1693-
if (VERBOSE)
1694-
cerr << "BAM_INPUT" << endl;
1695-
n_reads = load_counts_BAM_se(input_file_name, counts_hist);
1696-
}
1697-
#endif
1698-
else if (PAIRED_END) {
1699-
if (VERBOSE)
1700-
cerr << "PAIRED_END_BED_INPUT" << endl;
1701-
n_reads = load_counts_BED_pe(input_file_name, counts_hist);
1702-
}
1703-
else { // default is single end bed file
1704-
if (VERBOSE)
1705-
cerr << "BED_INPUT" << endl;
1706-
n_reads = load_counts_BED_se(input_file_name, counts_hist);
1707-
}
1708-
/************ done loading input **********************************/
1709-
1710-
const size_t max_observed_count = counts_hist.size() - 1;
1711-
const double distinct_reads =
1712-
accumulate(begin(counts_hist), end(counts_hist), 0.0);
1713-
1714-
// ENSURE THAT THE MAX TERMS ARE ACCEPTABLE
1715-
size_t first_zero = 1;
1716-
while (first_zero < counts_hist.size() && counts_hist[first_zero] > 0)
1717-
++first_zero;
1718-
1719-
orig_max_terms = min(orig_max_terms, first_zero - 1);
1720-
orig_max_terms = orig_max_terms - (orig_max_terms % 2 == 1);
1721-
1722-
const size_t distinct_counts =
1723-
std::count_if(begin(counts_hist), end(counts_hist),
1724-
[](const double x) {return x > 0.0;});
1725-
1726-
if (VERBOSE)
1727-
cerr << "TOTAL READS = " << n_reads << endl
1728-
<< "DISTINCT READS = " << distinct_reads << endl
1729-
<< "DISTINCT COUNTS = " << distinct_counts << endl
1730-
<< "MAX COUNT = " << max_observed_count << endl
1731-
<< "COUNTS OF 1 = " << counts_hist[1] << endl
1732-
<< "MAX TERMS = " << orig_max_terms << endl;
1733-
1734-
if (VERBOSE) {
1735-
// OUTPUT THE ORIGINAL HISTOGRAM
1736-
cerr << "OBSERVED COUNTS (" << counts_hist.size() << ")" << endl;
1737-
for (size_t i = 0; i < counts_hist.size(); i++)
1738-
if (counts_hist[i] > 0)
1739-
cerr << i << '\t' << static_cast<size_t>(counts_hist[i]) << endl;
1740-
cerr << endl;
1741-
}
1742-
// check to make sure library is not overly saturated
1743-
const double two_fold_extrap = GoodToulmin2xExtrap(counts_hist);
1744-
if (two_fold_extrap < 0.0)
1745-
throw runtime_error("Saturation expected at double initial sample size."
1746-
" Unable to extrapolate");
1747-
1748-
// const size_t total_reads = get_counts_from_hist(counts_hist);
1749-
1750-
//assert(total_reads == n_reads); // ADS: why commented out?
1751-
1752-
// check that min required count is satisfied
1753-
if (orig_max_terms < min_required_counts)
1754-
throw runtime_error(min_required_counts_error_message);
1755-
1756-
if (VERBOSE)
1757-
cerr << "[ESTIMATING YIELD CURVE]" << endl;
1758-
vector<double> yield_estimates;
1759-
1760-
if (SINGLE_ESTIMATE) {
1761-
1762-
const bool single_estimate_success =
1763-
extrap_single_estimate(VERBOSE, allow_defects, counts_hist, orig_max_terms,
1764-
diagonal, step_size, max_extrap, yield_estimates);
1765-
// IF FAILURE, EXIT
1766-
if (!single_estimate_success)
1767-
throw runtime_error("single estimate failed, run "
1768-
"full mode for estimates");
1769-
1770-
std::ofstream of;
1771-
if (!outfile.empty()) of.open(outfile.c_str());
1772-
std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf());
1773-
1774-
out << "TOTAL_READS\tEXPECTED_DISTINCT" << endl;
1775-
out.setf(std::ios_base::fixed, std::ios_base::floatfield);
1776-
out.precision(1);
1777-
1778-
out << 0 << '\t' << 0 << endl;
1779-
for (size_t i = 0; i < yield_estimates.size(); ++i)
1780-
out << (i + 1)*step_size << '\t' << yield_estimates[i] << endl;
1781-
}
1782-
else {
1783-
if (VERBOSE)
1784-
cerr << "[BOOTSTRAPPING HISTOGRAM]" << endl;
1785-
1786-
const size_t max_iter = 100*n_bootstraps;
1787-
1788-
vector<vector <double> > bootstrap_estimates;
1789-
extrap_bootstrap(VERBOSE, allow_defects, seed, counts_hist, n_bootstraps,
1790-
orig_max_terms, diagonal, step_size, max_extrap,
1791-
max_iter, bootstrap_estimates);
1792-
1793-
if (VERBOSE)
1794-
cerr << "[COMPUTING CONFIDENCE INTERVALS]" << endl;
1795-
// yield ci
1796-
vector<double> yield_upper_ci_lognorm, yield_lower_ci_lognorm;
1797-
1798-
vector_median_and_ci(bootstrap_estimates, c_level, yield_estimates,
1799-
yield_lower_ci_lognorm, yield_upper_ci_lognorm);
1800-
1801-
/////////////////////////////////////////////////////////////////////
1802-
if (VERBOSE)
1803-
cerr << "[WRITING OUTPUT]" << endl;
1804-
1805-
write_predicted_complexity_curve(outfile, c_level, step_size,
1806-
yield_estimates, yield_lower_ci_lognorm,
1807-
yield_upper_ci_lognorm);
1808-
}
1809-
}
1810-
catch (runtime_error &e) {
1811-
cerr << "ERROR:\t" << e.what() << endl;
1812-
return EXIT_FAILURE;
1813-
}
1814-
catch (std::bad_alloc &ba) {
1815-
cerr << "ERROR: could not allocate memory" << endl;
1816-
return EXIT_FAILURE;
1817-
}
1818-
return EXIT_SUCCESS;
1819-
}
1820-
1821-
18221593
int
18231594
main(const int argc, const char **argv) {
18241595

@@ -1831,14 +1602,14 @@ main(const int argc, const char **argv) {
18311602
" gc_extrap predict genome coverage low input\n"
18321603
" sequencing experiments\n"
18331604
" bound_pop lower bound on population size\n"
1834-
" pop_size estimate number of unique species\n";
1605+
" pop_size estimate number of unique species\n";
18351606

18361607
if (argc < 2)
18371608
cerr << usage_message << endl;
18381609

18391610
else if (strcmp(argv[1], "lc_extrap") == 0) {
18401611

1841-
return lc_extrap(argc, argv);
1612+
return lc_extrap(0, argc, argv);
18421613

18431614
}
18441615
else if (strcmp(argv[1], "c_curve") == 0) {
@@ -1858,7 +1629,7 @@ main(const int argc, const char **argv) {
18581629
}
18591630
else if (strcmp(argv[1], "pop_size") == 0) {
18601631

1861-
return pop_size(argc, argv);
1632+
return lc_extrap(1, argc, argv);
18621633

18631634
}
18641635
else {

0 commit comments

Comments (0)