Commit 2c5eb86 (parent 0a9a44d)

Edited pop_size to be its own module. Changed max_extrap, and step size is now based on the initial number of distinct reads. Removed the break statement from vector_median_and_ci.
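The main numerical change is the extrapolation grid. The old code path hard-coded max_extrap = 1.0e20 and step_size = 1.0e18 inside lc_extrap whenever the pop_size flag was set; the new pop_size module defaults to max_extrap = 1.0e18 and derives the step from the data as step_size = (max_extrap - distinct_reads) / n_desired_steps. The snippet below is a minimal standalone sketch of that rule, not preseq source; the histogram values are invented purely for illustration.

#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  // counts_hist[i] = number of distinct reads seen exactly i times (toy values).
  const std::vector<double> counts_hist{0.0, 5.0e6, 1.2e6, 3.0e5};

  // "Initial distinct reads" is the sum over the counts histogram.
  const double distinct_reads =
    std::accumulate(counts_hist.begin(), counts_hist.end(), 0.0);

  const double max_extrap = 1.0e18;        // new default for pop_size (-e)
  const std::size_t n_desired_steps = 50;  // new default for pop_size (-s)

  // The step now depends on how many distinct reads were already observed.
  const double step_size = (max_extrap - distinct_reads) / n_desired_steps;

  std::cout << "distinct reads = " << distinct_reads << '\n'
            << "step size      = " << step_size << '\n';
  return 0;
}

With the defaults, the extrapolation always uses n_desired_steps points ending at max_extrap, with the step derived from the observed library rather than fixed at 1.0e18.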

File tree

1 file changed: +279, -40 lines


src/preseq.cpp

Lines changed: 279 additions & 40 deletions
@@ -527,7 +527,7 @@ write_predicted_coverage_curve(const string &outfile,
 
 
 static int
-lc_extrap(const bool pop_size, const int argc, const char **argv) {
+lc_extrap(const int argc, const char **argv) {
 
   try {
 
@@ -546,12 +546,6 @@ lc_extrap(const bool pop_size, const int argc, const char **argv) {
     double c_level = 0.95;
     unsigned long int seed = 408;
 
-    if (pop_size) {
-      // ADS: extrapolate far, without too many steps...
-      max_extrap = 1.0e20;
-      step_size = 1.0e18;
-    }
-
     /* FLAGS */
     bool VERBOSE = false;
     bool VALS_INPUT = false;
@@ -565,20 +559,12 @@ lc_extrap(const bool pop_size, const int argc, const char **argv) {
     size_t MAX_SEGMENT_LENGTH = 5000;
 #endif
 
-    string description;
-    if (!pop_size) {
-      description =
-        "Extrapolate the complexity of a library. This is the approach \
+    string description = "Extrapolate the complexity of a library. This is the approach \
 described in Daley & Smith (2013). The method applies rational \
 function approximation via continued fractions with the \
 original goal of estimating the number of distinct reads that a \
 sequencing library would yield upon deeper sequencing. This \
 method has been used for many different purposes since then.";
-    }
-    else {
-      description =
-        "Determine the estimate of the number of unique classes in a library.";
-    }
 
     /********** GET COMMAND LINE ARGUMENTS FOR LC EXTRAP ***********/
 
@@ -779,28 +765,9 @@ lc_extrap(const bool pop_size, const int argc, const char **argv) {
       if (VERBOSE)
         cerr << "[WRITING OUTPUT]" << endl;
 
-      if (!pop_size) {
-        write_predicted_complexity_curve(outfile, c_level, step_size,
-                                         yield_estimates, yield_lower_ci_lognorm,
-                                         yield_upper_ci_lognorm);
-      }
-      else {
-        std::ofstream of;
-        if (!outfile.empty()) of.open(outfile.c_str());
-        std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf());
-
-        out.setf(std::ios_base::fixed, std::ios_base::floatfield);
-        out.precision(1);
-
-        out << "pop_size_estimate" << '\t'
-            << "lower_ci" << '\t' << "upper_ci" << endl;
-        out << yield_estimates[yield_estimates.size() - 2] << '\t'
-            << yield_lower_ci_lognorm[yield_estimates.size() - 2] << '\t'
-            << yield_upper_ci_lognorm[yield_estimates.size() - 2] << endl;
-        out << yield_estimates.back() << '\t'
-            << yield_lower_ci_lognorm.back() << '\t'
-            << yield_upper_ci_lognorm.back() << endl;
-      }
+      write_predicted_complexity_curve(outfile, c_level, step_size,
+                                       yield_estimates, yield_lower_ci_lognorm,
+                                       yield_upper_ci_lognorm);
     }
   }
   catch (runtime_error &e) {
 
@@ -1605,6 +1572,278 @@ bound_pop(const int argc, const char **argv) {
   return EXIT_SUCCESS;
 }
 
+static int
+pop_size(const int argc, const char **argv) {
+
+  try {
+
+    static const size_t min_required_counts = 4;
+    static const string min_required_counts_error_message =
+      "max count before zero is less than min required count (" +
+      to_string(min_required_counts) + ") duplicates removed";
+
+    string outfile;
+
+    size_t orig_max_terms = 100;
+    double max_extrap = 1.0e18;
+    double step_size;
+    // TL: desired number of steps for extrap
+    size_t n_desired_steps = 50;
+    size_t n_bootstraps = 100;
+    int diagonal = 0;
+    double c_level = 0.95;
+    unsigned long int seed = 408;
+
+    /* FLAGS */
+    bool VERBOSE = false;
+    bool VALS_INPUT = false;
+    bool PAIRED_END = false;
+    bool HIST_INPUT = false;
+    bool SINGLE_ESTIMATE = false;
+    bool allow_defects = false;
+
+#ifdef HAVE_HTSLIB
+    bool BAM_FORMAT_INPUT = false;
+    size_t MAX_SEGMENT_LENGTH = 5000;
+#endif
+
+    string description = "Extrapolate the complexity of a library. This is the approach \
+described in Daley & Smith (2013). The method applies rational \
+function approximation via continued fractions with the \
+original goal of estimating the number of distinct reads that a \
+sequencing library would yield upon deeper sequencing. This \
+method has been used for many different purposes since then.";
+
+    /********** GET COMMAND LINE ARGUMENTS FOR LC EXTRAP ***********/
+
+    OptionParser opt_parse(strip_path(argv[1]), description, "<input-file>");
+    opt_parse.add_opt("output", 'o', "yield output file (default: stdout)",
+                      false , outfile);
+    opt_parse.add_opt("extrap",'e',"maximum extrapolation", false, max_extrap);
+    opt_parse.add_opt("desired_steps",'s',"desired number of steps", false, n_desired_steps);
+    opt_parse.add_opt("boots",'n',"number of bootstraps", false, n_bootstraps);
+    opt_parse.add_opt("cval", 'c', "level for confidence intervals", false, c_level);
+    opt_parse.add_opt("terms",'x',"maximum terms in estimator", false, orig_max_terms);
+    opt_parse.add_opt("verbose", 'v', "print more info", false, VERBOSE);
+#ifdef HAVE_HTSLIB
+    opt_parse.add_opt("bam", 'B', "input is in BAM format",
+                      false, BAM_FORMAT_INPUT);
+    opt_parse.add_opt("seg_len", 'l', "maximum segment length when merging "
+                      "paired end bam reads",
+                      false, MAX_SEGMENT_LENGTH);
+#endif
+    opt_parse.add_opt("pe", 'P', "input is paired end read file",
+                      false, PAIRED_END);
+    opt_parse.add_opt("vals", 'V',
+                      "input is a text file containing only the observed counts",
+                      false, VALS_INPUT);
+    opt_parse.add_opt("hist", 'H',
+                      "input is a text file containing the observed histogram",
+                      false, HIST_INPUT);
+    opt_parse.add_opt("quick", 'Q',
+                      "quick mode (no bootstraps) for confidence intervals",
+                      false, SINGLE_ESTIMATE);
+    opt_parse.add_opt("defects", 'D', "no testing for defects", false, allow_defects);
+    opt_parse.add_opt("seed", 'r', "seed for random number generator",
+                      false, seed);
+    opt_parse.set_show_defaults();
+    vector<string> leftover_args;
+    // ADS: suspect bug below; "-about" isn't working.
+    opt_parse.parse(argc-1, argv+1, leftover_args);
+    if (argc == 2 || opt_parse.help_requested()) {
+      cerr << opt_parse.help_message() << endl;
+      return EXIT_SUCCESS;
+    }
+    if (opt_parse.about_requested()) {
+      cerr << opt_parse.about_message() << endl;
+      return EXIT_SUCCESS;
+    }
+    if (opt_parse.option_missing()) {
+      cerr << opt_parse.option_missing_message() << endl;
+      return EXIT_SUCCESS;
+    }
+    if (leftover_args.empty()) {
+      cerr << opt_parse.help_message() << endl;
+      return EXIT_SUCCESS;
+    }
+    const string input_file_name = leftover_args.front();
+    /******************************************************************/
+
+    vector<double> counts_hist;
+    size_t n_reads = 0;
+
+    /************ loading input ***************************************/
+    if (HIST_INPUT) {
+      if (VERBOSE)
+        cerr << "HIST_INPUT" << endl;
+      n_reads = load_histogram(input_file_name, counts_hist);
+    }
+    else if (VALS_INPUT) {
+      if (VERBOSE)
+        cerr << "VALS_INPUT" << endl;
+      n_reads = load_counts(input_file_name, counts_hist);
+    }
+#ifdef HAVE_HTSLIB
+    else if (BAM_FORMAT_INPUT && PAIRED_END) {
+      if (VERBOSE)
+        cerr << "PAIRED_END_BAM_INPUT" << endl;
+      const size_t MAX_READS_TO_HOLD = 5000000;
+      size_t n_paired = 0;
+      size_t n_mates = 0;
+      n_reads = load_counts_BAM_pe(VERBOSE, input_file_name,
+                                   MAX_SEGMENT_LENGTH,
+                                   MAX_READS_TO_HOLD, n_paired,
+                                   n_mates, counts_hist);
+      if (VERBOSE) {
+        cerr << "MERGED PAIRED END READS = " << n_paired << endl;
+        cerr << "MATES PROCESSED = " << n_mates << endl;
+      }
+    }
+    else if (BAM_FORMAT_INPUT) {
+      if (VERBOSE)
+        cerr << "BAM_INPUT" << endl;
+      n_reads = load_counts_BAM_se(input_file_name, counts_hist);
+    }
+#endif
+    else if (PAIRED_END) {
+      if (VERBOSE)
+        cerr << "PAIRED_END_BED_INPUT" << endl;
+      n_reads = load_counts_BED_pe(input_file_name, counts_hist);
+    }
+    else { // default is single end bed file
+      if (VERBOSE)
+        cerr << "BED_INPUT" << endl;
+      n_reads = load_counts_BED_se(input_file_name, counts_hist);
+    }
+    /************ done loading input **********************************/
+
+    const size_t max_observed_count = counts_hist.size() - 1;
+    const double distinct_reads =
+      accumulate(begin(counts_hist), end(counts_hist), 0.0);
+
+    // ENSURE THAT THE MAX TERMS ARE ACCEPTABLE
+    size_t first_zero = 1;
+    while (first_zero < counts_hist.size() && counts_hist[first_zero] > 0)
+      ++first_zero;
+
+    orig_max_terms = min(orig_max_terms, first_zero - 1);
+    orig_max_terms = orig_max_terms - (orig_max_terms % 2 == 1);
+
+    const size_t distinct_counts =
+      std::count_if(begin(counts_hist), end(counts_hist),
+                    [](const double x) {return x > 0.0;});
+
+    if (VERBOSE)
+      cerr << "TOTAL READS = " << n_reads << endl
+           << "DISTINCT READS = " << distinct_reads << endl
+           << "DISTINCT COUNTS = " << distinct_counts << endl
+           << "MAX COUNT = " << max_observed_count << endl
+           << "COUNTS OF 1 = " << counts_hist[1] << endl
+           << "MAX TERMS = " << orig_max_terms << endl;
+
+    if (VERBOSE) {
+      // OUTPUT THE ORIGINAL HISTOGRAM
+      cerr << "OBSERVED COUNTS (" << counts_hist.size() << ")" << endl;
+      for (size_t i = 0; i < counts_hist.size(); i++)
+        if (counts_hist[i] > 0)
+          cerr << i << '\t' << static_cast<size_t>(counts_hist[i]) << endl;
+      cerr << endl;
+    }
+
+    // check to make sure library is not overly saturated
+    const double two_fold_extrap = GoodToulmin2xExtrap(counts_hist);
+    if (two_fold_extrap < 0.0)
+      throw runtime_error("Saturation expected at double initial sample size."
+                          " Unable to extrapolate");
+
+    // const size_t total_reads = get_counts_from_hist(counts_hist);
+
+    //assert(total_reads == n_reads); // ADS: why commented out?
+
+    // check that min required count is satisfied
+    if (orig_max_terms < min_required_counts)
+      throw runtime_error(min_required_counts_error_message);
+
+    if (VERBOSE)
+      cerr << "[ESTIMATING YIELD CURVE]" << endl;
+
+    // TL: determine step size based on initial counts.
+
+    step_size = (max_extrap - distinct_reads) / n_desired_steps;
+
+    //
+    vector<double> yield_estimates;
+
+    if (SINGLE_ESTIMATE) {
+
+      const bool single_estimate_success =
+        extrap_single_estimate(VERBOSE, allow_defects, counts_hist, orig_max_terms,
+                               diagonal, step_size, max_extrap, yield_estimates);
+      // IF FAILURE, EXIT
+      if (!single_estimate_success)
+        throw runtime_error("single estimate failed, run "
+                            "full mode for estimates");
+
+      std::ofstream of;
+      if (!outfile.empty()) of.open(outfile.c_str());
+      std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf());
+
+      out << "TOTAL_READS\tEXPECTED_DISTINCT" << endl;
+      out.setf(std::ios_base::fixed, std::ios_base::floatfield);
+      out.precision(1);
+
+      out << 0 << '\t' << 0 << endl;
+      for (size_t i = 0; i < yield_estimates.size(); ++i)
+        out << (i + 1)*step_size << '\t' << yield_estimates[i] << endl;
+    }
+    else {
+      if (VERBOSE)
+        cerr << "[BOOTSTRAPPING HISTOGRAM]" << endl;
+
+      const size_t max_iter = 100*n_bootstraps;
+
+      vector<vector <double> > bootstrap_estimates;
+      extrap_bootstrap(VERBOSE, allow_defects, seed, counts_hist, n_bootstraps,
+                       orig_max_terms, diagonal, step_size, max_extrap,
+                       max_iter, bootstrap_estimates);
+
+      if (VERBOSE)
+        cerr << "[COMPUTING CONFIDENCE INTERVALS]" << endl;
+      // yield ci
+      vector<double> yield_upper_ci_lognorm, yield_lower_ci_lognorm;
+
+      vector_median_and_ci(bootstrap_estimates, c_level, yield_estimates,
+                           yield_lower_ci_lognorm, yield_upper_ci_lognorm);
+
+      /////////////////////////////////////////////////////////////////////
+      if (VERBOSE)
+        cerr << "[WRITING OUTPUT]" << endl;
+
+      std::ofstream of;
+      if (!outfile.empty()) of.open(outfile.c_str());
+      std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf());
+
+      out.setf(std::ios_base::fixed, std::ios_base::floatfield);
+      out.precision(1);
+
+      out << "pop_size_estimate" << '\t'
+          << "lower_ci" << '\t' << "upper_ci" << endl;
+      out << yield_estimates.back() << '\t'
+          << yield_lower_ci_lognorm.back() << '\t'
+          << yield_upper_ci_lognorm.back() << endl;
+    }
+  }
+  catch (runtime_error &e) {
+    cerr << "ERROR:\t" << e.what() << endl;
+    return EXIT_FAILURE;
+  }
+  catch (std::bad_alloc &ba) {
+    cerr << "ERROR: could not allocate memory" << endl;
+    return EXIT_FAILURE;
+  }
+  return EXIT_SUCCESS;
+}
+
 
 int
 main(const int argc, const char **argv) {
 
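Both output paths in the new function use the same write-to-file-or-stdout idiom: an std::ofstream is opened only when an output file was requested, and a plain std::ostream is constructed on whichever stream buffer applies, so the formatting code never branches on the destination. Below is a self-contained sketch of that idiom; the report values are dummies, not output from preseq.

#include <fstream>
#include <iostream>
#include <string>

// Write a small tab-separated report either to 'outfile' or, if the name is
// empty, to standard output -- mirroring the idiom used in pop_size above.
static void write_report(const std::string &outfile) {
  std::ofstream of;
  if (!outfile.empty()) of.open(outfile.c_str());
  std::ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf());

  out.setf(std::ios_base::fixed, std::ios_base::floatfield);
  out.precision(1);
  out << "pop_size_estimate" << '\t' << "lower_ci" << '\t' << "upper_ci" << '\n';
  out << 1234567.8 << '\t' << 1200000.0 << '\t' << 1270000.0 << '\n';  // dummy values
}

int main() {
  write_report("");  // empty name: the dummy report goes to stdout
  return 0;
}

Because only the final element of yield_estimates (the value at max_extrap) is reported along with its confidence bounds, pop_size prints a single row rather than the full yield curve that lc_extrap writes.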

@@ -1624,7 +1863,7 @@ main(const int argc, const char **argv) {
 
   else if (strcmp(argv[1], "lc_extrap") == 0) {
 
-    return lc_extrap(0, argc, argv);
+    return lc_extrap(argc, argv);
 
   }
   else if (strcmp(argv[1], "c_curve") == 0) {
 
@@ -1644,7 +1883,7 @@ main(const int argc, const char **argv) {
   }
   else if (strcmp(argv[1], "pop_size") == 0) {
 
-    return lc_extrap(1, argc, argv);
+    return pop_size(argc, argv);
 
   }
   else {
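After this change the subcommand dispatch in main is uniform: every recognized argv[1] maps to a function taking only (argc, argv), and pop_size no longer rides on lc_extrap behind a boolean flag. The sketch below reproduces just that dispatch shape; lc_extrap and pop_size here are stubs standing in for the real implementations in src/preseq.cpp, and the usage text is illustrative only.

#include <cstdlib>
#include <cstring>
#include <iostream>

// Stubs standing in for the real preseq subcommand implementations.
static int lc_extrap(const int, const char **) { return EXIT_SUCCESS; }
static int pop_size(const int, const char **) { return EXIT_SUCCESS; }

int main(const int argc, const char **argv) {
  if (argc < 2) {
    std::cerr << "usage: preseq <command> [options] <input-file>" << std::endl;
    return EXIT_FAILURE;
  }
  if (std::strcmp(argv[1], "lc_extrap") == 0)
    return lc_extrap(argc, argv);  // no pop_size flag any more
  if (std::strcmp(argv[1], "pop_size") == 0)
    return pop_size(argc, argv);   // dedicated module added in this commit
  std::cerr << "unknown command: " << argv[1] << std::endl;
  return EXIT_FAILURE;
}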
