Skip to content

Commit 6e4a080

Browse files
Adding the pop_size module
1 parent f374ba7 commit 6e4a080

File tree

1 file changed

+248
-10
lines changed

1 file changed

+248
-10
lines changed

src/preseq.cpp

Lines changed: 248 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -86,19 +86,19 @@ median_from_sorted_vector (const vector<T> sorted_data,
8686
}
8787

8888
template<typename T> T
89-
quantile_from_sorted_vector (const vector<T> sorted_data,
90-
const size_t stride, const size_t n,
91-
const double f) {
89+
quantile_from_sorted_vector (const vector<T> sorted_data,
90+
const size_t stride, const size_t n,
91+
const double f) {
9292
const double index = f * (n - 1);
9393
const size_t lhs = (int)index;
9494
const double delta = index - lhs;
9595

9696
if (n == 0 || sorted_data.empty()) return 0.0;
9797

9898
if (lhs == n - 1) return sorted_data[lhs * stride];
99-
100-
return (1 - delta) * sorted_data[lhs * stride]
101-
+ delta * sorted_data[(lhs + 1) * stride];
99+
100+
return (1 - delta) * sorted_data[lhs * stride]
101+
+ delta * sorted_data[(lhs + 1) * stride];
102102
}
103103

104104
// Confidence interval stuff
@@ -202,8 +202,8 @@ factorial (double x) {
202202

203203
double Ag = lanczos[0];
204204

205-
for (size_t k=1; k < lanczos.size(); k++)
206-
Ag += lanczos[k] / (x + k);
205+
for (size_t k=1; k < lanczos.size(); k++)
206+
Ag += lanczos[k] / (x + k);
207207

208208
double term1 = (x + 0.5) * log((x + 7.5) / Euler);
209209
double term2 = LogRootTwoPi + log(Ag);
@@ -229,7 +229,7 @@ resample_hist(std::mt19937 &gen, const vector<size_t> &vals_hist_distinct_counts
229229

230230
unsigned int distinct =
231231
accumulate(begin(distinct_counts_hist), end(distinct_counts_hist), 0.0);
232-
232+
233233
multinomial(gen, distinct_counts_hist, distinct,
234234
sample_distinct_counts_hist);
235235

@@ -781,6 +781,239 @@ lc_extrap(const int argc, const char **argv) {
781781
return EXIT_SUCCESS;
782782
}
783783

784+
////////////////////////////////////////////////////////////////////////
785+
// POP_SIZE
786+
static int
787+
pop_size(const int argc, const char **argv) {
788+
789+
try {
790+
791+
static const size_t min_required_counts = 4;
792+
static const string min_required_counts_error_message =
793+
"max count before zero is less than min required count (" +
794+
to_string(min_required_counts) + ") duplicates removed";
795+
796+
string outfile;
797+
798+
size_t orig_max_terms = 100;
799+
double max_extrap = 1.0e10;
800+
size_t n_bootstraps = 100;
801+
int diagonal = 0;
802+
double c_level = 0.95;
803+
unsigned long int seed = 408;
804+
805+
/* FLAGS */
806+
bool VERBOSE = false;
807+
bool VALS_INPUT = false;
808+
bool PAIRED_END = false;
809+
bool HIST_INPUT = false;
810+
bool SINGLE_ESTIMATE = false;
811+
bool allow_defects = false;
812+
813+
#ifdef HAVE_HTSLIB
814+
bool BAM_FORMAT_INPUT = false;
815+
size_t MAX_SEGMENT_LENGTH = 5000;
816+
#endif
817+
818+
const string description =
819+
"Needs writing";
820+
821+
/********** GET COMMAND LINE ARGUMENTS FOR LC EXTRAP ***********/
822+
OptionParser opt_parse(strip_path(argv[1]), description, "<input-file>");
823+
opt_parse.add_opt("output", 'o', "output file (default: stdout)",
824+
false , outfile);
825+
opt_parse.add_opt("extrap",'e',"maximum extrapolation", false, max_extrap);
826+
opt_parse.add_opt("boots",'n',"number of bootstraps", false, n_bootstraps);
827+
opt_parse.add_opt("cval", 'c', "level for confidence intervals", false, c_level);
828+
opt_parse.add_opt("terms",'x',"maximum terms in estimator", false, orig_max_terms);
829+
opt_parse.add_opt("verbose", 'v', "print more info", false, VERBOSE);
830+
#ifdef HAVE_HTSLIB
831+
opt_parse.add_opt("bam", 'B', "input is in BAM format",
832+
false, BAM_FORMAT_INPUT);
833+
opt_parse.add_opt("seg_len", 'l', "maximum segment length when merging "
834+
"paired end bam reads",
835+
false, MAX_SEGMENT_LENGTH);
836+
#endif
837+
opt_parse.add_opt("pe", 'P', "input is paired end read file",
838+
false, PAIRED_END);
839+
opt_parse.add_opt("vals", 'V',
840+
"input is a text file containing only the observed counts",
841+
false, VALS_INPUT);
842+
opt_parse.add_opt("hist", 'H',
843+
"input is a text file containing the observed histogram",
844+
false, HIST_INPUT);
845+
opt_parse.add_opt("quick", 'Q',
846+
"quick mode (no bootstraps) for confidence intervals",
847+
false, SINGLE_ESTIMATE);
848+
opt_parse.add_opt("defects", 'D', "no testing for defects", false, allow_defects);
849+
opt_parse.add_opt("seed", 'r', "seed for random number generator",
850+
false, seed);
851+
opt_parse.set_show_defaults();
852+
vector<string> leftover_args;
853+
854+
// ADS: suspect bug below; "-about" isn't working.
855+
opt_parse.parse(argc-1, argv+1, leftover_args);
856+
if (argc == 2 || opt_parse.help_requested()) {
857+
cerr << opt_parse.help_message() << endl;
858+
return EXIT_SUCCESS;
859+
}
860+
if (opt_parse.about_requested()) {
861+
cerr << opt_parse.about_message() << endl;
862+
return EXIT_SUCCESS;
863+
}
864+
if (opt_parse.option_missing()) {
865+
cerr << opt_parse.option_missing_message() << endl;
866+
return EXIT_SUCCESS;
867+
}
868+
if (leftover_args.empty()) {
869+
cerr << opt_parse.help_message() << endl;
870+
return EXIT_SUCCESS;
871+
}
872+
const string input_file_name = leftover_args.front();
873+
/******************************************************************/
874+
875+
throw runtime_error("NOT YET IMPLEMENTED");
876+
877+
vector<double> counts_hist;
878+
size_t n_reads = 0;
879+
880+
/************ loading input ***************************************/
881+
if (HIST_INPUT) {
882+
if (VERBOSE)
883+
cerr << "HIST_INPUT" << endl;
884+
n_reads = load_histogram(input_file_name, counts_hist);
885+
}
886+
else if (VALS_INPUT) {
887+
if (VERBOSE)
888+
cerr << "VALS_INPUT" << endl;
889+
n_reads = load_counts(input_file_name, counts_hist);
890+
}
891+
#ifdef HAVE_HTSLIB
892+
else if (BAM_FORMAT_INPUT && PAIRED_END) {
893+
if (VERBOSE)
894+
cerr << "PAIRED_END_BAM_INPUT" << endl;
895+
const size_t MAX_READS_TO_HOLD = 5000000;
896+
size_t n_paired = 0;
897+
size_t n_mates = 0;
898+
n_reads = load_counts_BAM_pe(VERBOSE, input_file_name,
899+
MAX_SEGMENT_LENGTH,
900+
MAX_READS_TO_HOLD, n_paired,
901+
n_mates, counts_hist);
902+
if (VERBOSE) {
903+
cerr << "MERGED PAIRED END READS = " << n_paired << endl;
904+
cerr << "MATES PROCESSED = " << n_mates << endl;
905+
}
906+
}
907+
else if (BAM_FORMAT_INPUT) {
908+
if (VERBOSE)
909+
cerr << "BAM_INPUT" << endl;
910+
n_reads = load_counts_BAM_se(input_file_name, counts_hist);
911+
}
912+
#endif
913+
else if (PAIRED_END) {
914+
if (VERBOSE)
915+
cerr << "PAIRED_END_BED_INPUT" << endl;
916+
n_reads = load_counts_BED_pe(input_file_name, counts_hist);
917+
}
918+
else { // default is single end bed file
919+
if (VERBOSE)
920+
cerr << "BED_INPUT" << endl;
921+
n_reads = load_counts_BED_se(input_file_name, counts_hist);
922+
}
923+
/************ done loading input **********************************/
924+
925+
const size_t max_observed_count = counts_hist.size() - 1;
926+
const double distinct_reads =
927+
accumulate(begin(counts_hist), end(counts_hist), 0.0);
928+
929+
// ENSURE THAT THE MAX TERMS ARE ACCEPTABLE
930+
size_t first_zero = 1;
931+
while (first_zero < counts_hist.size() && counts_hist[first_zero] > 0)
932+
++first_zero;
933+
934+
orig_max_terms = std::min(orig_max_terms, first_zero - 1);
935+
orig_max_terms = orig_max_terms - (orig_max_terms % 2 == 1);
936+
937+
const size_t distinct_counts =
938+
std::count_if(begin(counts_hist), end(counts_hist),
939+
[](const double x) {return x > 0.0;});
940+
941+
if (VERBOSE)
942+
cerr << "TOTAL READS = " << n_reads << endl
943+
<< "DISTINCT READS = " << distinct_reads << endl
944+
<< "DISTINCT COUNTS = " << distinct_counts << endl
945+
<< "MAX COUNT = " << max_observed_count << endl
946+
<< "COUNTS OF 1 = " << counts_hist[1] << endl
947+
<< "MAX TERMS = " << orig_max_terms << endl;
948+
949+
// ADS: step size to be set based on the original number of
950+
// observations
951+
const double step_size = n_reads;
952+
953+
if (VERBOSE) {
954+
// OUTPUT THE ORIGINAL HISTOGRAM
955+
cerr << "OBSERVED COUNTS (" << counts_hist.size() << ")" << endl;
956+
for (size_t i = 0; i < counts_hist.size(); i++)
957+
if (counts_hist[i] > 0)
958+
cerr << i << '\t' << static_cast<size_t>(counts_hist[i]) << endl;
959+
cerr << endl;
960+
}
961+
962+
// check to make sure library is not overly saturated
963+
const double two_fold_extrap = GoodToulmin2xExtrap(counts_hist);
964+
if (two_fold_extrap < 0.0)
965+
throw runtime_error("Saturation expected at double initial sample size."
966+
" Unable to extrapolate");
967+
968+
// const size_t total_reads = get_counts_from_hist(counts_hist);
969+
970+
//assert(total_reads == n_reads); // ADS: why commented out?
971+
972+
// check that min required count is satisfied
973+
if (orig_max_terms < min_required_counts)
974+
throw runtime_error(min_required_counts_error_message);
975+
976+
if (VERBOSE)
977+
cerr << "[ESTIMATING YIELD CURVE]" << endl;
978+
vector<double> yield_estimates;
979+
980+
if (VERBOSE)
981+
cerr << "[BOOTSTRAPPING HISTOGRAM]" << endl;
982+
983+
const size_t max_iter = 100*n_bootstraps;
984+
985+
vector<vector <double> > bootstrap_estimates;
986+
extrap_bootstrap(VERBOSE, allow_defects, seed, counts_hist, n_bootstraps,
987+
orig_max_terms, diagonal, step_size, max_extrap,
988+
max_iter, bootstrap_estimates);
989+
990+
if (VERBOSE)
991+
cerr << "[COMPUTING CONFIDENCE INTERVALS]" << endl;
992+
// yield ci
993+
vector<double> yield_upper_ci_lognorm, yield_lower_ci_lognorm;
994+
995+
vector_median_and_ci(bootstrap_estimates, c_level, yield_estimates,
996+
yield_lower_ci_lognorm, yield_upper_ci_lognorm);
997+
998+
/////////////////////////////////////////////////////////////////////
999+
if (VERBOSE)
1000+
cerr << "[WRITING OUTPUT]" << endl;
1001+
1002+
write_predicted_complexity_curve(outfile, c_level, step_size,
1003+
yield_estimates, yield_lower_ci_lognorm,
1004+
yield_upper_ci_lognorm);
1005+
}
1006+
catch (runtime_error &e) {
1007+
cerr << "ERROR:\t" << e.what() << endl;
1008+
return EXIT_FAILURE;
1009+
}
1010+
catch (std::bad_alloc &ba) {
1011+
cerr << "ERROR: could not allocate memory" << endl;
1012+
return EXIT_FAILURE;
1013+
}
1014+
return EXIT_SUCCESS;
1015+
}
1016+
7841017

7851018
///////////////////////////////////////////////////////////////////////
7861019
//////////////////////////////////////////////////////////////////////
@@ -1475,7 +1708,7 @@ bound_pop(const int argc, const char **argv) {
14751708
bootstrap_moments.push_back(exp(factorial(i + 3) +
14761709
log(sample_hist[i + 2]) -
14771710
log(sample_hist[1])) );
1478-
}
1711+
}
14791712

14801713
size_t n_points = 0;
14811714
n_points = ensure_pos_def_mom_seq(bootstrap_moments, tolerance, VERBOSE);
@@ -1606,6 +1839,11 @@ main(const int argc, const char **argv) {
16061839

16071840
return bound_pop(argc, argv);
16081841

1842+
}
1843+
else if (strcmp(argv[1], "pop_size") == 0) {
1844+
1845+
return pop_size(argc, argv);
1846+
16091847
}
16101848
else {
16111849
cerr << "unrecognized command: " << argv[1] << endl

0 commit comments

Comments
 (0)