Skip to content

Commit b2f1d34

Browse files
authored
Merge branch 'master' into master
2 parents 2c5eb86 + 6e4a080 commit b2f1d34

File tree

3 files changed

+235
-6
lines changed

3 files changed

+235
-6
lines changed

Makefile.am

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
1515
# General Public License for more details.
1616

17-
EXTRA_DIST = README.md gpl-3.0.txt preseqR data
17+
EXTRA_DIST = README.md gpl-3.0.txt preseqR data docs
1818
ACLOCAL_AMFLAGS = -I m4
1919

2020
SUBDIRS := src/smithlab_cpp

configure.ac

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,6 @@ dnl used to keep command lines short.
2121
AC_CONFIG_HEADERS([config.h])
2222
AM_INIT_AUTOMAKE([subdir-objects foreign])
2323

24-
dnl Less output to the terminal. Use "--disable-silent-rules" when
25-
dnl running ./configure to reverse this.
26-
AM_SILENT_RULES([yes])
27-
2824
AC_CONFIG_MACRO_DIR([m4])
2925
AC_LANG(C++)
3026
AC_PROG_CXX

src/preseq.cpp

Lines changed: 234 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,7 @@ quantile_from_sorted_vector (const vector<T> sorted_data,
9393
if (lhs == n - 1) return sorted_data[lhs * stride];
9494

9595
return (1 - delta) * sorted_data[lhs * stride]
96-
+ delta * sorted_data[(lhs + 1) * stride];
96+
+ delta * sorted_data[(lhs + 1) * stride];
9797
}
9898

9999
// Confidence interval stuff
@@ -781,6 +781,239 @@ lc_extrap(const int argc, const char **argv) {
781781
return EXIT_SUCCESS;
782782
}
783783

784+
////////////////////////////////////////////////////////////////////////
785+
// POP_SIZE
786+
static int
787+
pop_size(const int argc, const char **argv) {
788+
789+
try {
790+
791+
static const size_t min_required_counts = 4;
792+
static const string min_required_counts_error_message =
793+
"max count before zero is less than min required count (" +
794+
to_string(min_required_counts) + ") duplicates removed";
795+
796+
string outfile;
797+
798+
size_t orig_max_terms = 100;
799+
double max_extrap = 1.0e10;
800+
size_t n_bootstraps = 100;
801+
int diagonal = 0;
802+
double c_level = 0.95;
803+
unsigned long int seed = 408;
804+
805+
/* FLAGS */
806+
bool VERBOSE = false;
807+
bool VALS_INPUT = false;
808+
bool PAIRED_END = false;
809+
bool HIST_INPUT = false;
810+
bool SINGLE_ESTIMATE = false;
811+
bool allow_defects = false;
812+
813+
#ifdef HAVE_HTSLIB
814+
bool BAM_FORMAT_INPUT = false;
815+
size_t MAX_SEGMENT_LENGTH = 5000;
816+
#endif
817+
818+
const string description =
819+
"Needs writing";
820+
821+
/********** GET COMMAND LINE ARGUMENTS FOR LC EXTRAP ***********/
822+
OptionParser opt_parse(strip_path(argv[1]), description, "<input-file>");
823+
opt_parse.add_opt("output", 'o', "output file (default: stdout)",
824+
false , outfile);
825+
opt_parse.add_opt("extrap",'e',"maximum extrapolation", false, max_extrap);
826+
opt_parse.add_opt("boots",'n',"number of bootstraps", false, n_bootstraps);
827+
opt_parse.add_opt("cval", 'c', "level for confidence intervals", false, c_level);
828+
opt_parse.add_opt("terms",'x',"maximum terms in estimator", false, orig_max_terms);
829+
opt_parse.add_opt("verbose", 'v', "print more info", false, VERBOSE);
830+
#ifdef HAVE_HTSLIB
831+
opt_parse.add_opt("bam", 'B', "input is in BAM format",
832+
false, BAM_FORMAT_INPUT);
833+
opt_parse.add_opt("seg_len", 'l', "maximum segment length when merging "
834+
"paired end bam reads",
835+
false, MAX_SEGMENT_LENGTH);
836+
#endif
837+
opt_parse.add_opt("pe", 'P', "input is paired end read file",
838+
false, PAIRED_END);
839+
opt_parse.add_opt("vals", 'V',
840+
"input is a text file containing only the observed counts",
841+
false, VALS_INPUT);
842+
opt_parse.add_opt("hist", 'H',
843+
"input is a text file containing the observed histogram",
844+
false, HIST_INPUT);
845+
opt_parse.add_opt("quick", 'Q',
846+
"quick mode (no bootstraps) for confidence intervals",
847+
false, SINGLE_ESTIMATE);
848+
opt_parse.add_opt("defects", 'D', "no testing for defects", false, allow_defects);
849+
opt_parse.add_opt("seed", 'r', "seed for random number generator",
850+
false, seed);
851+
opt_parse.set_show_defaults();
852+
vector<string> leftover_args;
853+
854+
// ADS: suspect bug below; "-about" isn't working.
855+
opt_parse.parse(argc-1, argv+1, leftover_args);
856+
if (argc == 2 || opt_parse.help_requested()) {
857+
cerr << opt_parse.help_message() << endl;
858+
return EXIT_SUCCESS;
859+
}
860+
if (opt_parse.about_requested()) {
861+
cerr << opt_parse.about_message() << endl;
862+
return EXIT_SUCCESS;
863+
}
864+
if (opt_parse.option_missing()) {
865+
cerr << opt_parse.option_missing_message() << endl;
866+
return EXIT_SUCCESS;
867+
}
868+
if (leftover_args.empty()) {
869+
cerr << opt_parse.help_message() << endl;
870+
return EXIT_SUCCESS;
871+
}
872+
const string input_file_name = leftover_args.front();
873+
/******************************************************************/
874+
875+
throw runtime_error("NOT YET IMPLEMENTED");
876+
877+
vector<double> counts_hist;
878+
size_t n_reads = 0;
879+
880+
/************ loading input ***************************************/
881+
if (HIST_INPUT) {
882+
if (VERBOSE)
883+
cerr << "HIST_INPUT" << endl;
884+
n_reads = load_histogram(input_file_name, counts_hist);
885+
}
886+
else if (VALS_INPUT) {
887+
if (VERBOSE)
888+
cerr << "VALS_INPUT" << endl;
889+
n_reads = load_counts(input_file_name, counts_hist);
890+
}
891+
#ifdef HAVE_HTSLIB
892+
else if (BAM_FORMAT_INPUT && PAIRED_END) {
893+
if (VERBOSE)
894+
cerr << "PAIRED_END_BAM_INPUT" << endl;
895+
const size_t MAX_READS_TO_HOLD = 5000000;
896+
size_t n_paired = 0;
897+
size_t n_mates = 0;
898+
n_reads = load_counts_BAM_pe(VERBOSE, input_file_name,
899+
MAX_SEGMENT_LENGTH,
900+
MAX_READS_TO_HOLD, n_paired,
901+
n_mates, counts_hist);
902+
if (VERBOSE) {
903+
cerr << "MERGED PAIRED END READS = " << n_paired << endl;
904+
cerr << "MATES PROCESSED = " << n_mates << endl;
905+
}
906+
}
907+
else if (BAM_FORMAT_INPUT) {
908+
if (VERBOSE)
909+
cerr << "BAM_INPUT" << endl;
910+
n_reads = load_counts_BAM_se(input_file_name, counts_hist);
911+
}
912+
#endif
913+
else if (PAIRED_END) {
914+
if (VERBOSE)
915+
cerr << "PAIRED_END_BED_INPUT" << endl;
916+
n_reads = load_counts_BED_pe(input_file_name, counts_hist);
917+
}
918+
else { // default is single end bed file
919+
if (VERBOSE)
920+
cerr << "BED_INPUT" << endl;
921+
n_reads = load_counts_BED_se(input_file_name, counts_hist);
922+
}
923+
/************ done loading input **********************************/
924+
925+
const size_t max_observed_count = counts_hist.size() - 1;
926+
const double distinct_reads =
927+
accumulate(begin(counts_hist), end(counts_hist), 0.0);
928+
929+
// ENSURE THAT THE MAX TERMS ARE ACCEPTABLE
930+
size_t first_zero = 1;
931+
while (first_zero < counts_hist.size() && counts_hist[first_zero] > 0)
932+
++first_zero;
933+
934+
orig_max_terms = std::min(orig_max_terms, first_zero - 1);
935+
orig_max_terms = orig_max_terms - (orig_max_terms % 2 == 1);
936+
937+
const size_t distinct_counts =
938+
std::count_if(begin(counts_hist), end(counts_hist),
939+
[](const double x) {return x > 0.0;});
940+
941+
if (VERBOSE)
942+
cerr << "TOTAL READS = " << n_reads << endl
943+
<< "DISTINCT READS = " << distinct_reads << endl
944+
<< "DISTINCT COUNTS = " << distinct_counts << endl
945+
<< "MAX COUNT = " << max_observed_count << endl
946+
<< "COUNTS OF 1 = " << counts_hist[1] << endl
947+
<< "MAX TERMS = " << orig_max_terms << endl;
948+
949+
// ADS: step size to be set based on the original number of
950+
// observations
951+
const double step_size = n_reads;
952+
953+
if (VERBOSE) {
954+
// OUTPUT THE ORIGINAL HISTOGRAM
955+
cerr << "OBSERVED COUNTS (" << counts_hist.size() << ")" << endl;
956+
for (size_t i = 0; i < counts_hist.size(); i++)
957+
if (counts_hist[i] > 0)
958+
cerr << i << '\t' << static_cast<size_t>(counts_hist[i]) << endl;
959+
cerr << endl;
960+
}
961+
962+
// check to make sure library is not overly saturated
963+
const double two_fold_extrap = GoodToulmin2xExtrap(counts_hist);
964+
if (two_fold_extrap < 0.0)
965+
throw runtime_error("Saturation expected at double initial sample size."
966+
" Unable to extrapolate");
967+
968+
// const size_t total_reads = get_counts_from_hist(counts_hist);
969+
970+
//assert(total_reads == n_reads); // ADS: why commented out?
971+
972+
// check that min required count is satisfied
973+
if (orig_max_terms < min_required_counts)
974+
throw runtime_error(min_required_counts_error_message);
975+
976+
if (VERBOSE)
977+
cerr << "[ESTIMATING YIELD CURVE]" << endl;
978+
vector<double> yield_estimates;
979+
980+
if (VERBOSE)
981+
cerr << "[BOOTSTRAPPING HISTOGRAM]" << endl;
982+
983+
const size_t max_iter = 100*n_bootstraps;
984+
985+
vector<vector <double> > bootstrap_estimates;
986+
extrap_bootstrap(VERBOSE, allow_defects, seed, counts_hist, n_bootstraps,
987+
orig_max_terms, diagonal, step_size, max_extrap,
988+
max_iter, bootstrap_estimates);
989+
990+
if (VERBOSE)
991+
cerr << "[COMPUTING CONFIDENCE INTERVALS]" << endl;
992+
// yield ci
993+
vector<double> yield_upper_ci_lognorm, yield_lower_ci_lognorm;
994+
995+
vector_median_and_ci(bootstrap_estimates, c_level, yield_estimates,
996+
yield_lower_ci_lognorm, yield_upper_ci_lognorm);
997+
998+
/////////////////////////////////////////////////////////////////////
999+
if (VERBOSE)
1000+
cerr << "[WRITING OUTPUT]" << endl;
1001+
1002+
write_predicted_complexity_curve(outfile, c_level, step_size,
1003+
yield_estimates, yield_lower_ci_lognorm,
1004+
yield_upper_ci_lognorm);
1005+
}
1006+
catch (runtime_error &e) {
1007+
cerr << "ERROR:\t" << e.what() << endl;
1008+
return EXIT_FAILURE;
1009+
}
1010+
catch (std::bad_alloc &ba) {
1011+
cerr << "ERROR: could not allocate memory" << endl;
1012+
return EXIT_FAILURE;
1013+
}
1014+
return EXIT_SUCCESS;
1015+
}
1016+
7841017

7851018
///////////////////////////////////////////////////////////////////////
7861019
//////////////////////////////////////////////////////////////////////

0 commit comments

Comments
 (0)