Merge branch 'master' into master

terencewtli · web-flow · commit b2f1d34ee7f9 · 2020-10-01T14:25:17.000-07:00
diff --git a/Makefile.am b/Makefile.am
@@ -14,7 +14,7 @@
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 # General Public License for more details.
 
-EXTRA_DIST = README.md gpl-3.0.txt preseqR data
+EXTRA_DIST = README.md gpl-3.0.txt preseqR data docs
 ACLOCAL_AMFLAGS = -I m4
 
 SUBDIRS := src/smithlab_cpp
diff --git a/configure.ac b/configure.ac
@@ -21,10 +21,6 @@ dnl used to keep command lines short.
 AC_CONFIG_HEADERS([config.h])
 AM_INIT_AUTOMAKE([subdir-objects foreign])
 
-dnl Less output to the terminal. Use "--disable-silent-rules" when
-dnl running ./configure to reverse this.
-AM_SILENT_RULES([yes])
-
 AC_CONFIG_MACRO_DIR([m4])
 AC_LANG(C++)
 AC_PROG_CXX
diff --git a/src/preseq.cpp b/src/preseq.cpp
@@ -93,7 +93,7 @@ quantile_from_sorted_vector (const vector<T> sorted_data,
   if (lhs == n - 1) return sorted_data[lhs * stride];
 
   return (1 - delta) * sorted_data[lhs * stride]
-    + delta * sorted_data[(lhs + 1) * stride];
+           + delta * sorted_data[(lhs + 1) * stride];
 }
 
 // Confidence interval stuff
@@ -781,6 +781,239 @@ lc_extrap(const int argc, const char **argv) {
   return EXIT_SUCCESS;
 }
 
+////////////////////////////////////////////////////////////////////////
+// POP_SIZE
+static int
+pop_size(const int argc, const char **argv) {
+
+  try {
+
+    static const size_t min_required_counts = 4;
+    static const string min_required_counts_error_message =
+      "max count before zero is less than min required count (" +
+      to_string(min_required_counts) + ") duplicates removed";
+
+    string outfile;
+
+    size_t orig_max_terms = 100;
+    double max_extrap = 1.0e10;
+    size_t n_bootstraps = 100;
+    int diagonal = 0;
+    double c_level = 0.95;
+    unsigned long int seed = 408;
+
+    /* FLAGS */
+    bool VERBOSE = false;
+    bool VALS_INPUT = false;
+    bool PAIRED_END = false;
+    bool HIST_INPUT = false;
+    bool SINGLE_ESTIMATE = false;
+    bool allow_defects = false;
+
+#ifdef HAVE_HTSLIB
+    bool BAM_FORMAT_INPUT = false;
+    size_t MAX_SEGMENT_LENGTH = 5000;
+#endif
+
+    const string description =
+      "Needs writing";
+
+    /********** GET COMMAND LINE ARGUMENTS  FOR LC EXTRAP ***********/
+    OptionParser opt_parse(strip_path(argv[1]), description, "<input-file>");
+    opt_parse.add_opt("output", 'o', "output file (default: stdout)",
+                      false , outfile);
+    opt_parse.add_opt("extrap",'e',"maximum extrapolation", false, max_extrap);
+    opt_parse.add_opt("boots",'n',"number of bootstraps", false, n_bootstraps);
+    opt_parse.add_opt("cval", 'c', "level for confidence intervals", false, c_level);
+    opt_parse.add_opt("terms",'x',"maximum terms in estimator", false, orig_max_terms);
+    opt_parse.add_opt("verbose", 'v', "print more info", false, VERBOSE);
+#ifdef HAVE_HTSLIB
+    opt_parse.add_opt("bam", 'B', "input is in BAM format",
+                      false, BAM_FORMAT_INPUT);
+    opt_parse.add_opt("seg_len", 'l', "maximum segment length when merging "
+                      "paired end bam reads",
+                      false, MAX_SEGMENT_LENGTH);
+#endif
+    opt_parse.add_opt("pe", 'P', "input is paired end read file",
+                      false, PAIRED_END);
+    opt_parse.add_opt("vals", 'V',
+                      "input is a text file containing only the observed counts",
+                      false, VALS_INPUT);
+    opt_parse.add_opt("hist", 'H',
+                      "input is a text file containing the observed histogram",
+                      false, HIST_INPUT);
+    opt_parse.add_opt("quick", 'Q',
+                      "quick mode (no bootstraps) for confidence intervals",
+                      false, SINGLE_ESTIMATE);
+    opt_parse.add_opt("defects", 'D', "no testing for defects", false, allow_defects);
+    opt_parse.add_opt("seed", 'r', "seed for random number generator",
+                      false, seed);
+    opt_parse.set_show_defaults();
+    vector<string> leftover_args;
+
+    // ADS: suspect bug below; "-about" isn't working.
+    opt_parse.parse(argc-1, argv+1, leftover_args);
+    if (argc == 2 || opt_parse.help_requested()) {
+      cerr << opt_parse.help_message() << endl;
+      return EXIT_SUCCESS;
+    }
+    if (opt_parse.about_requested()) {
+      cerr << opt_parse.about_message() << endl;
+      return EXIT_SUCCESS;
+    }
+    if (opt_parse.option_missing()) {
+      cerr << opt_parse.option_missing_message() << endl;
+      return EXIT_SUCCESS;
+    }
+    if (leftover_args.empty()) {
+      cerr << opt_parse.help_message() << endl;
+      return EXIT_SUCCESS;
+    }
+    const string input_file_name = leftover_args.front();
+    /******************************************************************/
+
+    throw runtime_error("NOT YET IMPLEMENTED");
+
+    vector<double> counts_hist;
+    size_t n_reads = 0;
+
+    /************ loading input ***************************************/
+    if (HIST_INPUT) {
+      if (VERBOSE)
+        cerr << "HIST_INPUT" << endl;
+      n_reads = load_histogram(input_file_name, counts_hist);
+    }
+    else if (VALS_INPUT) {
+      if (VERBOSE)
+        cerr << "VALS_INPUT" << endl;
+      n_reads = load_counts(input_file_name, counts_hist);
+    }
+#ifdef HAVE_HTSLIB
+    else if (BAM_FORMAT_INPUT && PAIRED_END) {
+      if (VERBOSE)
+        cerr << "PAIRED_END_BAM_INPUT" << endl;
+      const size_t MAX_READS_TO_HOLD = 5000000;
+      size_t n_paired = 0;
+      size_t n_mates = 0;
+      n_reads = load_counts_BAM_pe(VERBOSE, input_file_name,
+                                   MAX_SEGMENT_LENGTH,
+                                   MAX_READS_TO_HOLD, n_paired,
+                                   n_mates, counts_hist);
+      if (VERBOSE) {
+        cerr << "MERGED PAIRED END READS = " << n_paired << endl;
+        cerr << "MATES PROCESSED = " << n_mates << endl;
+      }
+    }
+    else if (BAM_FORMAT_INPUT) {
+      if (VERBOSE)
+        cerr << "BAM_INPUT" << endl;
+      n_reads = load_counts_BAM_se(input_file_name, counts_hist);
+    }
+#endif
+    else if (PAIRED_END) {
+      if (VERBOSE)
+        cerr << "PAIRED_END_BED_INPUT" << endl;
+      n_reads = load_counts_BED_pe(input_file_name, counts_hist);
+    }
+    else { // default is single end bed file
+      if (VERBOSE)
+        cerr << "BED_INPUT" << endl;
+      n_reads = load_counts_BED_se(input_file_name, counts_hist);
+    }
+    /************ done loading input **********************************/
+
+    const size_t max_observed_count = counts_hist.size() - 1;
+    const double distinct_reads =
+      accumulate(begin(counts_hist), end(counts_hist), 0.0);
+
+    // ENSURE THAT THE MAX TERMS ARE ACCEPTABLE
+    size_t first_zero = 1;
+    while (first_zero < counts_hist.size() && counts_hist[first_zero] > 0)
+      ++first_zero;
+
+    orig_max_terms = std::min(orig_max_terms, first_zero - 1);
+    orig_max_terms = orig_max_terms - (orig_max_terms % 2 == 1);
+
+    const size_t distinct_counts =
+      std::count_if(begin(counts_hist), end(counts_hist),
+                    [](const double x) {return x > 0.0;});
+
+    if (VERBOSE)
+      cerr << "TOTAL READS     = " << n_reads << endl
+           << "DISTINCT READS  = " << distinct_reads << endl
+           << "DISTINCT COUNTS = " << distinct_counts << endl
+           << "MAX COUNT       = " << max_observed_count << endl
+           << "COUNTS OF 1     = " << counts_hist[1] << endl
+           << "MAX TERMS       = " << orig_max_terms << endl;
+
+    // ADS: step size to be set based on the original number of
+    // observations
+    const double step_size = n_reads;
+
+    if (VERBOSE) {
+      // OUTPUT THE ORIGINAL HISTOGRAM
+      cerr << "OBSERVED COUNTS (" << counts_hist.size() << ")" << endl;
+      for (size_t i = 0; i < counts_hist.size(); i++)
+        if (counts_hist[i] > 0)
+          cerr << i << '\t' << static_cast<size_t>(counts_hist[i]) << endl;
+      cerr << endl;
+    }
+
+    // check to make sure library is not overly saturated
+    const double two_fold_extrap = GoodToulmin2xExtrap(counts_hist);
+    if (two_fold_extrap < 0.0)
+      throw runtime_error("Saturation expected at double initial sample size."
+                          " Unable to extrapolate");
+
+    // const size_t total_reads = get_counts_from_hist(counts_hist);
+
+    //assert(total_reads == n_reads); // ADS: why commented out?
+
+    // check that min required count is satisfied
+    if (orig_max_terms < min_required_counts)
+      throw runtime_error(min_required_counts_error_message);
+
+    if (VERBOSE)
+      cerr << "[ESTIMATING YIELD CURVE]" << endl;
+    vector<double> yield_estimates;
+
+    if (VERBOSE)
+      cerr << "[BOOTSTRAPPING HISTOGRAM]" << endl;
+
+    const size_t max_iter = 100*n_bootstraps;
+
+    vector<vector <double> > bootstrap_estimates;
+    extrap_bootstrap(VERBOSE, allow_defects, seed, counts_hist, n_bootstraps,
+                     orig_max_terms, diagonal, step_size, max_extrap,
+                     max_iter, bootstrap_estimates);
+
+    if (VERBOSE)
+      cerr << "[COMPUTING CONFIDENCE INTERVALS]" << endl;
+    // yield ci
+    vector<double> yield_upper_ci_lognorm, yield_lower_ci_lognorm;
+
+    vector_median_and_ci(bootstrap_estimates, c_level, yield_estimates,
+                         yield_lower_ci_lognorm, yield_upper_ci_lognorm);
+
+    /////////////////////////////////////////////////////////////////////
+    if (VERBOSE)
+      cerr << "[WRITING OUTPUT]" << endl;
+
+    write_predicted_complexity_curve(outfile, c_level, step_size,
+                                     yield_estimates, yield_lower_ci_lognorm,
+                                     yield_upper_ci_lognorm);
+  }
+  catch (runtime_error &e) {
+    cerr << "ERROR:\t" << e.what() << endl;
+    return EXIT_FAILURE;
+  }
+  catch (std::bad_alloc &ba) {
+    cerr << "ERROR: could not allocate memory" << endl;
+    return EXIT_FAILURE;
+  }
+  return EXIT_SUCCESS;
+}
+
 
 ///////////////////////////////////////////////////////////////////////
 //////////////////////////////////////////////////////////////////////