@@ -86,19 +86,19 @@ median_from_sorted_vector (const vector<T> sorted_data,
8686}
8787
8888template <typename T> T
89- quantile_from_sorted_vector (const vector<T> sorted_data,
90- const size_t stride, const size_t n,
91- const double f) {
89+ quantile_from_sorted_vector (const vector<T> sorted_data,
90+ const size_t stride, const size_t n,
91+ const double f) {
9292 const double index = f * (n - 1 );
9393 const size_t lhs = (int )index;
9494 const double delta = index - lhs;
9595
9696 if (n == 0 || sorted_data.empty ()) return 0.0 ;
9797
9898 if (lhs == n - 1 ) return sorted_data[lhs * stride];
99-
100- return (1 - delta) * sorted_data[lhs * stride]
101- + delta * sorted_data[(lhs + 1 ) * stride];
99+
100+ return (1 - delta) * sorted_data[lhs * stride]
101+ + delta * sorted_data[(lhs + 1 ) * stride];
102102}
103103
104104// Confidence interval stuff
@@ -202,8 +202,8 @@ factorial (double x) {
202202
203203 double Ag = lanczos[0 ];
204204
205- for (size_t k=1 ; k < lanczos.size (); k++)
206- Ag += lanczos[k] / (x + k);
205+ for (size_t k=1 ; k < lanczos.size (); k++)
206+ Ag += lanczos[k] / (x + k);
207207
208208 double term1 = (x + 0.5 ) * log ((x + 7.5 ) / Euler);
209209 double term2 = LogRootTwoPi + log (Ag);
@@ -229,7 +229,7 @@ resample_hist(std::mt19937 &gen, const vector<size_t> &vals_hist_distinct_counts
229229
230230 unsigned int distinct =
231231 accumulate (begin (distinct_counts_hist), end (distinct_counts_hist), 0.0 );
232-
232+
233233 multinomial (gen, distinct_counts_hist, distinct,
234234 sample_distinct_counts_hist);
235235
@@ -781,6 +781,239 @@ lc_extrap(const int argc, const char **argv) {
781781 return EXIT_SUCCESS;
782782}
783783
784+ // //////////////////////////////////////////////////////////////////////
785+ // POP_SIZE
786+ static int
787+ pop_size (const int argc, const char **argv) {
788+
789+ try {
790+
791+ static const size_t min_required_counts = 4 ;
792+ static const string min_required_counts_error_message =
793+ " max count before zero is less than min required count (" +
794+ to_string (min_required_counts) + " ) duplicates removed" ;
795+
796+ string outfile;
797+
798+ size_t orig_max_terms = 100 ;
799+ double max_extrap = 1.0e10 ;
800+ size_t n_bootstraps = 100 ;
801+ int diagonal = 0 ;
802+ double c_level = 0.95 ;
803+ unsigned long int seed = 408 ;
804+
805+ /* FLAGS */
806+ bool VERBOSE = false ;
807+ bool VALS_INPUT = false ;
808+ bool PAIRED_END = false ;
809+ bool HIST_INPUT = false ;
810+ bool SINGLE_ESTIMATE = false ;
811+ bool allow_defects = false ;
812+
813+ #ifdef HAVE_HTSLIB
814+ bool BAM_FORMAT_INPUT = false ;
815+ size_t MAX_SEGMENT_LENGTH = 5000 ;
816+ #endif
817+
818+ const string description =
819+ " Needs writing" ;
820+
821+ /* ********* GET COMMAND LINE ARGUMENTS FOR LC EXTRAP ***********/
822+ OptionParser opt_parse (strip_path (argv[1 ]), description, " <input-file>" );
823+ opt_parse.add_opt (" output" , ' o' , " output file (default: stdout)" ,
824+ false , outfile);
825+ opt_parse.add_opt (" extrap" ,' e' ," maximum extrapolation" , false , max_extrap);
826+ opt_parse.add_opt (" boots" ,' n' ," number of bootstraps" , false , n_bootstraps);
827+ opt_parse.add_opt (" cval" , ' c' , " level for confidence intervals" , false , c_level);
828+ opt_parse.add_opt (" terms" ,' x' ," maximum terms in estimator" , false , orig_max_terms);
829+ opt_parse.add_opt (" verbose" , ' v' , " print more info" , false , VERBOSE);
830+ #ifdef HAVE_HTSLIB
831+ opt_parse.add_opt (" bam" , ' B' , " input is in BAM format" ,
832+ false , BAM_FORMAT_INPUT);
833+ opt_parse.add_opt (" seg_len" , ' l' , " maximum segment length when merging "
834+ " paired end bam reads" ,
835+ false , MAX_SEGMENT_LENGTH);
836+ #endif
837+ opt_parse.add_opt (" pe" , ' P' , " input is paired end read file" ,
838+ false , PAIRED_END);
839+ opt_parse.add_opt (" vals" , ' V' ,
840+ " input is a text file containing only the observed counts" ,
841+ false , VALS_INPUT);
842+ opt_parse.add_opt (" hist" , ' H' ,
843+ " input is a text file containing the observed histogram" ,
844+ false , HIST_INPUT);
845+ opt_parse.add_opt (" quick" , ' Q' ,
846+ " quick mode (no bootstraps) for confidence intervals" ,
847+ false , SINGLE_ESTIMATE);
848+ opt_parse.add_opt (" defects" , ' D' , " no testing for defects" , false , allow_defects);
849+ opt_parse.add_opt (" seed" , ' r' , " seed for random number generator" ,
850+ false , seed);
851+ opt_parse.set_show_defaults ();
852+ vector<string> leftover_args;
853+
854+ // ADS: suspect bug below; "-about" isn't working.
855+ opt_parse.parse (argc-1 , argv+1 , leftover_args);
856+ if (argc == 2 || opt_parse.help_requested ()) {
857+ cerr << opt_parse.help_message () << endl;
858+ return EXIT_SUCCESS;
859+ }
860+ if (opt_parse.about_requested ()) {
861+ cerr << opt_parse.about_message () << endl;
862+ return EXIT_SUCCESS;
863+ }
864+ if (opt_parse.option_missing ()) {
865+ cerr << opt_parse.option_missing_message () << endl;
866+ return EXIT_SUCCESS;
867+ }
868+ if (leftover_args.empty ()) {
869+ cerr << opt_parse.help_message () << endl;
870+ return EXIT_SUCCESS;
871+ }
872+ const string input_file_name = leftover_args.front ();
873+ /* *****************************************************************/
874+
875+ throw runtime_error (" NOT YET IMPLEMENTED" );
876+
877+ vector<double > counts_hist;
878+ size_t n_reads = 0 ;
879+
880+ /* *********** loading input ***************************************/
881+ if (HIST_INPUT) {
882+ if (VERBOSE)
883+ cerr << " HIST_INPUT" << endl;
884+ n_reads = load_histogram (input_file_name, counts_hist);
885+ }
886+ else if (VALS_INPUT) {
887+ if (VERBOSE)
888+ cerr << " VALS_INPUT" << endl;
889+ n_reads = load_counts (input_file_name, counts_hist);
890+ }
891+ #ifdef HAVE_HTSLIB
892+ else if (BAM_FORMAT_INPUT && PAIRED_END) {
893+ if (VERBOSE)
894+ cerr << " PAIRED_END_BAM_INPUT" << endl;
895+ const size_t MAX_READS_TO_HOLD = 5000000 ;
896+ size_t n_paired = 0 ;
897+ size_t n_mates = 0 ;
898+ n_reads = load_counts_BAM_pe (VERBOSE, input_file_name,
899+ MAX_SEGMENT_LENGTH,
900+ MAX_READS_TO_HOLD, n_paired,
901+ n_mates, counts_hist);
902+ if (VERBOSE) {
903+ cerr << " MERGED PAIRED END READS = " << n_paired << endl;
904+ cerr << " MATES PROCESSED = " << n_mates << endl;
905+ }
906+ }
907+ else if (BAM_FORMAT_INPUT) {
908+ if (VERBOSE)
909+ cerr << " BAM_INPUT" << endl;
910+ n_reads = load_counts_BAM_se (input_file_name, counts_hist);
911+ }
912+ #endif
913+ else if (PAIRED_END) {
914+ if (VERBOSE)
915+ cerr << " PAIRED_END_BED_INPUT" << endl;
916+ n_reads = load_counts_BED_pe (input_file_name, counts_hist);
917+ }
918+ else { // default is single end bed file
919+ if (VERBOSE)
920+ cerr << " BED_INPUT" << endl;
921+ n_reads = load_counts_BED_se (input_file_name, counts_hist);
922+ }
923+ /* *********** done loading input **********************************/
924+
925+ const size_t max_observed_count = counts_hist.size () - 1 ;
926+ const double distinct_reads =
927+ accumulate (begin (counts_hist), end (counts_hist), 0.0 );
928+
929+ // ENSURE THAT THE MAX TERMS ARE ACCEPTABLE
930+ size_t first_zero = 1 ;
931+ while (first_zero < counts_hist.size () && counts_hist[first_zero] > 0 )
932+ ++first_zero;
933+
934+ orig_max_terms = std::min (orig_max_terms, first_zero - 1 );
935+ orig_max_terms = orig_max_terms - (orig_max_terms % 2 == 1 );
936+
937+ const size_t distinct_counts =
938+ std::count_if (begin (counts_hist), end (counts_hist),
939+ [](const double x) {return x > 0.0 ;});
940+
941+ if (VERBOSE)
942+ cerr << " TOTAL READS = " << n_reads << endl
943+ << " DISTINCT READS = " << distinct_reads << endl
944+ << " DISTINCT COUNTS = " << distinct_counts << endl
945+ << " MAX COUNT = " << max_observed_count << endl
946+ << " COUNTS OF 1 = " << counts_hist[1 ] << endl
947+ << " MAX TERMS = " << orig_max_terms << endl;
948+
949+ // ADS: step size to be set based on the original number of
950+ // observations
951+ const double step_size = n_reads;
952+
953+ if (VERBOSE) {
954+ // OUTPUT THE ORIGINAL HISTOGRAM
955+ cerr << " OBSERVED COUNTS (" << counts_hist.size () << " )" << endl;
956+ for (size_t i = 0 ; i < counts_hist.size (); i++)
957+ if (counts_hist[i] > 0 )
958+ cerr << i << ' \t ' << static_cast <size_t >(counts_hist[i]) << endl;
959+ cerr << endl;
960+ }
961+
962+ // check to make sure library is not overly saturated
963+ const double two_fold_extrap = GoodToulmin2xExtrap (counts_hist);
964+ if (two_fold_extrap < 0.0 )
965+ throw runtime_error (" Saturation expected at double initial sample size."
966+ " Unable to extrapolate" );
967+
968+ // const size_t total_reads = get_counts_from_hist(counts_hist);
969+
970+ // assert(total_reads == n_reads); // ADS: why commented out?
971+
972+ // check that min required count is satisfied
973+ if (orig_max_terms < min_required_counts)
974+ throw runtime_error (min_required_counts_error_message);
975+
976+ if (VERBOSE)
977+ cerr << " [ESTIMATING YIELD CURVE]" << endl;
978+ vector<double > yield_estimates;
979+
980+ if (VERBOSE)
981+ cerr << " [BOOTSTRAPPING HISTOGRAM]" << endl;
982+
983+ const size_t max_iter = 100 *n_bootstraps;
984+
985+ vector<vector <double > > bootstrap_estimates;
986+ extrap_bootstrap (VERBOSE, allow_defects, seed, counts_hist, n_bootstraps,
987+ orig_max_terms, diagonal, step_size, max_extrap,
988+ max_iter, bootstrap_estimates);
989+
990+ if (VERBOSE)
991+ cerr << " [COMPUTING CONFIDENCE INTERVALS]" << endl;
992+ // yield ci
993+ vector<double > yield_upper_ci_lognorm, yield_lower_ci_lognorm;
994+
995+ vector_median_and_ci (bootstrap_estimates, c_level, yield_estimates,
996+ yield_lower_ci_lognorm, yield_upper_ci_lognorm);
997+
998+ // ///////////////////////////////////////////////////////////////////
999+ if (VERBOSE)
1000+ cerr << " [WRITING OUTPUT]" << endl;
1001+
1002+ write_predicted_complexity_curve (outfile, c_level, step_size,
1003+ yield_estimates, yield_lower_ci_lognorm,
1004+ yield_upper_ci_lognorm);
1005+ }
1006+ catch (runtime_error &e) {
1007+ cerr << " ERROR:\t " << e.what () << endl;
1008+ return EXIT_FAILURE;
1009+ }
1010+ catch (std::bad_alloc &ba) {
1011+ cerr << " ERROR: could not allocate memory" << endl;
1012+ return EXIT_FAILURE;
1013+ }
1014+ return EXIT_SUCCESS;
1015+ }
1016+
7841017
7851018// /////////////////////////////////////////////////////////////////////
7861019// ////////////////////////////////////////////////////////////////////
@@ -1475,7 +1708,7 @@ bound_pop(const int argc, const char **argv) {
14751708 bootstrap_moments.push_back (exp (factorial (i + 3 ) +
14761709 log (sample_hist[i + 2 ]) -
14771710 log (sample_hist[1 ])) );
1478- }
1711+ }
14791712
14801713 size_t n_points = 0 ;
14811714 n_points = ensure_pos_def_mom_seq (bootstrap_moments, tolerance, VERBOSE);
@@ -1606,6 +1839,11 @@ main(const int argc, const char **argv) {
16061839
16071840 return bound_pop (argc, argv);
16081841
1842+ }
1843+ else if (strcmp (argv[1 ], " pop_size" ) == 0 ) {
1844+
1845+ return pop_size (argc, argv);
1846+
16091847 }
16101848 else {
16111849 cerr << " unrecognized command: " << argv[1 ] << endl
0 commit comments