@@ -127,7 +127,6 @@ vector_median_and_ci(const vector<vector<double> > &bootstrap_estimates,
127127
128128 const size_t n_est = bootstrap_estimates.size ();
129129 vector<double > estimates_row (n_est, 0.0 );
130- double prev_estimate = 0 ;
131130 for (size_t i = 0 ; i < bootstrap_estimates[0 ].size (); i++) {
132131
133132 // estimates is in wrong order, work locally on const val
@@ -139,14 +138,9 @@ vector_median_and_ci(const vector<vector<double> > &bootstrap_estimates,
139138 lower_ci_estimate, upper_ci_estimate);
140139 sort (begin (estimates_row), end (estimates_row));
141140
142- if (median_estimate - prev_estimate < 1.0 )
143- break ;
144-
145141 yield_estimates.push_back (median_estimate);
146142 lower_ci_lognorm.push_back (lower_ci_estimate);
147143 upper_ci_lognorm.push_back (upper_ci_estimate);
148-
149- prev_estimate = median_estimate;
150144 }
151145}
152146
@@ -781,240 +775,6 @@ lc_extrap(const int argc, const char **argv) {
781775 return EXIT_SUCCESS;
782776}
783777
784- // //////////////////////////////////////////////////////////////////////
785- // POP_SIZE
786- static int
787- pop_size (const int argc, const char **argv) {
788-
789- try {
790-
791- static const size_t min_required_counts = 4 ;
792- static const string min_required_counts_error_message =
793- " max count before zero is less than min required count (" +
794- to_string (min_required_counts) + " ) duplicates removed" ;
795-
796- string outfile;
797-
798- size_t orig_max_terms = 100 ;
799- double max_extrap = 1.0e10 ;
800- size_t n_bootstraps = 100 ;
801- int diagonal = 0 ;
802- double c_level = 0.95 ;
803- unsigned long int seed = 408 ;
804-
805- /* FLAGS */
806- bool VERBOSE = false ;
807- bool VALS_INPUT = false ;
808- bool PAIRED_END = false ;
809- bool HIST_INPUT = false ;
810- bool SINGLE_ESTIMATE = false ;
811- bool allow_defects = false ;
812-
813- #ifdef HAVE_HTSLIB
814- bool BAM_FORMAT_INPUT = false ;
815- size_t MAX_SEGMENT_LENGTH = 5000 ;
816- #endif
817-
818- const string description =
819- " Needs writing" ;
820-
821- /* ********* GET COMMAND LINE ARGUMENTS FOR LC EXTRAP ***********/
822- OptionParser opt_parse (strip_path (argv[1 ]), description, " <input-file>" );
823- opt_parse.add_opt (" output" , ' o' , " output file (default: stdout)" ,
824- false , outfile);
825- opt_parse.add_opt (" extrap" ,' e' ," maximum extrapolation" , false , max_extrap);
826- opt_parse.add_opt (" boots" ,' n' ," number of bootstraps" , false , n_bootstraps);
827- opt_parse.add_opt (" cval" , ' c' , " level for confidence intervals" , false , c_level);
828- opt_parse.add_opt (" terms" ,' x' ," maximum terms in estimator" , false , orig_max_terms);
829- opt_parse.add_opt (" verbose" , ' v' , " print more info" , false , VERBOSE);
830- #ifdef HAVE_HTSLIB
831- opt_parse.add_opt (" bam" , ' B' , " input is in BAM format" ,
832- false , BAM_FORMAT_INPUT);
833- opt_parse.add_opt (" seg_len" , ' l' , " maximum segment length when merging "
834- " paired end bam reads" ,
835- false , MAX_SEGMENT_LENGTH);
836- #endif
837- opt_parse.add_opt (" pe" , ' P' , " input is paired end read file" ,
838- false , PAIRED_END);
839- opt_parse.add_opt (" vals" , ' V' ,
840- " input is a text file containing only the observed counts" ,
841- false , VALS_INPUT);
842- opt_parse.add_opt (" hist" , ' H' ,
843- " input is a text file containing the observed histogram" ,
844- false , HIST_INPUT);
845- opt_parse.add_opt (" quick" , ' Q' ,
846- " quick mode (no bootstraps) for confidence intervals" ,
847- false , SINGLE_ESTIMATE);
848- opt_parse.add_opt (" defects" , ' D' , " no testing for defects" , false , allow_defects);
849- opt_parse.add_opt (" seed" , ' r' , " seed for random number generator" ,
850- false , seed);
851- opt_parse.set_show_defaults ();
852- vector<string> leftover_args;
853-
854- // ADS: suspect bug below; "-about" isn't working.
855- opt_parse.parse (argc-1 , argv+1 , leftover_args);
856- if (argc == 2 || opt_parse.help_requested ()) {
857- cerr << opt_parse.help_message () << endl;
858- return EXIT_SUCCESS;
859- }
860- if (opt_parse.about_requested ()) {
861- cerr << opt_parse.about_message () << endl;
862- return EXIT_SUCCESS;
863- }
864- if (opt_parse.option_missing ()) {
865- cerr << opt_parse.option_missing_message () << endl;
866- return EXIT_SUCCESS;
867- }
868- if (leftover_args.empty ()) {
869- cerr << opt_parse.help_message () << endl;
870- return EXIT_SUCCESS;
871- }
872- const string input_file_name = leftover_args.front ();
873- /* *****************************************************************/
874-
875- throw runtime_error (" NOT YET IMPLEMENTED" );
876-
877- vector<double > counts_hist;
878- size_t n_reads = 0 ;
879-
880- /* *********** loading input ***************************************/
881- if (HIST_INPUT) {
882- if (VERBOSE)
883- cerr << " HIST_INPUT" << endl;
884- n_reads = load_histogram (input_file_name, counts_hist);
885- }
886- else if (VALS_INPUT) {
887- if (VERBOSE)
888- cerr << " VALS_INPUT" << endl;
889- n_reads = load_counts (input_file_name, counts_hist);
890- }
891- #ifdef HAVE_HTSLIB
892- else if (BAM_FORMAT_INPUT && PAIRED_END) {
893- if (VERBOSE)
894- cerr << " PAIRED_END_BAM_INPUT" << endl;
895- const size_t MAX_READS_TO_HOLD = 5000000 ;
896- size_t n_paired = 0 ;
897- size_t n_mates = 0 ;
898- n_reads = load_counts_BAM_pe (VERBOSE, input_file_name,
899- MAX_SEGMENT_LENGTH,
900- MAX_READS_TO_HOLD, n_paired,
901- n_mates, counts_hist);
902- if (VERBOSE) {
903- cerr << " MERGED PAIRED END READS = " << n_paired << endl;
904- cerr << " MATES PROCESSED = " << n_mates << endl;
905- }
906- }
907- else if (BAM_FORMAT_INPUT) {
908- if (VERBOSE)
909- cerr << " BAM_INPUT" << endl;
910- n_reads = load_counts_BAM_se (input_file_name, counts_hist);
911- }
912- #endif
913- else if (PAIRED_END) {
914- if (VERBOSE)
915- cerr << " PAIRED_END_BED_INPUT" << endl;
916- n_reads = load_counts_BED_pe (input_file_name, counts_hist);
917- }
918- else { // default is single end bed file
919- if (VERBOSE)
920- cerr << " BED_INPUT" << endl;
921- n_reads = load_counts_BED_se (input_file_name, counts_hist);
922- }
923- /* *********** done loading input **********************************/
924-
925- const size_t max_observed_count = counts_hist.size () - 1 ;
926- const double distinct_reads =
927- accumulate (begin (counts_hist), end (counts_hist), 0.0 );
928-
929- // ENSURE THAT THE MAX TERMS ARE ACCEPTABLE
930- size_t first_zero = 1 ;
931- while (first_zero < counts_hist.size () && counts_hist[first_zero] > 0 )
932- ++first_zero;
933-
934- orig_max_terms = std::min (orig_max_terms, first_zero - 1 );
935- orig_max_terms = orig_max_terms - (orig_max_terms % 2 == 1 );
936-
937- const size_t distinct_counts =
938- std::count_if (begin (counts_hist), end (counts_hist),
939- [](const double x) {return x > 0.0 ;});
940-
941- if (VERBOSE)
942- cerr << " TOTAL READS = " << n_reads << endl
943- << " DISTINCT READS = " << distinct_reads << endl
944- << " DISTINCT COUNTS = " << distinct_counts << endl
945- << " MAX COUNT = " << max_observed_count << endl
946- << " COUNTS OF 1 = " << counts_hist[1 ] << endl
947- << " MAX TERMS = " << orig_max_terms << endl;
948-
949- // ADS: step size to be set based on the original number of
950- // observations
951- const double step_size = n_reads;
952-
953- if (VERBOSE) {
954- // OUTPUT THE ORIGINAL HISTOGRAM
955- cerr << " OBSERVED COUNTS (" << counts_hist.size () << " )" << endl;
956- for (size_t i = 0 ; i < counts_hist.size (); i++)
957- if (counts_hist[i] > 0 )
958- cerr << i << ' \t ' << static_cast <size_t >(counts_hist[i]) << endl;
959- cerr << endl;
960- }
961-
962- // check to make sure library is not overly saturated
963- const double two_fold_extrap = GoodToulmin2xExtrap (counts_hist);
964- if (two_fold_extrap < 0.0 )
965- throw runtime_error (" Saturation expected at double initial sample size."
966- " Unable to extrapolate" );
967-
968- // const size_t total_reads = get_counts_from_hist(counts_hist);
969-
970- // assert(total_reads == n_reads); // ADS: why commented out?
971-
972- // check that min required count is satisfied
973- if (orig_max_terms < min_required_counts)
974- throw runtime_error (min_required_counts_error_message);
975-
976- if (VERBOSE)
977- cerr << " [ESTIMATING YIELD CURVE]" << endl;
978- vector<double > yield_estimates;
979-
980- if (VERBOSE)
981- cerr << " [BOOTSTRAPPING HISTOGRAM]" << endl;
982-
983- const size_t max_iter = 100 *n_bootstraps;
984-
985- vector<vector <double > > bootstrap_estimates;
986- extrap_bootstrap (VERBOSE, allow_defects, seed, counts_hist, n_bootstraps,
987- orig_max_terms, diagonal, step_size, max_extrap,
988- max_iter, bootstrap_estimates);
989-
990- if (VERBOSE)
991- cerr << " [COMPUTING CONFIDENCE INTERVALS]" << endl;
992- // yield ci
993- vector<double > yield_upper_ci_lognorm, yield_lower_ci_lognorm;
994-
995- vector_median_and_ci (bootstrap_estimates, c_level, yield_estimates,
996- yield_lower_ci_lognorm, yield_upper_ci_lognorm);
997-
998- // ///////////////////////////////////////////////////////////////////
999- if (VERBOSE)
1000- cerr << " [WRITING OUTPUT]" << endl;
1001-
1002- write_predicted_complexity_curve (outfile, c_level, step_size,
1003- yield_estimates, yield_lower_ci_lognorm,
1004- yield_upper_ci_lognorm);
1005- }
1006- catch (runtime_error &e) {
1007- cerr << " ERROR:\t " << e.what () << endl;
1008- return EXIT_FAILURE;
1009- }
1010- catch (std::bad_alloc &ba) {
1011- cerr << " ERROR: could not allocate memory" << endl;
1012- return EXIT_FAILURE;
1013- }
1014- return EXIT_SUCCESS;
1015- }
1016-
1017-
1018778// /////////////////////////////////////////////////////////////////////
1019779// ////////////////////////////////////////////////////////////////////
1020780// /// GC_EXTRAP: predicting genomic coverage
0 commit comments