@@ -93,7 +93,7 @@ quantile_from_sorted_vector (const vector<T> sorted_data,
9393 if (lhs == n - 1 ) return sorted_data[lhs * stride];
9494
9595 return (1 - delta) * sorted_data[lhs * stride]
96- + delta * sorted_data[(lhs + 1 ) * stride];
96+ + delta * sorted_data[(lhs + 1 ) * stride];
9797}
9898
9999// Confidence interval stuff
@@ -781,6 +781,239 @@ lc_extrap(const int argc, const char **argv) {
781781 return EXIT_SUCCESS;
782782}
783783
784+ // //////////////////////////////////////////////////////////////////////
785+ // POP_SIZE
786+ static int
787+ pop_size (const int argc, const char **argv) {
788+
789+ try {
790+
791+ static const size_t min_required_counts = 4 ;
792+ static const string min_required_counts_error_message =
793+ " max count before zero is less than min required count (" +
794+ to_string (min_required_counts) + " ) duplicates removed" ;
795+
796+ string outfile;
797+
798+ size_t orig_max_terms = 100 ;
799+ double max_extrap = 1.0e10 ;
800+ size_t n_bootstraps = 100 ;
801+ int diagonal = 0 ;
802+ double c_level = 0.95 ;
803+ unsigned long int seed = 408 ;
804+
805+ /* FLAGS */
806+ bool VERBOSE = false ;
807+ bool VALS_INPUT = false ;
808+ bool PAIRED_END = false ;
809+ bool HIST_INPUT = false ;
810+ bool SINGLE_ESTIMATE = false ;
811+ bool allow_defects = false ;
812+
813+ #ifdef HAVE_HTSLIB
814+ bool BAM_FORMAT_INPUT = false ;
815+ size_t MAX_SEGMENT_LENGTH = 5000 ;
816+ #endif
817+
818+ const string description =
819+ " Needs writing" ;
820+
821+ /* ********* GET COMMAND LINE ARGUMENTS FOR LC EXTRAP ***********/
822+ OptionParser opt_parse (strip_path (argv[1 ]), description, " <input-file>" );
823+ opt_parse.add_opt (" output" , ' o' , " output file (default: stdout)" ,
824+ false , outfile);
825+ opt_parse.add_opt (" extrap" ,' e' ," maximum extrapolation" , false , max_extrap);
826+ opt_parse.add_opt (" boots" ,' n' ," number of bootstraps" , false , n_bootstraps);
827+ opt_parse.add_opt (" cval" , ' c' , " level for confidence intervals" , false , c_level);
828+ opt_parse.add_opt (" terms" ,' x' ," maximum terms in estimator" , false , orig_max_terms);
829+ opt_parse.add_opt (" verbose" , ' v' , " print more info" , false , VERBOSE);
830+ #ifdef HAVE_HTSLIB
831+ opt_parse.add_opt (" bam" , ' B' , " input is in BAM format" ,
832+ false , BAM_FORMAT_INPUT);
833+ opt_parse.add_opt (" seg_len" , ' l' , " maximum segment length when merging "
834+ " paired end bam reads" ,
835+ false , MAX_SEGMENT_LENGTH);
836+ #endif
837+ opt_parse.add_opt (" pe" , ' P' , " input is paired end read file" ,
838+ false , PAIRED_END);
839+ opt_parse.add_opt (" vals" , ' V' ,
840+ " input is a text file containing only the observed counts" ,
841+ false , VALS_INPUT);
842+ opt_parse.add_opt (" hist" , ' H' ,
843+ " input is a text file containing the observed histogram" ,
844+ false , HIST_INPUT);
845+ opt_parse.add_opt (" quick" , ' Q' ,
846+ " quick mode (no bootstraps) for confidence intervals" ,
847+ false , SINGLE_ESTIMATE);
848+ opt_parse.add_opt (" defects" , ' D' , " no testing for defects" , false , allow_defects);
849+ opt_parse.add_opt (" seed" , ' r' , " seed for random number generator" ,
850+ false , seed);
851+ opt_parse.set_show_defaults ();
852+ vector<string> leftover_args;
853+
854+ // ADS: suspect bug below; "-about" isn't working.
855+ opt_parse.parse (argc-1 , argv+1 , leftover_args);
856+ if (argc == 2 || opt_parse.help_requested ()) {
857+ cerr << opt_parse.help_message () << endl;
858+ return EXIT_SUCCESS;
859+ }
860+ if (opt_parse.about_requested ()) {
861+ cerr << opt_parse.about_message () << endl;
862+ return EXIT_SUCCESS;
863+ }
864+ if (opt_parse.option_missing ()) {
865+ cerr << opt_parse.option_missing_message () << endl;
866+ return EXIT_SUCCESS;
867+ }
868+ if (leftover_args.empty ()) {
869+ cerr << opt_parse.help_message () << endl;
870+ return EXIT_SUCCESS;
871+ }
872+ const string input_file_name = leftover_args.front ();
873+ /* *****************************************************************/
874+
875+ throw runtime_error (" NOT YET IMPLEMENTED" );
876+
877+ vector<double > counts_hist;
878+ size_t n_reads = 0 ;
879+
880+ /* *********** loading input ***************************************/
881+ if (HIST_INPUT) {
882+ if (VERBOSE)
883+ cerr << " HIST_INPUT" << endl;
884+ n_reads = load_histogram (input_file_name, counts_hist);
885+ }
886+ else if (VALS_INPUT) {
887+ if (VERBOSE)
888+ cerr << " VALS_INPUT" << endl;
889+ n_reads = load_counts (input_file_name, counts_hist);
890+ }
891+ #ifdef HAVE_HTSLIB
892+ else if (BAM_FORMAT_INPUT && PAIRED_END) {
893+ if (VERBOSE)
894+ cerr << " PAIRED_END_BAM_INPUT" << endl;
895+ const size_t MAX_READS_TO_HOLD = 5000000 ;
896+ size_t n_paired = 0 ;
897+ size_t n_mates = 0 ;
898+ n_reads = load_counts_BAM_pe (VERBOSE, input_file_name,
899+ MAX_SEGMENT_LENGTH,
900+ MAX_READS_TO_HOLD, n_paired,
901+ n_mates, counts_hist);
902+ if (VERBOSE) {
903+ cerr << " MERGED PAIRED END READS = " << n_paired << endl;
904+ cerr << " MATES PROCESSED = " << n_mates << endl;
905+ }
906+ }
907+ else if (BAM_FORMAT_INPUT) {
908+ if (VERBOSE)
909+ cerr << " BAM_INPUT" << endl;
910+ n_reads = load_counts_BAM_se (input_file_name, counts_hist);
911+ }
912+ #endif
913+ else if (PAIRED_END) {
914+ if (VERBOSE)
915+ cerr << " PAIRED_END_BED_INPUT" << endl;
916+ n_reads = load_counts_BED_pe (input_file_name, counts_hist);
917+ }
918+ else { // default is single end bed file
919+ if (VERBOSE)
920+ cerr << " BED_INPUT" << endl;
921+ n_reads = load_counts_BED_se (input_file_name, counts_hist);
922+ }
923+ /* *********** done loading input **********************************/
924+
925+ const size_t max_observed_count = counts_hist.size () - 1 ;
926+ const double distinct_reads =
927+ accumulate (begin (counts_hist), end (counts_hist), 0.0 );
928+
929+ // ENSURE THAT THE MAX TERMS ARE ACCEPTABLE
930+ size_t first_zero = 1 ;
931+ while (first_zero < counts_hist.size () && counts_hist[first_zero] > 0 )
932+ ++first_zero;
933+
934+ orig_max_terms = std::min (orig_max_terms, first_zero - 1 );
935+ orig_max_terms = orig_max_terms - (orig_max_terms % 2 == 1 );
936+
937+ const size_t distinct_counts =
938+ std::count_if (begin (counts_hist), end (counts_hist),
939+ [](const double x) {return x > 0.0 ;});
940+
941+ if (VERBOSE)
942+ cerr << " TOTAL READS = " << n_reads << endl
943+ << " DISTINCT READS = " << distinct_reads << endl
944+ << " DISTINCT COUNTS = " << distinct_counts << endl
945+ << " MAX COUNT = " << max_observed_count << endl
946+ << " COUNTS OF 1 = " << counts_hist[1 ] << endl
947+ << " MAX TERMS = " << orig_max_terms << endl;
948+
949+ // ADS: step size to be set based on the original number of
950+ // observations
951+ const double step_size = n_reads;
952+
953+ if (VERBOSE) {
954+ // OUTPUT THE ORIGINAL HISTOGRAM
955+ cerr << " OBSERVED COUNTS (" << counts_hist.size () << " )" << endl;
956+ for (size_t i = 0 ; i < counts_hist.size (); i++)
957+ if (counts_hist[i] > 0 )
958+ cerr << i << ' \t ' << static_cast <size_t >(counts_hist[i]) << endl;
959+ cerr << endl;
960+ }
961+
962+ // check to make sure library is not overly saturated
963+ const double two_fold_extrap = GoodToulmin2xExtrap (counts_hist);
964+ if (two_fold_extrap < 0.0 )
965+ throw runtime_error (" Saturation expected at double initial sample size."
966+ " Unable to extrapolate" );
967+
968+ // const size_t total_reads = get_counts_from_hist(counts_hist);
969+
970+ // assert(total_reads == n_reads); // ADS: why commented out?
971+
972+ // check that min required count is satisfied
973+ if (orig_max_terms < min_required_counts)
974+ throw runtime_error (min_required_counts_error_message);
975+
976+ if (VERBOSE)
977+ cerr << " [ESTIMATING YIELD CURVE]" << endl;
978+ vector<double > yield_estimates;
979+
980+ if (VERBOSE)
981+ cerr << " [BOOTSTRAPPING HISTOGRAM]" << endl;
982+
983+ const size_t max_iter = 100 *n_bootstraps;
984+
985+ vector<vector <double > > bootstrap_estimates;
986+ extrap_bootstrap (VERBOSE, allow_defects, seed, counts_hist, n_bootstraps,
987+ orig_max_terms, diagonal, step_size, max_extrap,
988+ max_iter, bootstrap_estimates);
989+
990+ if (VERBOSE)
991+ cerr << " [COMPUTING CONFIDENCE INTERVALS]" << endl;
992+ // yield ci
993+ vector<double > yield_upper_ci_lognorm, yield_lower_ci_lognorm;
994+
995+ vector_median_and_ci (bootstrap_estimates, c_level, yield_estimates,
996+ yield_lower_ci_lognorm, yield_upper_ci_lognorm);
997+
998+ // ///////////////////////////////////////////////////////////////////
999+ if (VERBOSE)
1000+ cerr << " [WRITING OUTPUT]" << endl;
1001+
1002+ write_predicted_complexity_curve (outfile, c_level, step_size,
1003+ yield_estimates, yield_lower_ci_lognorm,
1004+ yield_upper_ci_lognorm);
1005+ }
1006+ catch (runtime_error &e) {
1007+ cerr << " ERROR:\t " << e.what () << endl;
1008+ return EXIT_FAILURE;
1009+ }
1010+ catch (std::bad_alloc &ba) {
1011+ cerr << " ERROR: could not allocate memory" << endl;
1012+ return EXIT_FAILURE;
1013+ }
1014+ return EXIT_SUCCESS;
1015+ }
1016+
7841017
7851018// /////////////////////////////////////////////////////////////////////
7861019// ////////////////////////////////////////////////////////////////////
0 commit comments