3434#include < fstream>
3535#include < iostream>
3636#include < sstream>
37-
38- #include < gsl/gsl_cdf.h>
39- #include < gsl/gsl_randist.h>
40- #include < gsl/gsl_statistics_double.h>
41- #include < gsl/gsl_sf_gamma.h>
37+ #include < random>
4238
4339#include < OptionParser.hpp>
4440#include < smithlab_utils.hpp>
@@ -65,7 +61,7 @@ using std::unordered_map;
6561using std::runtime_error;
6662using std::to_string;
6763
68- static const string preseq_version = " 3.0.0 " ;
64+ static const string preseq_version = " 3.0.1 " ;
6965
7066template <typename T> T
7167get_counts_from_hist (const vector<T> &h) {
@@ -75,6 +71,40 @@ get_counts_from_hist(const vector<T> &h) {
7571 return c;
7672}
7773
74+ template <class T >
75+ T
76+ median_from_sorted_vector (const vector<T> sorted_data,
77+ const size_t stride, const size_t n) {
78+
79+ if (n == 0 || sorted_data.empty ()) return 0 ;
80+
81+ const size_t lhs = (n - 1 ) / 2 ;
82+ const size_t rhs = n / 2 ;
83+
84+ if (lhs == rhs) return sorted_data[lhs * stride];
85+
86+ else return (sorted_data[lhs * stride] + sorted_data[rhs * stride])/2.0 ;
87+ }
88+
89+ template <class T >
90+ T
91+ quantile_from_sorted_vector (const vector<T> sorted_data,
92+ const size_t stride, const size_t n,
93+ const double f) {
94+ const double index = f * (n - 1 );
95+ const size_t lhs = (int )index;
96+ const double delta = index - lhs;
97+ double result;
98+
99+ if (n == 0 || sorted_data.empty ()) return 0.0 ;
100+
101+ if (lhs == n - 1 ) return sorted_data[lhs * stride];
102+
103+ else return result = (1 - delta) * sorted_data[lhs * stride]
104+ + delta * sorted_data[(lhs + 1 ) * stride];
105+ }
106+
107+
78108// Confidence interval stuff
79109static void
80110median_and_ci (vector<double > estimates, // by val so we can sort them
@@ -86,14 +116,11 @@ median_and_ci(vector<double> estimates, // by val so we can sort them
86116
87117 const double alpha = 1.0 - ci_level;
88118 const size_t N = estimates.size ();
89- median_estimate =
90- gsl_stats_median_from_sorted_data (&estimates[0 ], 1 , N);
91- lower_ci_estimate =
92- gsl_stats_quantile_from_sorted_data (&estimates[0 ], 1 , N, alpha/2 );
93- upper_ci_estimate =
94- gsl_stats_quantile_from_sorted_data (&estimates[0 ], 1 , N, 1.0 - alpha/2 );
95- }
96119
120+ median_estimate = median_from_sorted_vector (estimates, 1 , N);
121+ lower_ci_estimate = quantile_from_sorted_vector (estimates, 1 , N, alpha/2 );
122+ upper_ci_estimate = quantile_from_sorted_vector (estimates, 1 , N, 1.0 - alpha/2 );
123+ }
97124
98125static void
99126vector_median_and_ci (const vector<vector<double > > &bootstrap_estimates,
@@ -126,6 +153,67 @@ vector_median_and_ci(const vector<vector<double> > &bootstrap_estimates,
126153 }
127154}
128155
156+ template <typename uint_type>
157+ void
158+ multinomial (std::mt19937 &gen, const vector<double > &mult_probs,
159+ uint_type trials, vector<uint_type> &result) {
160+
161+ typedef std::binomial_distribution<unsigned int > binom_dist;
162+
163+ result.clear ();
164+ result.resize (mult_probs.size ());
165+
166+ double remaining_prob = accumulate (begin (mult_probs), end (mult_probs), 0.0 );
167+
168+ auto r (begin (result));
169+ auto p (begin (mult_probs));
170+
171+ while (p != end (mult_probs)) { // iterate to sample for each category
172+
173+ *r = binom_dist (trials, (*p)/remaining_prob)(gen); // take the sample
174+
175+ remaining_prob -= *p++; // update remaining probability mass
176+ trials -= *r++; // update remaining trials needed
177+ }
178+
179+ if (trials > 0 )
180+ throw runtime_error (" multinomial sampling failed" );
181+ }
182+
183+ // Lanczos approximation for gamma function for x >= 0.5 - essentially an
184+ // approximation for (x-1)!
185+ static double
186+ factorial (double x) {
187+
188+ // constants
189+ double LogRootTwoPi = 0.9189385332046727 ;
190+ double Euler = 2.71828182845904523536028747135 ;
191+
192+ // Approximation for factorial is actually x-1
193+ x -= 1.0 ;
194+
195+ vector<double > lanczos {
196+ 0.99999999999980993227684700473478 ,
197+ 676.520368121885098567009190444019 ,
198+ -1259.13921672240287047156078755283 ,
199+ 771.3234287776530788486528258894 ,
200+ -176.61502916214059906584551354 ,
201+ 12.507343278686904814458936853 ,
202+ -0.13857109526572011689554707 ,
203+ 9.984369578019570859563e-6 ,
204+ 1.50563273514931155834e-7
205+ };
206+
207+ double Ag = lanczos[0 ];
208+
209+ for (size_t k=1 ; k < lanczos.size (); k++)
210+ Ag += lanczos[k] / (x + k);
211+
212+ double term1 = (x + 0.5 ) * log ((x + 7.5 ) / Euler);
213+ double term2 = LogRootTwoPi + log (Ag);
214+
215+ return term1 + (term2 - 7.0 );
216+ }
129217
130218// //////////////////////////////////////////////////////////////////////
131219// /// EXTRAP MODE BELOW HERE
@@ -136,19 +224,18 @@ vector_median_and_ci(const vector<vector<double> > &bootstrap_estimates,
136224// distinct_counts_hist[k] = vals_hist[vals_hist_distinct_counts[k]]
137225// stores the kth positive value of vals_hist
138226void
139- resample_hist (const gsl_rng *rng , const vector<size_t > &vals_hist_distinct_counts,
227+ resample_hist (std::mt19937 &gen , const vector<size_t > &vals_hist_distinct_counts,
140228 const vector<double > &distinct_counts_hist,
141229 vector<double > &out_hist) {
142230
143231 const size_t hist_size = distinct_counts_hist.size ();
144232 vector<unsigned int > sample_distinct_counts_hist (hist_size, 0 );
145233
146- const unsigned int distinct =
234+ unsigned int distinct =
147235 accumulate (begin (distinct_counts_hist), end (distinct_counts_hist), 0.0 );
148-
149- gsl_ran_multinomial (rng, hist_size, distinct,
150- &distinct_counts_hist.front (),
151- &sample_distinct_counts_hist.front ());
236+
237+ multinomial (gen, distinct_counts_hist, distinct,
238+ sample_distinct_counts_hist);
152239
153240 out_hist.clear ();
154241 out_hist.resize (vals_hist_distinct_counts.back () + 1 , 0.0 );
@@ -164,17 +251,19 @@ resample_hist(const gsl_rng *rng, const vector<size_t> &vals_hist_distinct_count
164251static double
165252interpolate_distinct (const vector<double > &hist, const size_t N,
166253 const size_t S, const size_t n) {
167- const double denom =
168- gsl_sf_lngamma (N + 1 ) - gsl_sf_lngamma (n + 1 ) - gsl_sf_lngamma (N - n + 1 );
254+
255+ const double denom = factorial (N + 1 ) - factorial (n + 1 ) - factorial (N - n + 1 );
256+
169257 vector<double > numer (hist.size (), 0 );
170258 for (size_t i = 1 ; i < hist.size (); i++) {
171259 // N - i -n + 1 should be greater than 0
172260 if (N < i + n) {
173261 numer[i] = 0 ;
174262 }
175263 else {
176- const double x = (gsl_sf_lngamma (N - i + 1 ) - gsl_sf_lngamma (n + 1 ) -
177- gsl_sf_lngamma (N - i - n + 1 ));
264+
265+ const double x = (factorial (N - i + 1 ) - factorial (n + 1 ) -
266+ factorial (N - i - n + 1 ));
178267 numer[i] = exp (x - denom)*hist[i];
179268 }
180269 }
@@ -213,9 +302,7 @@ extrap_bootstrap(const bool VERBOSE, const bool allow_defects,
213302
214303 // setup rng
215304 srand (time (0 ) + getpid ());
216- gsl_rng_env_setup ();
217- gsl_rng *rng = gsl_rng_alloc (gsl_rng_default);
218- gsl_rng_set (rng, seed);
305+ std::mt19937 rng (seed);
219306
220307 // const double vals_sum = get_counts_from_hist(orig_hist);
221308 const double initial_distinct =
@@ -461,7 +548,7 @@ lc_extrap(const int argc, const char **argv) {
461548 size_t n_bootstraps = 100 ;
462549 int diagonal = 0 ;
463550 double c_level = 0.95 ;
464- unsigned long int seed = 0 ;
551+ unsigned long int seed = 408 ;
465552
466553 /* FLAGS */
467554 bool VERBOSE = false ;
@@ -538,11 +625,6 @@ lc_extrap(const int argc, const char **argv) {
538625 const string input_file_name = leftover_args.front ();
539626 /* *****************************************************************/
540627
541- // if seed is not set, make it random
542- if (seed == 0 ) {
543- seed = rand ();
544- }
545-
546628 vector<double > counts_hist;
547629 size_t n_reads = 0 ;
548630
@@ -726,7 +808,7 @@ gc_extrap(const int argc, const char **argv) {
726808 bool SINGLE_ESTIMATE = false ;
727809 double max_extrap = 1.0e12 ;
728810 size_t n_bootstraps = 100 ;
729- unsigned long int seed = 0 ;
811+ unsigned long int seed = 408 ;
730812 bool allow_defects = false ;
731813
732814 bool NO_SEQUENCE = false ;
@@ -952,7 +1034,7 @@ c_curve(const int argc, const char **argv) {
9521034 bool PAIRED_END = false ;
9531035 bool HIST_INPUT = false ;
9541036 bool VALS_INPUT = false ;
955- unsigned long int seed = 0 ;
1037+ unsigned long int seed = 408 ;
9561038
9571039 string outfile;
9581040
@@ -1016,14 +1098,9 @@ c_curve(const int argc, const char **argv) {
10161098 const string input_file_name = leftover_args.front ();
10171099 /* *****************************************************************/
10181100
1019- if (seed == 0 ) {
1020- seed = rand ();
1021- }
10221101 // Setup the random number generator
1023- gsl_rng_env_setup ();
1024- gsl_rng *rng = gsl_rng_alloc (gsl_rng_default); // use default type
10251102 srand (time (0 ) + getpid ()); // give the random fxn a new seed
1026- gsl_rng_set (rng, seed); // initialize random number generator with the seed
1103+ std::mt19937 rng ( seed);
10271104
10281105 vector<double > counts_hist;
10291106 size_t n_reads = 0 ;
@@ -1136,7 +1213,12 @@ c_curve(const int argc, const char **argv) {
11361213static int
11371214bound_pop (const int argc, const char **argv) {
11381215
1139- try {
1216+ try {
1217+
1218+ #ifndef HAVE_GSL
1219+ cerr << " GSL has not been installed for the bound_pop module. Please recompile with GSL support." << endl;
1220+ return EXIT_SUCCESS;
1221+ #endif
11401222
11411223 bool VERBOSE = false ;
11421224 bool PAIRED_END = false ;
@@ -1156,7 +1238,7 @@ bound_pop(const int argc, const char **argv) {
11561238 size_t n_bootstraps = 500 ;
11571239 double c_level = 0.95 ;
11581240 size_t max_iter = 100 ;
1159- unsigned long int seed = 0 ;
1241+ unsigned long int seed = 408 ;
11601242
11611243 const string description =
11621244 " Estimate the size of the underlying population based on counts \
@@ -1270,7 +1352,8 @@ bound_pop(const int argc, const char **argv) {
12701352 // mu_r = (r + 1)! n_{r+1} / n_1
12711353 size_t idx = 1 ;
12721354 while (counts_hist[idx] > 0 && idx <= counts_hist.size ()) {
1273- measure_moments.push_back (exp (gsl_sf_lnfact (idx) +
1355+ // idx + 1 because function calculates (x-1)!
1356+ measure_moments.push_back (exp (factorial (idx + 1 ) +
12741357 log (counts_hist[idx]) -
12751358 log (counts_hist[1 ])));
12761359 if (!std::isfinite (measure_moments.back ())) {
@@ -1303,7 +1386,9 @@ bound_pop(const int argc, const char **argv) {
13031386 else
13041387 measure_moments.resize (2 *max_num_points);
13051388 size_t n_points = 0 ;
1389+ #ifdef HAVE_GSL
13061390 n_points = ensure_pos_def_mom_seq (measure_moments, tolerance, VERBOSE);
1391+ #endif
13071392 if (VERBOSE)
13081393 cerr << " n_points = " << n_points << endl;
13091394
@@ -1376,13 +1461,8 @@ bound_pop(const int argc, const char **argv) {
13761461 vector<double > quad_estimates;
13771462
13781463 // setup rng
1379- if (seed == 0 ) {
1380- seed = rand ();
1381- }
13821464 srand (time (0 ) + getpid ());
1383- gsl_rng_env_setup ();
1384- gsl_rng *rng = gsl_rng_alloc (gsl_rng_default);
1385- gsl_rng_set (rng, seed);
1465+ std::mt19937 rng (seed);
13861466
13871467 // hist may be sparse, to speed up bootstrapping
13881468 // sample only from positive entries
@@ -1407,13 +1487,16 @@ bound_pop(const int argc, const char **argv) {
14071487 // initialize moments, 0th moment is 1
14081488 vector<double > bootstrap_moments (1 , 1.0 );
14091489 // moments[r] = (r + 1)! n_{r+1} / n_1
1410- for (size_t i = 0 ; i < 2 *max_num_points; i++)
1411- bootstrap_moments.push_back (exp (gsl_sf_lnfact (i + 2 ) +
1490+ for (size_t i = 0 ; i < 2 *max_num_points; i++) {
1491+ bootstrap_moments.push_back (exp (factorial (i + 3 ) +
14121492 log (sample_hist[i + 2 ]) -
14131493 log (sample_hist[1 ])) );
1494+ }
14141495
14151496 size_t n_points = 0 ;
1497+ #ifdef HAVE_GSL
14161498 n_points = ensure_pos_def_mom_seq (bootstrap_moments, tolerance, VERBOSE);
1499+ #endif
14171500 n_points = std::min (n_points, max_num_points);
14181501 if (VERBOSE)
14191502 cerr << " n_points = " << n_points << endl;
0 commit comments