Skip to content

Commit 3f09c9d

Browse files
committed
major code changes: removed gsl in preseq.cpp except for bound_pop. updated configure.ac to enable gsl for bound_pop.
1 parent cf751f7 commit 3f09c9d

File tree

2 files changed

+151
-55
lines changed

2 files changed

+151
-55
lines changed

configure.ac

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
1414
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
1515
dnl General Public License for more details.
1616

17-
AC_INIT([preseq], [3.0], [[email protected]],
17+
AC_INIT([preseq], [3.0.1], [[email protected]],
1818
[preseq], [https://github.com/smithlabcode/preseq])
1919
dnl the config.h is not currently #included in the source, and only
2020
dnl used to keep command lines short.
@@ -50,9 +50,22 @@ AS_IF([test "x$enable_hts" = "xyes"],
5050
)
5151
AM_CONDITIONAL([ENABLE_HTS], [test "x$enable_hts" = "xyes"])
5252

53-
dnl check for required libraries
54-
AC_SEARCH_LIBS([cblas_dgemm], [gslcblas])
55-
AC_SEARCH_LIBS([gsl_blas_dgemm], [gsl])
53+
dnl check for GSL if requested
54+
gsl_fail_msg="
55+
56+
Failed to locate GSL on your system.
57+
"
58+
59+
AC_ARG_ENABLE([gsl],
60+
[AS_HELP_STRING([--enable-gsl], [Enable GSLLib @<:@yes@:>@])],
61+
[enable_gsl=yes], [enable_gsl=no])
62+
AS_IF([test "x$enable_gsl" = "xyes"],
63+
AC_CHECK_LIB([m],[cos])
64+
AC_CHECK_LIB([gslcblas],[cblas_dgemm])
65+
[AC_CHECK_LIB([gsl], [gsl_blas_dgemm], [],
66+
[AC_MSG_FAILURE([$gsl_fail_msg])])]
67+
)
68+
AM_CONDITIONAL([ENABLE_GSL], [test "x$enable_gsl" = "xyes"])
5669

5770
AC_CONFIG_FILES([Makefile])
5871
AC_OUTPUT

src/preseq.cpp

Lines changed: 134 additions & 51 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,7 @@
3434
#include <fstream>
3535
#include <iostream>
3636
#include <sstream>
37-
38-
#include <gsl/gsl_cdf.h>
39-
#include <gsl/gsl_randist.h>
40-
#include <gsl/gsl_statistics_double.h>
41-
#include <gsl/gsl_sf_gamma.h>
37+
#include <random>
4238

4339
#include <OptionParser.hpp>
4440
#include <smithlab_utils.hpp>
@@ -65,7 +61,7 @@ using std::unordered_map;
6561
using std::runtime_error;
6662
using std::to_string;
6763

68-
static const string preseq_version = "3.0.0";
64+
static const string preseq_version = "3.0.1";
6965

7066
template <typename T> T
7167
get_counts_from_hist(const vector<T> &h) {
@@ -75,6 +71,40 @@ get_counts_from_hist(const vector<T> &h) {
7571
return c;
7672
}
7773

74+
template<class T>
75+
T
76+
median_from_sorted_vector (const vector<T> sorted_data,
77+
const size_t stride, const size_t n) {
78+
79+
if (n == 0 || sorted_data.empty()) return 0;
80+
81+
const size_t lhs = (n - 1) / 2;
82+
const size_t rhs = n / 2;
83+
84+
if (lhs == rhs) return sorted_data[lhs * stride];
85+
86+
else return (sorted_data[lhs * stride] + sorted_data[rhs * stride])/2.0;
87+
}
88+
89+
template<class T>
90+
T
91+
quantile_from_sorted_vector (const vector<T> sorted_data,
92+
const size_t stride, const size_t n,
93+
const double f) {
94+
const double index = f * (n - 1);
95+
const size_t lhs = (int)index;
96+
const double delta = index - lhs;
97+
double result;
98+
99+
if (n == 0 || sorted_data.empty()) return 0.0;
100+
101+
if (lhs == n - 1) return sorted_data[lhs * stride];
102+
103+
else return result = (1 - delta) * sorted_data[lhs * stride]
104+
+ delta * sorted_data[(lhs + 1) * stride];
105+
}
106+
107+
78108
// Confidence interval stuff
79109
static void
80110
median_and_ci(vector<double> estimates, // by val so we can sort them
@@ -86,14 +116,11 @@ median_and_ci(vector<double> estimates, // by val so we can sort them
86116

87117
const double alpha = 1.0 - ci_level;
88118
const size_t N = estimates.size();
89-
median_estimate =
90-
gsl_stats_median_from_sorted_data(&estimates[0], 1, N);
91-
lower_ci_estimate =
92-
gsl_stats_quantile_from_sorted_data(&estimates[0], 1, N, alpha/2);
93-
upper_ci_estimate =
94-
gsl_stats_quantile_from_sorted_data(&estimates[0], 1, N, 1.0 - alpha/2);
95-
}
96119

120+
median_estimate = median_from_sorted_vector(estimates, 1, N);
121+
lower_ci_estimate = quantile_from_sorted_vector(estimates, 1, N, alpha/2);
122+
upper_ci_estimate = quantile_from_sorted_vector(estimates, 1, N, 1.0 - alpha/2);
123+
}
97124

98125
static void
99126
vector_median_and_ci(const vector<vector<double> > &bootstrap_estimates,
@@ -126,6 +153,67 @@ vector_median_and_ci(const vector<vector<double> > &bootstrap_estimates,
126153
}
127154
}
128155

156+
template <typename uint_type>
157+
void
158+
multinomial(std::mt19937 &gen, const vector<double> &mult_probs,
159+
uint_type trials, vector<uint_type> &result) {
160+
161+
typedef std::binomial_distribution<unsigned int> binom_dist;
162+
163+
result.clear();
164+
result.resize(mult_probs.size());
165+
166+
double remaining_prob = accumulate(begin(mult_probs), end(mult_probs), 0.0);
167+
168+
auto r(begin(result));
169+
auto p(begin(mult_probs));
170+
171+
while (p != end(mult_probs)) { // iterate to sample for each category
172+
173+
*r = binom_dist(trials, (*p)/remaining_prob)(gen); // take the sample
174+
175+
remaining_prob -= *p++; // update remaining probability mass
176+
trials -= *r++; // update remaining trials needed
177+
}
178+
179+
if (trials > 0)
180+
throw runtime_error("multinomial sampling failed");
181+
}
182+
183+
// Lanczos approximation for gamma function for x >= 0.5 - essentially an
184+
// approximation for (x-1)!
185+
static double
186+
factorial (double x) {
187+
188+
// constants
189+
double LogRootTwoPi = 0.9189385332046727;
190+
double Euler = 2.71828182845904523536028747135;
191+
192+
// Approximation for factorial is actually x-1
193+
x -= 1.0;
194+
195+
vector<double> lanczos {
196+
0.99999999999980993227684700473478,
197+
676.520368121885098567009190444019,
198+
-1259.13921672240287047156078755283,
199+
771.3234287776530788486528258894,
200+
-176.61502916214059906584551354,
201+
12.507343278686904814458936853,
202+
-0.13857109526572011689554707,
203+
9.984369578019570859563e-6,
204+
1.50563273514931155834e-7
205+
};
206+
207+
double Ag = lanczos[0];
208+
209+
for (size_t k=1; k < lanczos.size(); k++)
210+
Ag += lanczos[k] / (x + k);
211+
212+
double term1 = (x + 0.5) * log((x + 7.5) / Euler);
213+
double term2 = LogRootTwoPi + log(Ag);
214+
215+
return term1 + (term2 - 7.0);
216+
}
129217

130218
////////////////////////////////////////////////////////////////////////
131219
///// EXTRAP MODE BELOW HERE
@@ -136,19 +224,18 @@ vector_median_and_ci(const vector<vector<double> > &bootstrap_estimates,
136224
// distinct_counts_hist[k] = vals_hist[vals_hist_distinct_counts[k]]
137225
// stores the kth positive value of vals_hist
138226
void
139-
resample_hist(const gsl_rng *rng, const vector<size_t> &vals_hist_distinct_counts,
227+
resample_hist(std::mt19937 &gen, const vector<size_t> &vals_hist_distinct_counts,
140228
const vector<double> &distinct_counts_hist,
141229
vector<double> &out_hist) {
142230

143231
const size_t hist_size = distinct_counts_hist.size();
144232
vector<unsigned int> sample_distinct_counts_hist(hist_size, 0);
145233

146-
const unsigned int distinct =
234+
unsigned int distinct =
147235
accumulate(begin(distinct_counts_hist), end(distinct_counts_hist), 0.0);
148-
149-
gsl_ran_multinomial(rng, hist_size, distinct,
150-
&distinct_counts_hist.front(),
151-
&sample_distinct_counts_hist.front());
236+
237+
multinomial(gen, distinct_counts_hist, distinct,
238+
sample_distinct_counts_hist);
152239

153240
out_hist.clear();
154241
out_hist.resize(vals_hist_distinct_counts.back() + 1, 0.0);
@@ -164,17 +251,19 @@ resample_hist(const gsl_rng *rng, const vector<size_t> &vals_hist_distinct_count
164251
static double
165252
interpolate_distinct(const vector<double> &hist, const size_t N,
166253
const size_t S, const size_t n) {
167-
const double denom =
168-
gsl_sf_lngamma(N + 1) - gsl_sf_lngamma(n + 1) - gsl_sf_lngamma(N - n + 1);
254+
255+
const double denom = factorial(N + 1) - factorial(n + 1) - factorial(N - n + 1);
256+
169257
vector<double> numer(hist.size(), 0);
170258
for (size_t i = 1; i < hist.size(); i++) {
171259
// N - i -n + 1 should be greater than 0
172260
if (N < i + n) {
173261
numer[i] = 0;
174262
}
175263
else {
176-
const double x = (gsl_sf_lngamma(N - i + 1) - gsl_sf_lngamma(n + 1) -
177-
gsl_sf_lngamma(N - i - n + 1));
264+
265+
const double x = (factorial(N - i + 1) - factorial(n + 1) -
266+
factorial(N - i - n + 1));
178267
numer[i] = exp(x - denom)*hist[i];
179268
}
180269
}
@@ -213,9 +302,7 @@ extrap_bootstrap(const bool VERBOSE, const bool allow_defects,
213302

214303
//setup rng
215304
srand(time(0) + getpid());
216-
gsl_rng_env_setup();
217-
gsl_rng *rng = gsl_rng_alloc(gsl_rng_default);
218-
gsl_rng_set(rng, seed);
305+
std::mt19937 rng(seed);
219306

220307
// const double vals_sum = get_counts_from_hist(orig_hist);
221308
const double initial_distinct =
@@ -461,7 +548,7 @@ lc_extrap(const int argc, const char **argv) {
461548
size_t n_bootstraps = 100;
462549
int diagonal = 0;
463550
double c_level = 0.95;
464-
unsigned long int seed = 0;
551+
unsigned long int seed = 408;
465552

466553
/* FLAGS */
467554
bool VERBOSE = false;
@@ -538,11 +625,6 @@ lc_extrap(const int argc, const char **argv) {
538625
const string input_file_name = leftover_args.front();
539626
/******************************************************************/
540627

541-
// if seed is not set, make it random
542-
if (seed == 0) {
543-
seed = rand();
544-
}
545-
546628
vector<double> counts_hist;
547629
size_t n_reads = 0;
548630

@@ -726,7 +808,7 @@ gc_extrap(const int argc, const char **argv) {
726808
bool SINGLE_ESTIMATE = false;
727809
double max_extrap = 1.0e12;
728810
size_t n_bootstraps = 100;
729-
unsigned long int seed = 0;
811+
unsigned long int seed = 408;
730812
bool allow_defects = false;
731813

732814
bool NO_SEQUENCE = false;
@@ -952,7 +1034,7 @@ c_curve(const int argc, const char **argv) {
9521034
bool PAIRED_END = false;
9531035
bool HIST_INPUT = false;
9541036
bool VALS_INPUT = false;
955-
unsigned long int seed = 0;
1037+
unsigned long int seed = 408;
9561038

9571039
string outfile;
9581040

@@ -1016,14 +1098,9 @@ c_curve(const int argc, const char **argv) {
10161098
const string input_file_name = leftover_args.front();
10171099
/******************************************************************/
10181100

1019-
if (seed == 0) {
1020-
seed = rand();
1021-
}
10221101
// Setup the random number generator
1023-
gsl_rng_env_setup();
1024-
gsl_rng *rng = gsl_rng_alloc(gsl_rng_default); // use default type
10251102
srand(time(0) + getpid()); //give the random fxn a new seed
1026-
gsl_rng_set(rng, seed); //initialize random number generator with the seed
1103+
std::mt19937 rng(seed);
10271104

10281105
vector<double> counts_hist;
10291106
size_t n_reads = 0;
@@ -1136,7 +1213,12 @@ c_curve(const int argc, const char **argv) {
11361213
static int
11371214
bound_pop(const int argc, const char **argv) {
11381215

1139-
try{
1216+
try {
1217+
1218+
#ifndef HAVE_GSL
1219+
cerr << "GSL has not been installed for the bound_pop module. Please recompile with GSL support." << endl;
1220+
return EXIT_SUCCESS;
1221+
#endif
11401222

11411223
bool VERBOSE = false;
11421224
bool PAIRED_END = false;
@@ -1156,7 +1238,7 @@ bound_pop(const int argc, const char **argv) {
11561238
size_t n_bootstraps = 500;
11571239
double c_level = 0.95;
11581240
size_t max_iter = 100;
1159-
unsigned long int seed = 0;
1241+
unsigned long int seed = 408;
11601242

11611243
const string description =
11621244
"Estimate the size of the underlying population based on counts \
@@ -1270,7 +1352,8 @@ bound_pop(const int argc, const char **argv) {
12701352
// mu_r = (r + 1)! n_{r+1} / n_1
12711353
size_t idx = 1;
12721354
while (counts_hist[idx] > 0 && idx <= counts_hist.size()) {
1273-
measure_moments.push_back(exp(gsl_sf_lnfact(idx) +
1355+
// idx + 1 because function calculates (x-1)!
1356+
measure_moments.push_back(exp(factorial(idx + 1) +
12741357
log(counts_hist[idx]) -
12751358
log(counts_hist[1])));
12761359
if (!std::isfinite(measure_moments.back())) {
@@ -1303,7 +1386,9 @@ bound_pop(const int argc, const char **argv) {
13031386
else
13041387
measure_moments.resize(2*max_num_points);
13051388
size_t n_points = 0;
1389+
#ifdef HAVE_GSL
13061390
n_points = ensure_pos_def_mom_seq(measure_moments, tolerance, VERBOSE);
1391+
#endif
13071392
if (VERBOSE)
13081393
cerr << "n_points = " << n_points << endl;
13091394

@@ -1376,13 +1461,8 @@ bound_pop(const int argc, const char **argv) {
13761461
vector<double> quad_estimates;
13771462

13781463
//setup rng
1379-
if (seed == 0) {
1380-
seed = rand();
1381-
}
13821464
srand(time(0) + getpid());
1383-
gsl_rng_env_setup();
1384-
gsl_rng *rng = gsl_rng_alloc(gsl_rng_default);
1385-
gsl_rng_set(rng, seed);
1465+
std::mt19937 rng(seed);
13861466

13871467
// hist may be sparse, to speed up bootstrapping
13881468
// sample only from positive entries
@@ -1407,13 +1487,16 @@ bound_pop(const int argc, const char **argv) {
14071487
// initialize moments, 0th moment is 1
14081488
vector<double> bootstrap_moments(1, 1.0);
14091489
// moments[r] = (r + 1)! n_{r+1} / n_1
1410-
for (size_t i = 0; i < 2*max_num_points; i++)
1411-
bootstrap_moments.push_back(exp(gsl_sf_lnfact(i + 2) +
1490+
for (size_t i = 0; i < 2*max_num_points; i++) {
1491+
bootstrap_moments.push_back(exp(factorial(i + 3) +
14121492
log(sample_hist[i + 2]) -
14131493
log(sample_hist[1])) );
1494+
}
14141495

14151496
size_t n_points = 0;
1497+
#ifdef HAVE_GSL
14161498
n_points = ensure_pos_def_mom_seq(bootstrap_moments, tolerance, VERBOSE);
1499+
#endif
14171500
n_points = std::min(n_points, max_num_points);
14181501
if (VERBOSE)
14191502
cerr << "n_points = " << n_points << endl;

0 commit comments

Comments
 (0)