Skip to content

Commit 6169997

Browse files
mlml: adding checks for valid input files where some were lacking, also formatted some code and using c++17 filesystem for filesize
1 parent e3a1147 commit 6169997

File tree

1 file changed

+47
-55
lines changed

1 file changed

+47
-55
lines changed

src/mlml/mlml.cpp

Lines changed: 47 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include <cmath>
2525
#include <sys/stat.h>
2626
#include <unistd.h>
27+
#include <filesystem>
2728

2829
#include "OptionParser.hpp"
2930
#include "smithlab_os.hpp"
@@ -58,8 +59,8 @@ wilson_ci_for_binomial(const double alpha, const double n,
5859
static int
5960
binom_null(const double alpha, const double n,
6061
const double p_hat, const double p) {
61-
// ADS: function tests if final argument p is outside the (1 -
62-
// alpha) CI for p_hat given counts n.
62+
// ADS: function tests if final argument p is outside the (1-alpha)
63+
// CI for p_hat given counts n.
6364
double lower = 0.0;
6465
double upper = 0.0;
6566
wilson_ci_for_binomial(alpha, n, p_hat, lower, upper);
@@ -140,6 +141,7 @@ static void
140141
expectation(const size_t a, const size_t x,
141142
const double p, const double q,
142143
vector<vector<double>> &coeff) {
144+
143145
assert(p > 0.0 && q > 0.0);
144146
assert(p + q <= 1.0);
145147

@@ -156,8 +158,7 @@ expectation(const size_t a, const size_t x,
156158
coeff = vector<vector<double>>(x + 1, vector<double>(a + 1, 0.0));
157159

158160
for (size_t k = 0; k <= x; ++k) {
159-
const double x_c_k =
160-
lnchoose(x, k) + log_p*k + log_1mpq*(x - k) - log_1mq*x;
161+
const double x_c_k = lnchoose(x, k) + log_p*k + log_1mpq*(x-k) - log_1mq*x;
161162
for (size_t j = 0; j <= a; ++j)
162163
coeff[k][j] = exp(a_c_j[j] + x_c_k);
163164
}
@@ -201,8 +202,6 @@ expectation_maximization(const bool DEBUG,
201202
const double tolerance,
202203
double &p, double &q) {
203204
constexpr auto max_iterations = 500;
204-
size_t iter = 0;
205-
double delta = std::numeric_limits<double>::max();
206205

207206
if (DEBUG) {
208207
cerr << "t:" << a << ", u:" << b
@@ -211,6 +210,8 @@ expectation_maximization(const bool DEBUG,
211210
<< "p:" << p << ", q:" << q << endl;
212211
}
213212

213+
uint32_t iter = 0u;
214+
double delta = std::numeric_limits<double>::max();
214215
do {
215216
vector<vector<double>> coeff;
216217

@@ -226,9 +227,7 @@ expectation_maximization(const bool DEBUG,
226227
q = max(tolerance, min(q, 1-tolerance-p));
227228
delta = max(fabs(p_old - p), fabs(q_old - q));
228229

229-
++iter;
230-
231-
} while (delta > tolerance && iter <= max_iterations);
230+
} while (delta > tolerance && iter++ < max_iterations);
232231

233232
if (DEBUG) {
234233
cerr << iter << '\t'
@@ -242,39 +241,33 @@ expectation_maximization(const bool DEBUG,
242241
/// only 2 input files ///
243242
//////////////////////////
244243
static double
245-
log_L(const size_t x, const size_t y,
246-
const size_t z, const size_t w,
244+
log_L(const size_t x, const size_t y, const size_t z, const size_t w,
247245
const double p, const double q) {
248-
assert(p+q <= 1);
249-
double log_lkhd = lnchoose(x+y, x) + lnchoose(z+w, z);
250-
if (p > 0) log_lkhd += x*log(p);
251-
if (p < 1) log_lkhd += y*log(1-p);
246+
assert(p + q <= 1);
247+
double log_lkhd = lnchoose(x + y, x) + lnchoose(z + w, z);
248+
if (p > 0) log_lkhd += x * log(p);
249+
if (p < 1) log_lkhd += y * log(1 - p);
252250

253-
if (q > 0) log_lkhd += z*log(q);
254-
if (q < 1) log_lkhd += w*log(1-q);
251+
if (q > 0) log_lkhd += z * log(q);
252+
if (q < 1) log_lkhd += w * log(1 - q);
255253

256-
return(log_lkhd);
254+
return (log_lkhd);
257255
}
258256

259257
/* Get start point for p and q, if only 2 inputs are available */
260-
static void
261-
get_start_point(const size_t x, const size_t y,
262-
const size_t z, const size_t w,
263-
const double tolerance,
264-
double &p, double &q) {
265-
266-
p = tolerance + 1.0*x/(x + y);
267-
q = tolerance + 1.0*z/(z + w);
258+
static inline void
259+
get_start_point(const size_t x, const size_t y, const size_t z, const size_t w,
260+
const double tolerance, double &p, double &q) {
261+
p = tolerance + 1.0 * x / (x + y);
262+
q = tolerance + 1.0 * z / (z + w);
268263
if (p + q >= 1.0) {
269-
p = max(tolerance, (1.0 - tolerance)*p/(p+q));
264+
p = max(tolerance, (1.0 - tolerance) * p / (p + q));
270265
q = max(tolerance, 1.0 - p - tolerance);
271266
}
272-
273267
}
274268

275269
static void
276-
expectation(const size_t y,
277-
const double p, const double q,
270+
expectation(const size_t y, const double p, const double q,
278271
vector<double> &coeff) {
279272
assert(p > 0.0 && q > 0.0);
280273
assert(p + q < 1.0);
@@ -283,30 +276,24 @@ expectation(const size_t y,
283276
const double log_q = log(q);
284277
const double log_1mp = log(1.0 - p);
285278
for (size_t j = 0; j <= y; ++j)
286-
coeff.push_back(exp(lnchoose(y, j) + log_q*j
287-
+ log_1mpq*(y-j) - log_1mp*y));
279+
coeff.push_back(
280+
exp(lnchoose(y, j) + log_q * j + log_1mpq * (y - j) - log_1mp * y));
288281
}
289282

290-
static double
291-
maximization(const size_t x, const size_t y,
292-
const vector<double> &coeff) {
293-
double num = x, denom = x+y;
294-
for (size_t j = 0; j <= y; ++j) {
295-
denom -= coeff[j]*j;
296-
}
297-
return num/denom;
283+
static inline double
284+
maximization(const size_t x, const size_t y, const vector<double> &coeff) {
285+
double num = x, denom = x + y;
286+
for (size_t j = 0; j <= y; ++j) { denom -= coeff[j] * j; }
287+
return num / denom;
298288
}
299289

300-
static double
301-
update_q(const size_t x, const size_t y,
302-
const size_t z, const size_t w,
290+
static inline double
291+
update_q(const size_t x, const size_t y, const size_t z, const size_t w,
303292
const vector<double> &coeff) {
304293
double num = z;
305-
double denom = x+y+z+w;
306-
for (size_t j =0; j <=y; ++j) {
307-
num += coeff[j]*j;
308-
}
309-
return num/denom;
294+
const double denom = x + y + z + w;
295+
for (size_t j = 0; j <= y; ++j) num += coeff[j] * j;
296+
return num / denom;
310297
}
311298

312299
static void
@@ -340,12 +327,9 @@ expectation_maximization(const bool DEBUG,
340327
}
341328
}
342329

343-
344-
static bool
330+
static inline bool
345331
check_file_non_empty(const string &filename) {
346-
struct stat buf;
347-
stat(filename.c_str(), &buf);
348-
return buf.st_size != 0;
332+
return std::filesystem::file_size(filename) > 0;
349333
}
350334

351335
static void
@@ -561,7 +545,8 @@ process_two_types(const double alpha,
561545
size_t &total_sites,
562546
size_t &overshoot_sites,
563547
size_t &conflict_sites) {
564-
constexpr auto max_read_count = 500;
548+
549+
constexpr auto max_read_count = 500u;
565550

566551
ofstream of;
567552
if (!outfile.empty()) {
@@ -587,23 +572,30 @@ process_two_types(const double alpha,
587572
if (oxbs_file.empty()) {
588573
s_rev = true;
589574
f_in.open(hydroxy_file);
575+
if (!f_in) throw dnmt_error("failed to open file: " + hydroxy_file);
590576
s_in.open(bs_seq_file);
577+
if (!s_in) throw dnmt_error("failed to open file: " + bs_seq_file);
591578
}
592579
else if (hydroxy_file.empty()) {
593580
f_rev = true;
594581
f_in.open(bs_seq_file);
582+
if (!f_in) throw dnmt_error("failed to open file: " + bs_seq_file);
595583
s_in.open(oxbs_file);
584+
if (!s_in) throw dnmt_error("failed to open file: " + oxbs_file);
596585
}
597586
else {
598587
f_in.open(oxbs_file);
588+
if (!f_in) throw dnmt_error("failed to open file: " + oxbs_file);
599589
s_in.open(hydroxy_file);
590+
if (!s_in) throw dnmt_error("failed to open file: " + hydroxy_file);
600591
}
601592

593+
602594
MSite f, s;
603595
while (f_in >> f && s_in >> s) {
604596

605597
if (f.chrom != s.chrom || f.pos != s.pos)
606-
throw runtime_error("error: sites not synchronized between files");
598+
throw dnmt_error("error: sites not synchronized between files");
607599

608600
if (f.n_reads > max_read_count) f.n_reads = max_read_count;
609601
size_t x = f.n_meth();

0 commit comments

Comments
 (0)