Skip to content

Commit 7a5a10a

Browse files
Merge pull request #142 from smithlabcode/mlml-test
Adding a test script for MLML, along with configuration and test data. Also made modifications to mlml that check for validity of input files in places where the check was neglected.
2 parents 05b45ac + 89fd5ed commit 7a5a10a

File tree

6 files changed

+81
-58
lines changed

6 files changed

+81
-58
lines changed

Makefile.am

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ EXTRA_DIST = \
6666
data/radmeth_test_design.txt \
6767
data/two_epialleles.states \
6868
data/araTha1_simulated.counts.gz \
69+
data/mlml_test_data.tgz \
6970
test_scripts/test_abismalidx.test \
7071
test_scripts/test_abismal.test \
7172
test_scripts/test_format.test \
@@ -81,7 +82,8 @@ EXTRA_DIST = \
8182
test_scripts/test_states.test \
8283
test_scripts/test_amrfinder.test \
8384
test_scripts/test_hypermr.test \
84-
test_scripts/test_diff.test
85+
test_scripts/test_diff.test \
86+
test_scripts/test_mlml.test
8587

8688
### ADS: the file md5sum.txt can be regenerated by running the tests
8789
### with "make check" and then doing `md5sum tests/* > md5sum.txt`
@@ -118,7 +120,8 @@ TESTS = test_scripts/test_abismalidx.test \
118120
test_scripts/test_states.test \
119121
test_scripts/test_amrfinder.test \
120122
test_scripts/test_hypermr.test \
121-
test_scripts/test_diff.test
123+
test_scripts/test_diff.test \
124+
test_scripts/test_mlml.test
122125

123126
TEST_EXTENSIONS = .test
124127

@@ -243,4 +246,5 @@ CLEANFILES = \
243246
tests/reads.epiread \
244247
tests/two_epialleles.amr \
245248
tests/araTha1_simulated.hypermr \
246-
tests/methylome_ab.diff
249+
tests/methylome_ab.diff \
250+
tests/mlml.out

configure.ac

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ tests/two_epialleles.states:data/two_epialleles.states
7575
tests/araTha1_simulated.counts.gz:data/araTha1_simulated.counts.gz
7676
tests/methylome_a.counts.sym:data/methylome_a.counts.sym
7777
tests/methylome_b.counts.sym:data/methylome_b.counts.sym
78+
tests/mlml_test_data.tgz:data/mlml_test_data.tgz
7879
])
7980

8081
AC_OUTPUT

data/md5sum.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,3 +18,4 @@ bcbf01be810cbf4051292813eb6b9225 tests/tRex1.idx
1818
7c84dafb85252a04a45704da71eba7c4 tests/tRex1_promoters.roi.bed
1919
e7e9590475a7f9b1ae41701d81892e57 tests/two_epialleles.amr
2020
ec6a686617cad31e9f7a37a3d378e6ed tests/two_epialleles.states
21+
93e38b20d162062a5d147c4290095a13 tests/mlml.out

data/mlml_test_data.tgz

10 KB
Binary file not shown.

src/mlml/mlml.cpp

Lines changed: 47 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@
2424
#include <cmath>
2525
#include <sys/stat.h>
2626
#include <unistd.h>
27+
#include <filesystem>
2728

2829
#include "OptionParser.hpp"
2930
#include "smithlab_os.hpp"
@@ -58,8 +59,8 @@ wilson_ci_for_binomial(const double alpha, const double n,
5859
static int
5960
binom_null(const double alpha, const double n,
6061
const double p_hat, const double p) {
61-
// ADS: function tests if final argument p is outside the (1 -
62-
// alpha) CI for p_hat given counts n.
62+
// ADS: function tests if final argument p is outside the (1-alpha)
63+
// CI for p_hat given counts n.
6364
double lower = 0.0;
6465
double upper = 0.0;
6566
wilson_ci_for_binomial(alpha, n, p_hat, lower, upper);
@@ -140,6 +141,7 @@ static void
140141
expectation(const size_t a, const size_t x,
141142
const double p, const double q,
142143
vector<vector<double>> &coeff) {
144+
143145
assert(p > 0.0 && q > 0.0);
144146
assert(p + q <= 1.0);
145147

@@ -156,8 +158,7 @@ expectation(const size_t a, const size_t x,
156158
coeff = vector<vector<double>>(x + 1, vector<double>(a + 1, 0.0));
157159

158160
for (size_t k = 0; k <= x; ++k) {
159-
const double x_c_k =
160-
lnchoose(x, k) + log_p*k + log_1mpq*(x - k) - log_1mq*x;
161+
const double x_c_k = lnchoose(x, k) + log_p*k + log_1mpq*(x-k) - log_1mq*x;
161162
for (size_t j = 0; j <= a; ++j)
162163
coeff[k][j] = exp(a_c_j[j] + x_c_k);
163164
}
@@ -201,8 +202,6 @@ expectation_maximization(const bool DEBUG,
201202
const double tolerance,
202203
double &p, double &q) {
203204
constexpr auto max_iterations = 500;
204-
size_t iter = 0;
205-
double delta = std::numeric_limits<double>::max();
206205

207206
if (DEBUG) {
208207
cerr << "t:" << a << ", u:" << b
@@ -211,6 +210,8 @@ expectation_maximization(const bool DEBUG,
211210
<< "p:" << p << ", q:" << q << endl;
212211
}
213212

213+
uint32_t iter = 0u;
214+
double delta = std::numeric_limits<double>::max();
214215
do {
215216
vector<vector<double>> coeff;
216217

@@ -226,9 +227,7 @@ expectation_maximization(const bool DEBUG,
226227
q = max(tolerance, min(q, 1-tolerance-p));
227228
delta = max(fabs(p_old - p), fabs(q_old - q));
228229

229-
++iter;
230-
231-
} while (delta > tolerance && iter <= max_iterations);
230+
} while (delta > tolerance && iter++ < max_iterations);
232231

233232
if (DEBUG) {
234233
cerr << iter << '\t'
@@ -242,39 +241,33 @@ expectation_maximization(const bool DEBUG,
242241
/// only 2 input files ///
243242
//////////////////////////
244243
static double
245-
log_L(const size_t x, const size_t y,
246-
const size_t z, const size_t w,
244+
log_L(const size_t x, const size_t y, const size_t z, const size_t w,
247245
const double p, const double q) {
248-
assert(p+q <= 1);
249-
double log_lkhd = lnchoose(x+y, x) + lnchoose(z+w, z);
250-
if (p > 0) log_lkhd += x*log(p);
251-
if (p < 1) log_lkhd += y*log(1-p);
246+
assert(p + q <= 1);
247+
double log_lkhd = lnchoose(x + y, x) + lnchoose(z + w, z);
248+
if (p > 0) log_lkhd += x * log(p);
249+
if (p < 1) log_lkhd += y * log(1 - p);
252250

253-
if (q > 0) log_lkhd += z*log(q);
254-
if (q < 1) log_lkhd += w*log(1-q);
251+
if (q > 0) log_lkhd += z * log(q);
252+
if (q < 1) log_lkhd += w * log(1 - q);
255253

256-
return(log_lkhd);
254+
return (log_lkhd);
257255
}
258256

259257
/* Get start point for p and q, if only 2 inputs are available */
260-
static void
261-
get_start_point(const size_t x, const size_t y,
262-
const size_t z, const size_t w,
263-
const double tolerance,
264-
double &p, double &q) {
265-
266-
p = tolerance + 1.0*x/(x + y);
267-
q = tolerance + 1.0*z/(z + w);
258+
static inline void
259+
get_start_point(const size_t x, const size_t y, const size_t z, const size_t w,
260+
const double tolerance, double &p, double &q) {
261+
p = tolerance + 1.0 * x / (x + y);
262+
q = tolerance + 1.0 * z / (z + w);
268263
if (p + q >= 1.0) {
269-
p = max(tolerance, (1.0 - tolerance)*p/(p+q));
264+
p = max(tolerance, (1.0 - tolerance) * p / (p + q));
270265
q = max(tolerance, 1.0 - p - tolerance);
271266
}
272-
273267
}
274268

275269
static void
276-
expectation(const size_t y,
277-
const double p, const double q,
270+
expectation(const size_t y, const double p, const double q,
278271
vector<double> &coeff) {
279272
assert(p > 0.0 && q > 0.0);
280273
assert(p + q < 1.0);
@@ -283,30 +276,24 @@ expectation(const size_t y,
283276
const double log_q = log(q);
284277
const double log_1mp = log(1.0 - p);
285278
for (size_t j = 0; j <= y; ++j)
286-
coeff.push_back(exp(lnchoose(y, j) + log_q*j
287-
+ log_1mpq*(y-j) - log_1mp*y));
279+
coeff.push_back(
280+
exp(lnchoose(y, j) + log_q * j + log_1mpq * (y - j) - log_1mp * y));
288281
}
289282

290-
static double
291-
maximization(const size_t x, const size_t y,
292-
const vector<double> &coeff) {
293-
double num = x, denom = x+y;
294-
for (size_t j = 0; j <= y; ++j) {
295-
denom -= coeff[j]*j;
296-
}
297-
return num/denom;
283+
static inline double
284+
maximization(const size_t x, const size_t y, const vector<double> &coeff) {
285+
double num = x, denom = x + y;
286+
for (size_t j = 0; j <= y; ++j) { denom -= coeff[j] * j; }
287+
return num / denom;
298288
}
299289

300-
static double
301-
update_q(const size_t x, const size_t y,
302-
const size_t z, const size_t w,
290+
static inline double
291+
update_q(const size_t x, const size_t y, const size_t z, const size_t w,
303292
const vector<double> &coeff) {
304293
double num = z;
305-
double denom = x+y+z+w;
306-
for (size_t j =0; j <=y; ++j) {
307-
num += coeff[j]*j;
308-
}
309-
return num/denom;
294+
const double denom = x + y + z + w;
295+
for (size_t j = 0; j <= y; ++j) num += coeff[j] * j;
296+
return num / denom;
310297
}
311298

312299
static void
@@ -340,12 +327,9 @@ expectation_maximization(const bool DEBUG,
340327
}
341328
}
342329

343-
344-
static bool
330+
static inline bool
345331
check_file_non_empty(const string &filename) {
346-
struct stat buf;
347-
stat(filename.c_str(), &buf);
348-
return buf.st_size != 0;
332+
return std::filesystem::file_size(filename) > 0;
349333
}
350334

351335
static void
@@ -561,7 +545,8 @@ process_two_types(const double alpha,
561545
size_t &total_sites,
562546
size_t &overshoot_sites,
563547
size_t &conflict_sites) {
564-
constexpr auto max_read_count = 500;
548+
549+
constexpr auto max_read_count = 500u;
565550

566551
ofstream of;
567552
if (!outfile.empty()) {
@@ -587,23 +572,30 @@ process_two_types(const double alpha,
587572
if (oxbs_file.empty()) {
588573
s_rev = true;
589574
f_in.open(hydroxy_file);
575+
if (!f_in) throw dnmt_error("failed to open file: " + hydroxy_file);
590576
s_in.open(bs_seq_file);
577+
if (!s_in) throw dnmt_error("failed to open file: " + bs_seq_file);
591578
}
592579
else if (hydroxy_file.empty()) {
593580
f_rev = true;
594581
f_in.open(bs_seq_file);
582+
if (!f_in) throw dnmt_error("failed to open file: " + bs_seq_file);
595583
s_in.open(oxbs_file);
584+
if (!s_in) throw dnmt_error("failed to open file: " + oxbs_file);
596585
}
597586
else {
598587
f_in.open(oxbs_file);
588+
if (!f_in) throw dnmt_error("failed to open file: " + oxbs_file);
599589
s_in.open(hydroxy_file);
590+
if (!s_in) throw dnmt_error("failed to open file: " + hydroxy_file);
600591
}
601592

593+
602594
MSite f, s;
603595
while (f_in >> f && s_in >> s) {
604596

605597
if (f.chrom != s.chrom || f.pos != s.pos)
606-
throw runtime_error("error: sites not synchronized between files");
598+
throw dnmt_error("error: sites not synchronized between files");
607599

608600
if (f.n_reads > max_read_count) f.n_reads = max_read_count;
609601
size_t x = f.n_meth();

test_scripts/test_mlml.test

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
#!/usr/bin/env bash
2+
3+
archive=tests/mlml_test_data.tgz
4+
if [[ ! -e "${archive}" ]] ; then
5+
echo "input missing; skipping test";
6+
exit 77;
7+
fi
8+
9+
infile1=tests/bs.counts
10+
infile2=tests/tab.counts
11+
infile3=tests/oxbs.counts
12+
outfile=tests/mlml.out
13+
tar -xf ${archive} -C tests
14+
if [[ -e "${infile1}" || -e "${infile2}" || -e "${infile3}" ]]; then
15+
./dnmtools mlml -o ${outfile} -bsseq ${infile1} -tabseq ${infile2} -oxbsseq ${infile3}
16+
x=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2)
17+
rm -f ${infile1} ${infile2} ${infile3}
18+
if [[ "${x}" != "OK" ]]; then
19+
exit 1;
20+
fi
21+
else
22+
echo "input missing; skipping test";
23+
rm -f ${infile1} ${infile2} ${infile3}
24+
exit 77;
25+
fi

0 commit comments

Comments
 (0)