Skip to content

Commit d0ceb1f

Browse files
Merge pull request #149 from smithlabcode/pmd-tests
Pmd tests
2 parents 7539f2c + 25b470c commit d0ceb1f

File tree

6 files changed

+50
-31
lines changed

6 files changed

+50
-31
lines changed

Makefile.am

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,7 @@ EXTRA_DIST = \
6767
data/two_epialleles.states \
6868
data/araTha1_simulated.counts.gz \
6969
data/mlml_test_data.tgz \
70+
data/pmd_test_data.counts.sym.gz \
7071
test_scripts/test_abismalidx.test \
7172
test_scripts/test_abismal.test \
7273
test_scripts/test_format.test \
@@ -76,6 +77,7 @@ EXTRA_DIST = \
7677
test_scripts/test_levels.test \
7778
test_scripts/test_sym.test \
7879
test_scripts/test_hmr.test \
80+
test_scripts/test_pmd.test \
7981
test_scripts/test_roi.test \
8082
test_scripts/test_selectsites.test \
8183
test_scripts/test_radmeth.test \
@@ -114,6 +116,7 @@ TESTS = test_scripts/test_abismalidx.test \
114116
test_scripts/test_levels.test \
115117
test_scripts/test_sym.test \
116118
test_scripts/test_hmr.test \
119+
test_scripts/test_pmd.test \
117120
test_scripts/test_roi.test \
118121
test_scripts/test_selectsites.test \
119122
test_scripts/test_radmeth.test \
@@ -247,4 +250,5 @@ CLEANFILES = \
247250
tests/two_epialleles.amr \
248251
tests/araTha1_simulated.hypermr \
249252
tests/methylome_ab.diff \
253+
tests/methylome.pmd \
250254
tests/mlml.out

configure.ac

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ tests/araTha1_simulated.counts.gz:data/araTha1_simulated.counts.gz
7676
tests/methylome_a.counts.sym:data/methylome_a.counts.sym
7777
tests/methylome_b.counts.sym:data/methylome_b.counts.sym
7878
tests/mlml_test_data.tgz:data/mlml_test_data.tgz
79+
tests/pmd_test_data.counts.sym.gz:data/pmd_test_data.counts.sym.gz
7980
])
8081

8182
AC_OUTPUT

data/md5sum.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,4 @@ bcbf01be810cbf4051292813eb6b9225 tests/tRex1.idx
1919
e7e9590475a7f9b1ae41701d81892e57 tests/two_epialleles.amr
2020
ec6a686617cad31e9f7a37a3d378e6ed tests/two_epialleles.states
2121
93e38b20d162062a5d147c4290095a13 tests/mlml.out
22+
d947fe3d61ef7b1564558a69608f0e64 tests/methylome.pmd

data/pmd_test_data.counts.sym.gz

27.7 KB
Binary file not shown.

src/analysis/pmd.cpp

Lines changed: 30 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -55,13 +55,15 @@ using std::to_string;
5555

5656
using bamxx::bgzf_file;
5757

58+
template<typename T> using num_lim = std::numeric_limits<T>;
59+
5860
struct pmd_summary {
5961
pmd_summary(const vector<GenomicRegion> &pmds) {
6062
pmd_count = pmds.size();
6163
pmd_total_size = accumulate(cbegin(pmds), cend(pmds), 0,
6264
[](const uint64_t t, const GenomicRegion &p) {
6365
return t + p.get_width(); });
64-
pmd_mean_size =
66+
pmd_mean_size =
6567
static_cast<double>(pmd_total_size)/std::max(1ul, pmd_count);
6668
}
6769
// pmd_count is the number of identified PMDs.
@@ -166,7 +168,7 @@ find_best_bound(const bool IS_RIGHT_BOUNDARY,
166168
}
167169

168170
size_t best_idx = 0;
169-
double best_score = -numeric_limits<double>::max();
171+
double best_score = -num_lim<double>::max();
170172
if (meth_tot.size() > 0)
171173
for (size_t i = 1; i < meth_tot.size()-1; ++i) {
172174
size_t N_low, k_low, N_hi, k_hi;
@@ -199,8 +201,8 @@ find_best_bound(const bool IS_RIGHT_BOUNDARY,
199201
}
200202
}
201203
}
202-
return (best_score > -numeric_limits<double>::max()) ?
203-
positions[best_idx] : numeric_limits<size_t>::max();
204+
return (best_score > -num_lim<double>::max()) ?
205+
positions[best_idx] : num_lim<size_t>::max();
204206
}
205207

206208

@@ -419,10 +421,10 @@ optimize_boundaries(const size_t bin_size,
419421
/////
420422
for (size_t i = 0; i < pmds.size(); ++i) {
421423
const size_t start_site = bound_site[2*i];
422-
if (start_site != numeric_limits<size_t>::max())
424+
if (start_site != num_lim<size_t>::max())
423425
pmds[i].set_start(start_site);
424426
const size_t end_site = bound_site[2*i + 1];
425-
if (end_site != numeric_limits<size_t>::max())
427+
if (end_site != num_lim<size_t>::max())
426428
pmds[i].set_end(end_site + 1);
427429
}
428430

@@ -476,15 +478,15 @@ optimize_boundaries(const size_t bin_size,
476478
double
477479
get_score_cutoff_for_fdr(const vector<double> &scores, const double fdr) {
478480
if (fdr <= 0)
479-
return numeric_limits<double>::max();
481+
return num_lim<double>::max();
480482
else if (fdr > 1)
481-
return numeric_limits<double>::min();
483+
return num_lim<double>::min();
482484
vector<double> local(scores);
483485
std::sort(begin(local), end(local));
484486
size_t i = 0;
485487
for (; i < local.size() - 1 &&
486488
local[i+1] < fdr*static_cast<double>(i+1)/local.size(); ++i);
487-
return local[i];
489+
return local[i] + 1.0/scores.size();
488490
}
489491

490492

@@ -616,7 +618,7 @@ separate_regions(const bool VERBOSE, const size_t desert_size,
616618
size_t prev_cpg = 0;
617619
for (size_t i = 0; i < bins[0].size(); ++i) {
618620
const size_t dist = (i > 0 && bins[0][i].same_chrom(bins[0][i - 1])) ?
619-
bins[0][i].get_start() - prev_cpg : numeric_limits<size_t>::max();
621+
bins[0][i].get_start() - prev_cpg : num_lim<size_t>::max();
620622
if (dist > desert_size)
621623
reset_points.push_back(i);
622624
prev_cpg = bins[0][i].get_start();
@@ -662,16 +664,14 @@ static void
662664
assign_p_values(const vector<double> &random_scores,
663665
const vector<double> &observed_scores,
664666
vector<double> &p_values) {
665-
const double n_randoms =
666-
random_scores.size() == 0 ? 1 : random_scores.size();
667-
for (size_t i = 0; i < observed_scores.size(); ++i)
668-
p_values.push_back((end(random_scores) -
669-
upper_bound(begin(random_scores),
670-
end(random_scores),
671-
observed_scores[i]))/n_randoms);
667+
const double n_randoms = random_scores.size() == 0 ? 1 : random_scores.size();
668+
for (auto scr : observed_scores) {
669+
const auto scr_itr =
670+
upper_bound(cbegin(random_scores), cend(random_scores), scr);
671+
p_values.push_back(std::distance(scr_itr, cend(random_scores)) / n_randoms);
672+
}
672673
}
673674

674-
675675
static void
676676
read_params_file(const bool VERBOSE,
677677
const string &params_file,
@@ -794,8 +794,8 @@ load_array_data(const size_t bin_size,
794794
MSite site;
795795
while (read_site(in, site)) {
796796
// TODO: MN: I think that the block below should be placed later
797-
// in this scope. At this location, the methylation level of the
798-
// first site in a new chrom is contributed to the last bin of the
797+
// in this scope. At this location, the methylation level of the
798+
// first site in a new chrom is contributed to the last bin of the
799799
// previous chrom.
800800
if (site.n_reads > 0) { // its covered by a probe
801801
++num_probes_in_bin;
@@ -1026,7 +1026,7 @@ get_min_reads_for_confidence(const double conf_level) {
10261026
}
10271027

10281028

1029-
// ADS: this function will return numeric_limits<size_t>::max() if the
1029+
// ADS: this function will return num_lim<size_t>::max() if the
10301030
// fraction of "good" bins is zero for all attempted bin sizes.
10311031
static size_t
10321032
binsize_selection(const bool &VERBOSE, const size_t resolution,
@@ -1049,8 +1049,8 @@ binsize_selection(const bool &VERBOSE, const size_t resolution,
10491049
if (frac_passed < min_frac_passed)
10501050
bin_size += resolution;
10511051
}
1052-
return frac_passed <= numeric_limits<double>::min() ?
1053-
std::numeric_limits<size_t>::max() : bin_size;
1052+
return frac_passed <= num_lim<double>::min() ?
1053+
num_lim<size_t>::max() : bin_size;
10541054
}
10551055

10561056

@@ -1090,7 +1090,7 @@ get_union_of_bins(const vector<vector<SimpleGenomicRegion> > &orig,
10901090
std::inplace_merge(first, middle, middle + orig[i].size());
10911091
}
10921092
// ensure unique bins
1093-
bins.resize(std::distance(begin(bins), unique(begin(bins), end(bins))));
1093+
bins.erase(unique(begin(bins), end(bins)), end(bins));
10941094
bins.shrink_to_fit();
10951095

10961096
// make sure all bins are aligned at same boundaries
@@ -1243,7 +1243,7 @@ main_pmd(int argc, const char **argv) {
12431243
min_bin_size, max_bin_size,
12441244
confidence_interval, prop_accept,
12451245
cpgs_file[i]);
1246-
if (bin_size == std::numeric_limits<size_t>::max())
1246+
if (bin_size == num_lim<size_t>::max())
12471247
insufficient_data = true;
12481248
desert_size = 5*bin_size; // TODO: explore extrapolation number
12491249
}
@@ -1288,7 +1288,7 @@ main_pmd(int argc, const char **argv) {
12881288
reads[i], array_status);
12891289
const double total_observations =
12901290
accumulate(begin(reads[i]), end(reads[i]), 0);
1291-
if (total_observations <= numeric_limits<double>::min())
1291+
if (total_observations <= num_lim<double>::min())
12921292
insufficient_data = true;
12931293
if (VERBOSE)
12941294
cerr << "TOTAL BINS: " << bins[i].size() << endl
@@ -1336,7 +1336,7 @@ main_pmd(int argc, const char **argv) {
13361336
vector<double> reps_fg_beta(n_replicates, 0.95);
13371337
vector<double> reps_bg_alpha(n_replicates, 0.95);
13381338
vector<double> reps_bg_beta(n_replicates, 0.05);
1339-
double score_cutoff_for_fdr = numeric_limits<double>::max();
1339+
double score_cutoff_for_fdr = num_lim<double>::max();
13401340

13411341
if (!params_in_file.empty()) {
13421342
// read parameters files
@@ -1387,9 +1387,8 @@ main_pmd(int argc, const char **argv) {
13871387
reps_bg_alpha, reps_bg_beta, classes,
13881388
scores, array_status);
13891389

1390-
if (!posteriors_out_prefix.empty()) {
1391-
write_posteriors_file(posteriors_out_prefix + ".emissions", bins, scores);
1392-
}
1390+
if (!posteriors_out_prefix.empty())
1391+
write_posteriors_file(posteriors_out_prefix + ".posteriors", bins, scores);
13931392

13941393
vector<double> domain_scores;
13951394
get_domain_scores(classes, meth, reset_points, domain_scores);
@@ -1405,7 +1404,7 @@ main_pmd(int argc, const char **argv) {
14051404
vector<double> p_values;
14061405
assign_p_values(random_scores, domain_scores, p_values);
14071406

1408-
if (score_cutoff_for_fdr == numeric_limits<double>::max() &&
1407+
if (score_cutoff_for_fdr == num_lim<double>::max() &&
14091408
!p_values.empty())
14101409
score_cutoff_for_fdr = get_score_cutoff_for_fdr(p_values, 0.01);
14111410

test_scripts/test_pmd.test

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
#!/usr/bin/env bash
2+
3+
infile=tests/pmd_test_data.counts.sym.gz
4+
outfile=tests/methylome.pmd
5+
if [[ -e "${infile}" ]]; then
6+
./dnmtools pmd -o ${outfile} ${infile}
7+
x=$(md5sum -c tests/md5sum.txt | grep "${outfile}:" | cut -d ' ' -f 2)
8+
if [[ "${x}" != "OK" ]]; then
9+
exit 1;
10+
fi
11+
else
12+
echo "${infile} not found; skipping remaining tests";
13+
exit 77;
14+
fi

0 commit comments

Comments
 (0)