Skip to content

Commit 18e1f5c

Browse files
Merge pull request #133 from smithlabcode/pmd-summary
Pmd summary
2 parents 571d95d + da4d125 commit 18e1f5c

File tree

7 files changed

+151
-18
lines changed

7 files changed

+151
-18
lines changed

docs/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,15 +19,15 @@ pip install -U mkdocs
1919

2020
Build the HTML documentation by running
2121
```
22-
mkdocs build
22+
mkdocs build -f docs/mkdocs.yml
2323
```
2424
which will create a `site` directory where markdown files are
2525
converted to HTML
2626

2727
Create a local host for the HTML documentation by running
2828

2929
```
30-
mkdocs serve
30+
mkdocs serve -f docs/mkdocs.yml
3131
```
3232

3333
This will create the documentation, usually at http://localhost:8000 .

docs/content/hmr.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,3 +170,9 @@ File in which to write parameters learned during the current run.
170170
A random number seed. Randomization is used in a shuffling step prior
171171
to filering candidate HMRs. This parameter is typically only used for
172172
testing (default: 408).
173+
174+
```txt
175+
-S, -summary
176+
```
177+
Write the analysis summary to this file. The summary is not
178+
reported unless a file is specified here. This option is correct as of v1.4.0.

docs/content/pmd.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,3 +186,10 @@ Write the trained parameters to this file.
186186
-s, -seed
187187
```
188188
Specify a random seed value.
189+
190+
```txt
191+
-S, -summary
192+
```
193+
Write the analysis summary to this file. The summary is not
194+
reported unless a file is specified here. This option is correct as of v1.4.0.
195+

docs/content/uniq.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,10 +49,10 @@ most helpful when the input and output are both BAM, where the threads
4949
can really speed things up.
5050

5151
```txt
52-
-S, -stats
52+
-S, -summary
5353
```
5454
Save statistics on duplication rates to this file. The statistics are not
55-
reported unless a file is specified here.
55+
reported unless a file is specified here. This option is correct as of v1.4.0.
5656

5757
```txt
5858
-hist

src/analysis/hmr.cpp

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,34 @@ using std::unordered_set;
5050

5151
using bamxx::bgzf_file;
5252

53+
struct hmr_summary {
54+
hmr_summary(const vector<GenomicRegion> &hmrs) {
55+
hmr_count = hmrs.size();
56+
hmr_total_size = accumulate(cbegin(hmrs), cend(hmrs), 0,
57+
[](const uint64_t t, const GenomicRegion &p) {
58+
return t + p.get_width(); });
59+
hmr_mean_size =
60+
static_cast<double>(hmr_total_size)/std::max(1ul, hmr_count);
61+
}
62+
// hmr_count is the number of identified HMRs.
63+
uint64_t hmr_count{};
64+
// total_hmr_size is the sum of the sizes of the identified HMRs
65+
uint64_t hmr_total_size{};
66+
// mean_hmr_size is the mean size of the identified HMRs
67+
double hmr_mean_size{};
68+
69+
string tostring() {
70+
std::ostringstream oss;
71+
oss << "hmr_count: " << hmr_count << endl
72+
<< "hmr_total_size: " << hmr_total_size << endl
73+
<< "hmr_mean_size: "
74+
<< std::fixed << std::setprecision(2) << hmr_mean_size;
75+
76+
return oss.str();
77+
}
78+
};
79+
80+
5381
static GenomicRegion
5482
as_gen_rgn(const MSite &s) {
5583
return GenomicRegion(s.chrom, s.pos, s.pos + 1);
@@ -386,6 +414,8 @@ main_hmr(int argc, const char **argv) {
386414
// corrections for small values
387415
const double tolerance = 1e-10;
388416

417+
string summary_file;
418+
389419
string params_in_file;
390420
string params_out_file;
391421

@@ -414,11 +444,16 @@ main_hmr(int argc, const char **argv) {
414444
opt_parse.add_opt("post-meth", '\0', "output file for single-CpG posteiror "
415445
"methylation probability (default: none)",
416446
false, meth_post_outfile);
447+
opt_parse.add_opt("post-meth", '\0', "output file for single-CpG posteiror "
448+
"methylation probability (default: none)",
449+
false, meth_post_outfile);
417450
opt_parse.add_opt("params-in", 'P', "HMM parameter file "
418451
"(override training)", false, params_in_file);
419452
opt_parse.add_opt("params-out", 'p', "write HMM parameters to this "
420453
"file (default: none)", false, params_out_file);
421454
opt_parse.add_opt("seed", 's', "specify random seed", false, rng_seed);
455+
opt_parse.add_opt("summary", 'S', "write summary output here", false,
456+
summary_file);
422457
opt_parse.set_show_defaults();
423458
vector<string> leftover_args;
424459
opt_parse.parse(argc, argv, leftover_args);
@@ -557,6 +592,11 @@ main_hmr(int argc, const char **argv) {
557592
out << cpg << '\n';
558593
}
559594
}
595+
if (!summary_file.empty()) {
596+
std::ofstream summary_out(summary_file);
597+
if (!summary_out) throw runtime_error("failed to open: " + summary_file);
598+
summary_out << hmr_summary(domains).tostring() << endl;
599+
}
560600
}
561601
catch (runtime_error &e) {
562602
cerr << e.what() << endl;

src/analysis/pmd.cpp

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
#include <stdexcept>
2727
#include <unordered_set>
2828
#include <random>
29+
#include <sstream>
2930

3031
#include "smithlab_utils.hpp"
3132
#include "smithlab_os.hpp"
@@ -54,6 +55,33 @@ using std::to_string;
5455

5556
using bamxx::bgzf_file;
5657

58+
struct pmd_summary {
59+
pmd_summary(const vector<GenomicRegion> &pmds) {
60+
pmd_count = pmds.size();
61+
pmd_total_size = accumulate(cbegin(pmds), cend(pmds), 0,
62+
[](const uint64_t t, const GenomicRegion &p) {
63+
return t + p.get_width(); });
64+
pmd_mean_size =
65+
static_cast<double>(pmd_total_size)/std::max(1ul, pmd_count);
66+
}
67+
// pmd_count is the number of identified PMDs.
68+
uint64_t pmd_count{};
69+
// total_pmd_size is the sum of the sizes of the identified PMDs
70+
uint64_t pmd_total_size{};
71+
// mean_pmd_size is the mean size of the identified PMDs
72+
double pmd_mean_size{};
73+
74+
string tostring() {
75+
std::ostringstream oss;
76+
oss << "pmd_count: " << pmd_count << endl
77+
<< "pmd_total_size: " << pmd_total_size << endl
78+
<< "pmd_mean_size: "
79+
<< std::fixed << std::setprecision(2) << pmd_mean_size;
80+
81+
return oss.str();
82+
}
83+
};
84+
5785
static void
5886
get_adjacent_distances(const vector<GenomicRegion> &pmds,
5987
vector<size_t> &dists) {
@@ -1105,6 +1133,8 @@ main_pmd(int argc, const char **argv) {
11051133
static const double tolerance = 1e-5;
11061134
static const double min_prob = 1e-10;
11071135

1136+
string summary_file;
1137+
11081138
string params_in_files;
11091139
string params_out_file;
11101140
string posteriors_out_prefix;
@@ -1138,6 +1168,9 @@ main_pmd(int argc, const char **argv) {
11381168
opt_parse.add_opt("posteriors-out", 'r',
11391169
"write out posterior probabilities in methcounts format",
11401170
false, posteriors_out_prefix);
1171+
opt_parse.add_opt("summary", 'S',
1172+
"write summary output here",
1173+
false, summary_file);
11411174
opt_parse.add_opt("params-out", 'p', "write HMM parameters to this file",
11421175
false, params_out_file);
11431176
opt_parse.add_opt("seed", 's', "specify random seed",
@@ -1385,6 +1418,12 @@ main_pmd(int argc, const char **argv) {
13851418
reps_fg_alpha, reps_fg_beta,
13861419
reps_bg_alpha, reps_bg_beta);
13871420

1421+
if (!summary_file.empty()) {
1422+
ofstream summary_out(summary_file);
1423+
if (!summary_out) throw runtime_error("failed to open: " + summary_file);
1424+
summary_out << pmd_summary(good_domains).tostring() << endl;
1425+
}
1426+
13881427
ofstream of;
13891428
if (!outfile.empty()) of.open(outfile);
13901429
ostream out(outfile.empty() ? std::cout.rdbuf() : of.rdbuf());

src/utils/uniq.cpp

Lines changed: 55 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -77,25 +77,66 @@ struct rd_stats { // keep track of good bases/reads in and out
7777
}
7878
};
7979

80+
81+
struct uniq_summary {
82+
uniq_summary(const rd_stats &rs_in, const rd_stats &rs_out,
83+
const size_t reads_duped) {
84+
total_reads = rs_in.reads;
85+
total_bases = rs_in.bases;
86+
unique_reads = rs_out.reads;
87+
unique_read_bases = rs_out.bases;
88+
reads_removed = rs_in.reads - rs_out.reads;
89+
non_duplicate_fraction = static_cast<double>(rs_out.reads - reads_duped) /
90+
std::max(1ul, rs_in.reads);
91+
duplication_rate = static_cast<double>(reads_removed + reads_duped) /
92+
std::max(1ul, reads_duped);
93+
duplicate_reads = reads_duped;
94+
}
95+
96+
// total_reads is the number of input reads
97+
size_t total_reads{};
98+
// total_bases is the total number of input bases
99+
size_t total_bases{};
100+
// unique_reads is the number of unique reads
101+
size_t unique_reads{};
102+
// unique_read_bases is the total number of bases for the unique reads
103+
size_t unique_read_bases{};
104+
// non_duplicate_fraction is the ratio of the number of unique reads with
105+
// no duplicates to that of the input reads
106+
double non_duplicate_fraction{};
107+
// duplicate_reads is the number of unique reads with at least one duplicate
108+
size_t duplicate_reads{};
109+
// reads_removed is the number of duplicate reads that have been removed
110+
size_t reads_removed{};
111+
// duplication_rate is the average number of duplicates for the reads with
112+
// at least one duplicate (>1 by definition)
113+
double duplication_rate{};
114+
115+
string tostring() {
116+
std::ostringstream oss;
117+
oss << "total_reads: " << total_reads << endl
118+
<< "total_bases: " << total_bases << endl
119+
<< "unique_reads: " << unique_reads << endl
120+
<< "unique_read_bases: " << unique_read_bases << endl
121+
<< "non_duplicate_fraction: " << non_duplicate_fraction << endl
122+
<< "duplicate_reads: " << duplicate_reads << endl
123+
<< "reads_removed: " << reads_removed << endl
124+
<< "duplication_rate: " << duplication_rate;
125+
126+
return oss.str();
127+
}
128+
};
129+
130+
131+
80132
static void
81133
write_stats_output(const rd_stats &rs_in, const rd_stats &rs_out,
82134
const size_t reads_duped, const string &statfile) {
83135
if (!statfile.empty()) {
84-
const size_t reads_removed = rs_in.reads - rs_out.reads;
85-
const double non_dup_frac =
86-
(rs_out.reads - reads_duped) / static_cast<double>(rs_in.reads);
87-
const double dup_rate =
88-
(reads_removed + reads_duped) / static_cast<double>(reads_duped);
136+
uniq_summary summary(rs_in, rs_out, reads_duped);
89137
ofstream out_stat(statfile);
90138
if (!out_stat) throw runtime_error("bad stats output file");
91-
out_stat << "total_reads: " << rs_in.reads << endl
92-
<< "total_bases: " << rs_in.bases << endl
93-
<< "unique_reads: " << rs_out.reads << endl
94-
<< "unique_read_bases: " << rs_out.bases << endl
95-
<< "non_duplicate_fraction: " << non_dup_frac << endl
96-
<< "duplicate_reads: " << reads_duped << endl
97-
<< "reads_removed: " << reads_removed << endl
98-
<< "duplication_rate: " << dup_rate << endl;
139+
out_stat << summary.tostring() << endl;
99140
}
100141
}
101142

@@ -245,7 +286,7 @@ main_uniq(int argc, const char **argv) {
245286
"sorted mapped reads",
246287
"<in-file> [out-file]", 2);
247288
opt_parse.add_opt("threads", 't', "number of threads", false, n_threads);
248-
opt_parse.add_opt("stats", 'S', "statistics output file", false, statfile);
289+
opt_parse.add_opt("summary", 'S', "statistics output file", false, statfile);
249290
opt_parse.add_opt("add-count", 'a', "add duplicate counts to reads", false,
250291
add_dup_count);
251292
opt_parse.add_opt("hist", '\0',

0 commit comments

Comments
 (0)