Skip to content

Commit ccb9c83

Browse files
Merge pull request #126 from smithlabcode/metagene
Metagene
2 parents 04acbb0 + 665b4ef commit ccb9c83

File tree

6 files changed

+326
-51
lines changed

6 files changed

+326
-51
lines changed

Makefile.am

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,7 @@ dnmtools_SOURCES += src/analysis/hmr.cpp
186186
dnmtools_SOURCES += src/analysis/hmr-rep.cpp
187187
dnmtools_SOURCES += src/analysis/levels.cpp
188188
dnmtools_SOURCES += src/analysis/hypermr.cpp
189+
dnmtools_SOURCES += src/analysis/metagene.cpp
189190

190191
dnmtools_SOURCES += src/utils/clean-hairpins.cpp
191192
dnmtools_SOURCES += src/utils/guessprotocol.cpp

src/analysis/metagene.cpp

Lines changed: 226 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,226 @@
1+
/* metagene (tsscpgplot): get data to plot methylation level around a TSS
2+
*
3+
* Copyright (C) 2010-2023 Andrew D. Smith
4+
*
5+
* Authors: Andrew D. Smith
6+
*
7+
* This program is free software: you can redistribute it and/or
8+
* modify it under the terms of the GNU General Public License as
9+
* published by the Free Software Foundation, either version 3 of the
10+
* License, or (at your option) any later version.
11+
*
12+
* This program is distributed in the hope that it will be useful, but
13+
* WITHOUT ANY WARRANTY; without even the implied warranty of
14+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15+
* General Public License for more details.
16+
*
17+
* You should have received a copy of the GNU General Public License
18+
* along with this program. If not, see
19+
* <http://www.gnu.org/licenses/>.
20+
*/
21+
22+
#include <fstream>
23+
#include <stdexcept>
24+
#include <unordered_map>
25+
26+
#include "GenomicRegion.hpp"
27+
#include "LevelsCounter.hpp"
28+
#include "MSite.hpp"
29+
#include "OptionParser.hpp"
30+
#include "bsutils.hpp"
31+
#include "smithlab_utils.hpp"
32+
33+
using std::cerr;
34+
using std::cout;
35+
using std::endl;
36+
using std::ostream;
37+
using std::pair;
38+
using std::runtime_error;
39+
using std::sort;
40+
using std::string;
41+
using std::to_string;
42+
using std::unordered_map;
43+
using std::vector;
44+
45+
static MSite
46+
tss_from_gene(const GenomicRegion &r) {
47+
MSite s;
48+
s.chrom = r.get_chrom();
49+
s.pos = (r.pos_strand() ? r.get_start() : r.get_end());
50+
s.strand = r.get_strand();
51+
return s;
52+
}
53+
54+
static void
55+
process_chrom(const uint32_t region_size, const vector<GenomicRegion> &genes,
56+
const pair<uint32_t, uint32_t> &bounds,
57+
const vector<MSite> &sites, vector<LevelsCounter> &levels) {
58+
const uint32_t twice_rs = 2 * region_size;
59+
60+
auto comp = [](const MSite &a, const MSite &b) { return a.pos < b.pos; };
61+
62+
for (auto i = bounds.first; i < bounds.second; ++i) {
63+
const MSite tss = tss_from_gene(genes[i]);
64+
MSite left = tss;
65+
left.pos = left.pos > region_size ? left.pos - region_size : 0;
66+
67+
MSite right = tss;
68+
right.pos += region_size;
69+
70+
const auto lim = upper_bound(cbegin(sites), cend(sites), right, comp);
71+
auto itr = lower_bound(cbegin(sites), cend(sites), left, comp);
72+
for (; itr != lim; ++itr) {
73+
const auto dist = itr->pos - left.pos;
74+
const auto base_idx = tss.strand == '+' ? dist : twice_rs - dist;
75+
levels[base_idx].update(*itr);
76+
}
77+
}
78+
}
79+
80+
template<typename T> static void
81+
collapse_bins(const uint32_t bin_size, vector<T> &v) {
82+
const uint32_t n_bins = std::ceil(static_cast<double>(v.size()) / bin_size);
83+
vector<T> vv(n_bins);
84+
for (auto i = 0u; i < v.size(); ++i) vv[i / bin_size] += v[i];
85+
v.swap(vv);
86+
}
87+
88+
int
89+
metagene(int argc, const char **argv) {
90+
constexpr auto description =
91+
"Compute the information needed for metagene plots of DNA methylation \
92+
levels. The columns in the output correspond to the fields calculated \
93+
globally by the `levels` and per-region by the `roi` command. Input \
94+
for features is in BED format, and when present the 6th column is used \
95+
to indicate strand. For features of non-zero width (where the 2nd and \
96+
3rd columns are not identical) the negative strand will indicate that \
97+
3rd column should be used. This means, for example, if the features are \
98+
genes, and the promoters are of interest, the strand will be used \
99+
correctly.";
100+
101+
try {
102+
string outfile;
103+
uint32_t region_size = 5000;
104+
bool verbose = false;
105+
bool show_progress = false;
106+
uint32_t bin_size = 50;
107+
108+
/****************** GET COMMAND LINE ARGUMENTS ***************************/
109+
OptionParser opt_parse(strip_path(argv[0]), description,
110+
"<features-bed> <counts>");
111+
opt_parse.add_opt("output", 'o', "output file (default: terminal)", false,
112+
outfile);
113+
opt_parse.add_opt("size", 's', "analyze this size in both directions",
114+
false, region_size);
115+
opt_parse.add_opt("bin", 'b', "bin size", false, bin_size);
116+
opt_parse.add_opt("progress", '\0', "show progress", false, show_progress);
117+
opt_parse.add_opt("verbose", 'v', "print more run info", false, verbose);
118+
vector<string> leftover_args;
119+
opt_parse.parse(argc, argv, leftover_args);
120+
if (argc == 1 || opt_parse.help_requested()) {
121+
cerr << opt_parse.help_message() << endl;
122+
return EXIT_SUCCESS;
123+
}
124+
if (opt_parse.about_requested()) {
125+
cerr << opt_parse.about_message() << endl;
126+
return EXIT_SUCCESS;
127+
}
128+
if (opt_parse.option_missing()) {
129+
cerr << opt_parse.option_missing_message() << endl;
130+
return EXIT_SUCCESS;
131+
}
132+
if (leftover_args.size() != 2) {
133+
cerr << opt_parse.help_message() << endl;
134+
return EXIT_SUCCESS;
135+
}
136+
const string features_file_name = leftover_args.front();
137+
const string cpg_file_name = leftover_args.back();
138+
/**********************************************************************/
139+
140+
if (verbose) cerr << "[loading feature annotations data]" << endl;
141+
vector<GenomicRegion> features;
142+
ReadBEDFile(features_file_name, features);
143+
sort(begin(features), end(features));
144+
if (verbose)
145+
cerr << "[number of features: " << features.size() << "]" << endl;
146+
147+
// identify the start and end of ranges for each chromosome
148+
unordered_map<string, pair<uint32_t, uint32_t>> lookup;
149+
typedef decltype(lookup)::iterator lookup_itr;
150+
151+
string chrom_name;
152+
auto prev_idx = 0u;
153+
for (auto i = 0u; i < features.size(); ++i)
154+
if (features[i].get_chrom() != chrom_name) {
155+
if (!chrom_name.empty()) lookup.insert({chrom_name, {prev_idx, i}});
156+
prev_idx = i;
157+
chrom_name = features[i].get_chrom();
158+
}
159+
lookup.insert({chrom_name, {prev_idx, features.size()}});
160+
if (verbose)
161+
cerr << "[number of chroms with features: " << lookup.size() << "]\n";
162+
163+
auto pair_diff = [&lookup](const lookup_itr x) {
164+
return (x != end(lookup)) ? x->second.second - x->second.first : 0u;
165+
};
166+
167+
vector<LevelsCounter> levels(2 * region_size);
168+
169+
bamxx::bgzf_file cpgin(cpg_file_name, "r");
170+
if (!cpgin) throw runtime_error("failed to open file: " + cpg_file_name);
171+
172+
uint32_t total_features = 0u;
173+
174+
vector<MSite> sites;
175+
string line;
176+
chrom_name.clear();
177+
while (getline(cpgin, line)) {
178+
auto the_site = MSite(line);
179+
if (the_site.chrom != chrom_name) {
180+
if (!sites.empty()) {
181+
const auto bounds = lookup.find(chrom_name);
182+
if (bounds != end(lookup))
183+
process_chrom(region_size, features, bounds->second, sites, levels);
184+
const auto n_features = pair_diff(bounds);
185+
if (show_progress)
186+
cerr << "[sites=" << sites.size() << " features=" << n_features
187+
<< "]" << endl;
188+
total_features += n_features;
189+
sites.clear();
190+
}
191+
if (show_progress) cerr << "[processing: " << the_site.chrom << "]";
192+
chrom_name = the_site.chrom;
193+
}
194+
sites.push_back(std::move(the_site));
195+
}
196+
197+
if (!sites.empty()) {
198+
const auto bounds = lookup.find(chrom_name);
199+
if (bounds != end(lookup))
200+
process_chrom(region_size, features, bounds->second, sites, levels);
201+
const auto n_features = pair_diff(bounds);
202+
if (show_progress)
203+
cerr << "[sites=" << sites.size() << " features=" << n_features << "]"
204+
<< endl;
205+
total_features += n_features;
206+
}
207+
208+
collapse_bins(bin_size, levels);
209+
210+
if (verbose)
211+
cerr << "output columns:\n"
212+
<< LevelsCounter::tostring_as_row_header() << endl;
213+
214+
std::ofstream of;
215+
if (!outfile.empty()) of.open(outfile);
216+
std::ostream out(of.is_open() ? of.rdbuf() : std::cout.rdbuf());
217+
218+
for (auto i = 0u; i < levels.size(); ++i)
219+
out << i * bin_size << '\t' << levels[i].tostring_as_row() << endl;
220+
}
221+
catch (std::exception &e) {
222+
cerr << e.what() << endl;
223+
return EXIT_FAILURE;
224+
}
225+
return EXIT_SUCCESS;
226+
}

src/analysis/methcounts.cpp

Lines changed: 14 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -183,40 +183,36 @@ get_tag_from_genome(const string &s, const size_t pos) {
183183
return 4; // shouldn't be used for anything
184184
}
185185

186-
187186
/* This "has_mutated" function looks on the opposite strand to see
188187
* if the apparent conversion from C->T was actually already in the
189188
* DNA because of a mutation or SNP.
190189
*/
191190
static bool
192191
has_mutated(const char base, const CountSet &cs) {
193192
static const double MUTATION_DEFINING_FRACTION = 0.5;
194-
return is_cytosine(base) ?
195-
(cs.nG < MUTATION_DEFINING_FRACTION*(cs.neg_total())) :
196-
(cs.pG < MUTATION_DEFINING_FRACTION*(cs.pos_total()));
193+
return is_cytosine(base)
194+
? (cs.nG < MUTATION_DEFINING_FRACTION * (cs.neg_total()))
195+
: (cs.pG < MUTATION_DEFINING_FRACTION * (cs.pos_total()));
197196
}
198197

199-
200198
static inline bool
201199
is_cpg_site(const string &s, const size_t pos) {
202-
return (is_cytosine(s[pos]) ? is_guanine(s[pos + 1]) :
203-
(is_guanine(s[pos]) ?
204-
(pos > 0 && is_cytosine(s[pos - 1])) : false));
200+
return is_cytosine(s[pos])
201+
? is_guanine(s[pos + 1])
202+
: (is_guanine(s[pos]) ? (pos > 0 && is_cytosine(s[pos - 1]))
203+
: false);
205204
}
206205

207-
208206
static inline size_t
209207
get_chrom_id(const string &chrom_name,
210208
const unordered_map<string, size_t> &cl) {
211-
212209
auto the_chrom(cl.find(chrom_name));
213210
if (the_chrom == end(cl))
214211
throw dnmt_error("could not find chrom: " + chrom_name);
215212

216213
return the_chrom->second;
217214
}
218215

219-
220216
static const char *tag_values[] = {
221217
"CpG", // 0
222218
"CHH", // 1
@@ -253,39 +249,26 @@ write_output(const bamxx::bam_header &hdr, bamxx::bgzf_file &out,
253249
if (CPG_ONLY && the_tag != 0) continue;
254250

255251
const bool is_c = is_cytosine(base);
256-
const double unconverted = is_c ?
257-
counts[i].unconverted_cytosine() : counts[i].unconverted_guanine();
258-
const double converted = is_c ?
259-
counts[i].converted_cytosine() : counts[i].converted_guanine();
252+
const double unconverted = is_c ? counts[i].unconverted_cytosine()
253+
: counts[i].unconverted_guanine();
254+
const double converted =
255+
is_c ? counts[i].converted_cytosine() : counts[i].converted_guanine();
260256
const bool mut = has_mutated(base, counts[i]);
261257
const size_t n_reads = unconverted + converted;
262258
buf.clear();
263259
// ADS: here is where we make an MSite, but not using MSite
264-
buf << sam_hdr_tid2name(hdr, tid) << '\t'
265-
<< i << '\t'
260+
buf << sam_hdr_tid2name(hdr, tid) << '\t' << i << '\t'
266261
<< (is_c ? '+' : '-') << '\t'
267262
<< tag_values[tag_with_mut(the_tag, mut)] << '\t'
268-
<< (n_reads > 0 ? unconverted/n_reads : 0.0) << '\t'
269-
<< n_reads << '\n';
263+
<< (n_reads > 0 ? unconverted / n_reads : 0.0) << '\t' << n_reads
264+
<< '\n';
270265
if (!out.write(buf.c_str(), buf.tellp()))
271266
throw dnmt_error("error writing output");
272267
}
273268
}
274269
}
275270

276271

277-
/* ADS: was tempted to use these until I passed in a "b++"... */
278-
// #define get_tid(b) ((b)->core.tid)
279-
// #define get_rlen(b) (bam_cigar2rlen((b)->core.n_cigar, bam_get_cigar(b)))
280-
// #define get_pos(b) ((b)->core.pos)
281-
// #define get_qlen(b) ((b)->core.l_qseq)
282-
283-
284-
285-
286-
287-
288-
289272
static void
290273
count_states_pos(const bam_rec &aln, vector<CountSet> &counts) {
291274
/* Move through cigar, reference and read positions without

0 commit comments

Comments
 (0)