Skip to content

Commit 665b4ef

Browse files
metagene: adding the metagene.cpp source file
1 parent ec5e1c5 commit 665b4ef

File tree

1 file changed

+226
-0
lines changed

1 file changed

+226
-0
lines changed

src/analysis/metagene.cpp

Lines changed: 226 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,226 @@
1+
/* metagene (tsscpgplot): get data to plot methylation level around a TSS
2+
*
3+
* Copyright (C) 2010-2023 Andrew D. Smith
4+
*
5+
* Authors: Andrew D. Smith
6+
*
7+
* This program is free software: you can redistribute it and/or
8+
* modify it under the terms of the GNU General Public License as
9+
* published by the Free Software Foundation, either version 3 of the
10+
* License, or (at your option) any later version.
11+
*
12+
* This program is distributed in the hope that it will be useful, but
13+
* WITHOUT ANY WARRANTY; without even the implied warranty of
14+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15+
* General Public License for more details.
16+
*
17+
* You should have received a copy of the GNU General Public License
18+
* along with this program. If not, see
19+
* <http://www.gnu.org/licenses/>.
20+
*/
21+
22+
#include <fstream>
23+
#include <stdexcept>
24+
#include <unordered_map>
25+
26+
#include "GenomicRegion.hpp"
27+
#include "LevelsCounter.hpp"
28+
#include "MSite.hpp"
29+
#include "OptionParser.hpp"
30+
#include "bsutils.hpp"
31+
#include "smithlab_utils.hpp"
32+
33+
using std::cerr;
34+
using std::cout;
35+
using std::endl;
36+
using std::ostream;
37+
using std::pair;
38+
using std::runtime_error;
39+
using std::sort;
40+
using std::string;
41+
using std::to_string;
42+
using std::unordered_map;
43+
using std::vector;
44+
45+
static MSite
46+
tss_from_gene(const GenomicRegion &r) {
47+
MSite s;
48+
s.chrom = r.get_chrom();
49+
s.pos = (r.pos_strand() ? r.get_start() : r.get_end());
50+
s.strand = r.get_strand();
51+
return s;
52+
}
53+
54+
static void
55+
process_chrom(const uint32_t region_size, const vector<GenomicRegion> &genes,
56+
const pair<uint32_t, uint32_t> &bounds,
57+
const vector<MSite> &sites, vector<LevelsCounter> &levels) {
58+
const uint32_t twice_rs = 2 * region_size;
59+
60+
auto comp = [](const MSite &a, const MSite &b) { return a.pos < b.pos; };
61+
62+
for (auto i = bounds.first; i < bounds.second; ++i) {
63+
const MSite tss = tss_from_gene(genes[i]);
64+
MSite left = tss;
65+
left.pos = left.pos > region_size ? left.pos - region_size : 0;
66+
67+
MSite right = tss;
68+
right.pos += region_size;
69+
70+
const auto lim = upper_bound(cbegin(sites), cend(sites), right, comp);
71+
auto itr = lower_bound(cbegin(sites), cend(sites), left, comp);
72+
for (; itr != lim; ++itr) {
73+
const auto dist = itr->pos - left.pos;
74+
const auto base_idx = tss.strand == '+' ? dist : twice_rs - dist;
75+
levels[base_idx].update(*itr);
76+
}
77+
}
78+
}
79+
80+
template<typename T> static void
81+
collapse_bins(const uint32_t bin_size, vector<T> &v) {
82+
const uint32_t n_bins = std::ceil(static_cast<double>(v.size()) / bin_size);
83+
vector<T> vv(n_bins);
84+
for (auto i = 0u; i < v.size(); ++i) vv[i / bin_size] += v[i];
85+
v.swap(vv);
86+
}
87+
88+
int
89+
metagene(int argc, const char **argv) {
90+
constexpr auto description =
91+
"Compute the information needed for metagene plots of DNA methylation \
92+
levels. The columns in the output correspond to the fields calculated \
93+
globally by the `levels` and per-region by the `roi` command. Input \
94+
for features is in BED format, and when present the 6th column is used \
95+
to indicate strand. For features of non-zero width (where the 2nd and \
96+
3rd columns are not identical) the negative strand will indicate that \
97+
3rd column should be used. This means, for example, if the features are \
98+
genes, and the promoters are of interest, the strand will be used \
99+
correctly.";
100+
101+
try {
102+
string outfile;
103+
uint32_t region_size = 5000;
104+
bool verbose = false;
105+
bool show_progress = false;
106+
uint32_t bin_size = 50;
107+
108+
/****************** GET COMMAND LINE ARGUMENTS ***************************/
109+
OptionParser opt_parse(strip_path(argv[0]), description,
110+
"<features-bed> <counts>");
111+
opt_parse.add_opt("output", 'o', "output file (default: terminal)", false,
112+
outfile);
113+
opt_parse.add_opt("size", 's', "analyze this size in both directions",
114+
false, region_size);
115+
opt_parse.add_opt("bin", 'b', "bin size", false, bin_size);
116+
opt_parse.add_opt("progress", '\0', "show progress", false, show_progress);
117+
opt_parse.add_opt("verbose", 'v', "print more run info", false, verbose);
118+
vector<string> leftover_args;
119+
opt_parse.parse(argc, argv, leftover_args);
120+
if (argc == 1 || opt_parse.help_requested()) {
121+
cerr << opt_parse.help_message() << endl;
122+
return EXIT_SUCCESS;
123+
}
124+
if (opt_parse.about_requested()) {
125+
cerr << opt_parse.about_message() << endl;
126+
return EXIT_SUCCESS;
127+
}
128+
if (opt_parse.option_missing()) {
129+
cerr << opt_parse.option_missing_message() << endl;
130+
return EXIT_SUCCESS;
131+
}
132+
if (leftover_args.size() != 2) {
133+
cerr << opt_parse.help_message() << endl;
134+
return EXIT_SUCCESS;
135+
}
136+
const string features_file_name = leftover_args.front();
137+
const string cpg_file_name = leftover_args.back();
138+
/**********************************************************************/
139+
140+
if (verbose) cerr << "[loading feature annotations data]" << endl;
141+
vector<GenomicRegion> features;
142+
ReadBEDFile(features_file_name, features);
143+
sort(begin(features), end(features));
144+
if (verbose)
145+
cerr << "[number of features: " << features.size() << "]" << endl;
146+
147+
// identify the start and end of ranges for each chromosome
148+
unordered_map<string, pair<uint32_t, uint32_t>> lookup;
149+
typedef decltype(lookup)::iterator lookup_itr;
150+
151+
string chrom_name;
152+
auto prev_idx = 0u;
153+
for (auto i = 0u; i < features.size(); ++i)
154+
if (features[i].get_chrom() != chrom_name) {
155+
if (!chrom_name.empty()) lookup.insert({chrom_name, {prev_idx, i}});
156+
prev_idx = i;
157+
chrom_name = features[i].get_chrom();
158+
}
159+
lookup.insert({chrom_name, {prev_idx, features.size()}});
160+
if (verbose)
161+
cerr << "[number of chroms with features: " << lookup.size() << "]\n";
162+
163+
auto pair_diff = [&lookup](const lookup_itr x) {
164+
return (x != end(lookup)) ? x->second.second - x->second.first : 0u;
165+
};
166+
167+
vector<LevelsCounter> levels(2 * region_size);
168+
169+
bamxx::bgzf_file cpgin(cpg_file_name, "r");
170+
if (!cpgin) throw runtime_error("failed to open file: " + cpg_file_name);
171+
172+
uint32_t total_features = 0u;
173+
174+
vector<MSite> sites;
175+
string line;
176+
chrom_name.clear();
177+
while (getline(cpgin, line)) {
178+
auto the_site = MSite(line);
179+
if (the_site.chrom != chrom_name) {
180+
if (!sites.empty()) {
181+
const auto bounds = lookup.find(chrom_name);
182+
if (bounds != end(lookup))
183+
process_chrom(region_size, features, bounds->second, sites, levels);
184+
const auto n_features = pair_diff(bounds);
185+
if (show_progress)
186+
cerr << "[sites=" << sites.size() << " features=" << n_features
187+
<< "]" << endl;
188+
total_features += n_features;
189+
sites.clear();
190+
}
191+
if (show_progress) cerr << "[processing: " << the_site.chrom << "]";
192+
chrom_name = the_site.chrom;
193+
}
194+
sites.push_back(std::move(the_site));
195+
}
196+
197+
if (!sites.empty()) {
198+
const auto bounds = lookup.find(chrom_name);
199+
if (bounds != end(lookup))
200+
process_chrom(region_size, features, bounds->second, sites, levels);
201+
const auto n_features = pair_diff(bounds);
202+
if (show_progress)
203+
cerr << "[sites=" << sites.size() << " features=" << n_features << "]"
204+
<< endl;
205+
total_features += n_features;
206+
}
207+
208+
collapse_bins(bin_size, levels);
209+
210+
if (verbose)
211+
cerr << "output columns:\n"
212+
<< LevelsCounter::tostring_as_row_header() << endl;
213+
214+
std::ofstream of;
215+
if (!outfile.empty()) of.open(outfile);
216+
std::ostream out(of.is_open() ? of.rdbuf() : std::cout.rdbuf());
217+
218+
for (auto i = 0u; i < levels.size(); ++i)
219+
out << i * bin_size << '\t' << levels[i].tostring_as_row() << endl;
220+
}
221+
catch (std::exception &e) {
222+
cerr << e.what() << endl;
223+
return EXIT_FAILURE;
224+
}
225+
return EXIT_SUCCESS;
226+
}

0 commit comments

Comments
 (0)