|
| 1 | +/* metagene (tsscpgplot): get data to plot methylation level around a TSS |
| 2 | + * |
| 3 | + * Copyright (C) 2010-2023 Andrew D. Smith |
| 4 | + * |
| 5 | + * Authors: Andrew D. Smith |
| 6 | + * |
| 7 | + * This program is free software: you can redistribute it and/or |
| 8 | + * modify it under the terms of the GNU General Public License as |
| 9 | + * published by the Free Software Foundation, either version 3 of the |
| 10 | + * License, or (at your option) any later version. |
| 11 | + * |
| 12 | + * This program is distributed in the hope that it will be useful, but |
| 13 | + * WITHOUT ANY WARRANTY; without even the implied warranty of |
| 14 | + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 15 | + * General Public License for more details. |
| 16 | + * |
| 17 | + * You should have received a copy of the GNU General Public License |
| 18 | + * along with this program. If not, see |
| 19 | + * <http://www.gnu.org/licenses/>. |
| 20 | + */ |
| 21 | + |
| 22 | +#include <fstream> |
| 23 | +#include <stdexcept> |
| 24 | +#include <unordered_map> |
| 25 | + |
| 26 | +#include "GenomicRegion.hpp" |
| 27 | +#include "LevelsCounter.hpp" |
| 28 | +#include "MSite.hpp" |
| 29 | +#include "OptionParser.hpp" |
| 30 | +#include "bsutils.hpp" |
| 31 | +#include "smithlab_utils.hpp" |
| 32 | + |
| 33 | +using std::cerr; |
| 34 | +using std::cout; |
| 35 | +using std::endl; |
| 36 | +using std::ostream; |
| 37 | +using std::pair; |
| 38 | +using std::runtime_error; |
| 39 | +using std::sort; |
| 40 | +using std::string; |
| 41 | +using std::to_string; |
| 42 | +using std::unordered_map; |
| 43 | +using std::vector; |
| 44 | + |
| 45 | +static MSite |
| 46 | +tss_from_gene(const GenomicRegion &r) { |
| 47 | + MSite s; |
| 48 | + s.chrom = r.get_chrom(); |
| 49 | + s.pos = (r.pos_strand() ? r.get_start() : r.get_end()); |
| 50 | + s.strand = r.get_strand(); |
| 51 | + return s; |
| 52 | +} |
| 53 | + |
| 54 | +static void |
| 55 | +process_chrom(const uint32_t region_size, const vector<GenomicRegion> &genes, |
| 56 | + const pair<uint32_t, uint32_t> &bounds, |
| 57 | + const vector<MSite> &sites, vector<LevelsCounter> &levels) { |
| 58 | + const uint32_t twice_rs = 2 * region_size; |
| 59 | + |
| 60 | + auto comp = [](const MSite &a, const MSite &b) { return a.pos < b.pos; }; |
| 61 | + |
| 62 | + for (auto i = bounds.first; i < bounds.second; ++i) { |
| 63 | + const MSite tss = tss_from_gene(genes[i]); |
| 64 | + MSite left = tss; |
| 65 | + left.pos = left.pos > region_size ? left.pos - region_size : 0; |
| 66 | + |
| 67 | + MSite right = tss; |
| 68 | + right.pos += region_size; |
| 69 | + |
| 70 | + const auto lim = upper_bound(cbegin(sites), cend(sites), right, comp); |
| 71 | + auto itr = lower_bound(cbegin(sites), cend(sites), left, comp); |
| 72 | + for (; itr != lim; ++itr) { |
| 73 | + const auto dist = itr->pos - left.pos; |
| 74 | + const auto base_idx = tss.strand == '+' ? dist : twice_rs - dist; |
| 75 | + levels[base_idx].update(*itr); |
| 76 | + } |
| 77 | + } |
| 78 | +} |
| 79 | + |
| 80 | +template<typename T> static void |
| 81 | +collapse_bins(const uint32_t bin_size, vector<T> &v) { |
| 82 | + const uint32_t n_bins = std::ceil(static_cast<double>(v.size()) / bin_size); |
| 83 | + vector<T> vv(n_bins); |
| 84 | + for (auto i = 0u; i < v.size(); ++i) vv[i / bin_size] += v[i]; |
| 85 | + v.swap(vv); |
| 86 | +} |
| 87 | + |
| 88 | +int |
| 89 | +metagene(int argc, const char **argv) { |
| 90 | + constexpr auto description = |
| 91 | + "Compute the information needed for metagene plots of DNA methylation \ |
| 92 | + levels. The columns in the output correspond to the fields calculated \ |
| 93 | + globally by the `levels` and per-region by the `roi` command. Input \ |
| 94 | + for features is in BED format, and when present the 6th column is used \ |
| 95 | + to indicate strand. For features of non-zero width (where the 2nd and \ |
| 96 | + 3rd columns are not identical) the negative strand will indicate that \ |
| 97 | + 3rd column should be used. This means, for example, if the features are \ |
| 98 | + genes, and the promoters are of interest, the strand will be used \ |
| 99 | + correctly."; |
| 100 | + |
| 101 | + try { |
| 102 | + string outfile; |
| 103 | + uint32_t region_size = 5000; |
| 104 | + bool verbose = false; |
| 105 | + bool show_progress = false; |
| 106 | + uint32_t bin_size = 50; |
| 107 | + |
| 108 | + /****************** GET COMMAND LINE ARGUMENTS ***************************/ |
| 109 | + OptionParser opt_parse(strip_path(argv[0]), description, |
| 110 | + "<features-bed> <counts>"); |
| 111 | + opt_parse.add_opt("output", 'o', "output file (default: terminal)", false, |
| 112 | + outfile); |
| 113 | + opt_parse.add_opt("size", 's', "analyze this size in both directions", |
| 114 | + false, region_size); |
| 115 | + opt_parse.add_opt("bin", 'b', "bin size", false, bin_size); |
| 116 | + opt_parse.add_opt("progress", '\0', "show progress", false, show_progress); |
| 117 | + opt_parse.add_opt("verbose", 'v', "print more run info", false, verbose); |
| 118 | + vector<string> leftover_args; |
| 119 | + opt_parse.parse(argc, argv, leftover_args); |
| 120 | + if (argc == 1 || opt_parse.help_requested()) { |
| 121 | + cerr << opt_parse.help_message() << endl; |
| 122 | + return EXIT_SUCCESS; |
| 123 | + } |
| 124 | + if (opt_parse.about_requested()) { |
| 125 | + cerr << opt_parse.about_message() << endl; |
| 126 | + return EXIT_SUCCESS; |
| 127 | + } |
| 128 | + if (opt_parse.option_missing()) { |
| 129 | + cerr << opt_parse.option_missing_message() << endl; |
| 130 | + return EXIT_SUCCESS; |
| 131 | + } |
| 132 | + if (leftover_args.size() != 2) { |
| 133 | + cerr << opt_parse.help_message() << endl; |
| 134 | + return EXIT_SUCCESS; |
| 135 | + } |
| 136 | + const string features_file_name = leftover_args.front(); |
| 137 | + const string cpg_file_name = leftover_args.back(); |
| 138 | + /**********************************************************************/ |
| 139 | + |
| 140 | + if (verbose) cerr << "[loading feature annotations data]" << endl; |
| 141 | + vector<GenomicRegion> features; |
| 142 | + ReadBEDFile(features_file_name, features); |
| 143 | + sort(begin(features), end(features)); |
| 144 | + if (verbose) |
| 145 | + cerr << "[number of features: " << features.size() << "]" << endl; |
| 146 | + |
| 147 | + // identify the start and end of ranges for each chromosome |
| 148 | + unordered_map<string, pair<uint32_t, uint32_t>> lookup; |
| 149 | + typedef decltype(lookup)::iterator lookup_itr; |
| 150 | + |
| 151 | + string chrom_name; |
| 152 | + auto prev_idx = 0u; |
| 153 | + for (auto i = 0u; i < features.size(); ++i) |
| 154 | + if (features[i].get_chrom() != chrom_name) { |
| 155 | + if (!chrom_name.empty()) lookup.insert({chrom_name, {prev_idx, i}}); |
| 156 | + prev_idx = i; |
| 157 | + chrom_name = features[i].get_chrom(); |
| 158 | + } |
| 159 | + lookup.insert({chrom_name, {prev_idx, features.size()}}); |
| 160 | + if (verbose) |
| 161 | + cerr << "[number of chroms with features: " << lookup.size() << "]\n"; |
| 162 | + |
| 163 | + auto pair_diff = [&lookup](const lookup_itr x) { |
| 164 | + return (x != end(lookup)) ? x->second.second - x->second.first : 0u; |
| 165 | + }; |
| 166 | + |
| 167 | + vector<LevelsCounter> levels(2 * region_size); |
| 168 | + |
| 169 | + bamxx::bgzf_file cpgin(cpg_file_name, "r"); |
| 170 | + if (!cpgin) throw runtime_error("failed to open file: " + cpg_file_name); |
| 171 | + |
| 172 | + uint32_t total_features = 0u; |
| 173 | + |
| 174 | + vector<MSite> sites; |
| 175 | + string line; |
| 176 | + chrom_name.clear(); |
| 177 | + while (getline(cpgin, line)) { |
| 178 | + auto the_site = MSite(line); |
| 179 | + if (the_site.chrom != chrom_name) { |
| 180 | + if (!sites.empty()) { |
| 181 | + const auto bounds = lookup.find(chrom_name); |
| 182 | + if (bounds != end(lookup)) |
| 183 | + process_chrom(region_size, features, bounds->second, sites, levels); |
| 184 | + const auto n_features = pair_diff(bounds); |
| 185 | + if (show_progress) |
| 186 | + cerr << "[sites=" << sites.size() << " features=" << n_features |
| 187 | + << "]" << endl; |
| 188 | + total_features += n_features; |
| 189 | + sites.clear(); |
| 190 | + } |
| 191 | + if (show_progress) cerr << "[processing: " << the_site.chrom << "]"; |
| 192 | + chrom_name = the_site.chrom; |
| 193 | + } |
| 194 | + sites.push_back(std::move(the_site)); |
| 195 | + } |
| 196 | + |
| 197 | + if (!sites.empty()) { |
| 198 | + const auto bounds = lookup.find(chrom_name); |
| 199 | + if (bounds != end(lookup)) |
| 200 | + process_chrom(region_size, features, bounds->second, sites, levels); |
| 201 | + const auto n_features = pair_diff(bounds); |
| 202 | + if (show_progress) |
| 203 | + cerr << "[sites=" << sites.size() << " features=" << n_features << "]" |
| 204 | + << endl; |
| 205 | + total_features += n_features; |
| 206 | + } |
| 207 | + |
| 208 | + collapse_bins(bin_size, levels); |
| 209 | + |
| 210 | + if (verbose) |
| 211 | + cerr << "output columns:\n" |
| 212 | + << LevelsCounter::tostring_as_row_header() << endl; |
| 213 | + |
| 214 | + std::ofstream of; |
| 215 | + if (!outfile.empty()) of.open(outfile); |
| 216 | + std::ostream out(of.is_open() ? of.rdbuf() : std::cout.rdbuf()); |
| 217 | + |
| 218 | + for (auto i = 0u; i < levels.size(); ++i) |
| 219 | + out << i * bin_size << '\t' << levels[i].tostring_as_row() << endl; |
| 220 | + } |
| 221 | + catch (std::exception &e) { |
| 222 | + cerr << e.what() << endl; |
| 223 | + return EXIT_FAILURE; |
| 224 | + } |
| 225 | + return EXIT_SUCCESS; |
| 226 | +} |
0 commit comments