1- /* allelicmeth:
2- *
3- * Copyright (C) 2014-2022 University of Southern California,
1+ /* Copyright (C) 2014-2022 University of Southern California,
42 * Andrew D. Smith, and Benjamin E. Decato
53 *
64 * Authors: Andrew D. Smith and Benjamin E. Decato
1917#include " Epiread.hpp"
2018#include " MSite.hpp"
2119
22- #include " GenomicRegion.hpp"
2320#include " OptionParser.hpp"
2421#include " smithlab_os.hpp"
2522#include " smithlab_utils.hpp"
4239#include < utility>
4340#include < vector>
4441
45- using std::cerr;
46- using std::cout;
47- using std::max;
48- using std::min;
49- using std::runtime_error;
50- using std::string;
51- using std::unordered_map;
52- using std::unordered_set;
53- using std::vector;
54-
55- // NOLINTBEGIN(*-avoid-magic-numbers,*-narrowing-conversions,*-prefer-member-initializer)
42+ // NOLINTBEGIN(*-avoid-magic-numbers,*-narrowing-conversions)
5643
5744static inline double
5845log_sum_log (const double p, const double q) {
59- if (p == 0 ) {
46+ if (p == 0 )
6047 return q;
61- }
62- else if (q == 0 ) {
48+ if (q == 0 )
6349 return p;
64- }
65- return p > q ? p + log1p ( exp (q - p)) : q + log1p (exp (p - q));
50+ return p > q ? p + std::log1p ( std::exp (q - p))
51+ : q + std:: log1p (std:: exp (p - q));
6652}
6753
6854static inline double
@@ -77,28 +63,29 @@ lnchoose(const unsigned int n, unsigned int m) {
7763
7864// p(k) = C(n1, k) C(n2, t - k) / C(n1 + n2, t)
7965static double
80- log_hyper_g (const size_t k, const size_t n1, const size_t n2, const size_t t) {
66+ log_hyper_g (const std::size_t k, const std::size_t n1, const std::size_t n2,
67+ const std::size_t t) {
8168 return lnchoose (n1, k) + lnchoose (n2, t - k) - lnchoose (n1 + n2, t);
8269}
8370
8471static double
85- fishers_exact (size_t a, size_t b, size_t c, size_t d) {
86- const size_t m = a + c; // sum of first column
87- const size_t n = b + d; // sum of second column
88- const size_t k = a + b; // sum of first row
72+ fishers_exact (std:: size_t a, std:: size_t b, std:: size_t c, std:: size_t d) {
73+ const std:: size_t m = a + c; // sum of first column
74+ const std:: size_t n = b + d; // sum of second column
75+ const std:: size_t k = a + b; // sum of first row
8976 // ADS: want more extreme than "observed"
9077 const double observed = log_hyper_g (a, m, n, k);
9178 double p = 0.0 ;
92- for (size_t i = (n > k ? 0ul : k - n); i <= std::min (k, m); ++i) {
79+ for (std:: size_t i = (n > k ? 0ul : k - n); i <= std::min (k, m); ++i) {
9380 const double curr = log_hyper_g (i, m, n, k);
9481 if (curr <= observed)
9582 p = log_sum_log (p, curr);
9683 }
97- return exp (p);
84+ return std:: exp (p);
9885}
9986
100- static size_t
101- state_pair_to_index (const string &s, const size_t idx) {
87+ static std:: size_t
88+ state_pair_to_index (const std:: string &s, const std:: size_t idx) {
10289 assert (idx < s.length () - 1 );
10390 const char a = s[idx];
10491 if (a == ' C' ) {
@@ -136,13 +123,13 @@ template <class T> struct PairStateCounter {
136123 return CC + CT + TC + TT;
137124 }
138125
139- string
126+ std:: string
140127 tostring () const {
141128 return toa (CC) + ' \t ' + toa (CT) + ' \t ' + toa (TC) + ' \t ' + toa (TT);
142129 }
143130
144131 void
145- increment (const size_t state) {
132+ increment (const std:: size_t state) {
146133 if (state == 0 )
147134 ++CC;
148135 else if (state == 1 )
@@ -156,19 +143,20 @@ template <class T> struct PairStateCounter {
156143
157144template <typename T>
158145void
159- fit_states (const epiread &er, vector<PairStateCounter<T>> &counts) {
160- for (size_t i = 0 ; i < er.length () - 1 ; ++i) {
161- const size_t pos = er.pos + i;
146+ fit_states (const epiread &er, std:: vector<PairStateCounter<T>> &counts) {
147+ for (std:: size_t i = 0 ; i < er.length () - 1 ; ++i) {
148+ const std:: size_t pos = er.pos + i;
162149 assert (pos < counts.size ());
163- const size_t curr_state = state_pair_to_index (er.seq , i);
150+ const std:: size_t curr_state = state_pair_to_index (er.seq , i);
164151 counts[pos].increment (curr_state);
165152 }
166153}
167154
168155static void
169- collect_cpgs (const string &s, unordered_map<size_t , size_t > &cpgs) {
170- size_t cpg_idx = 0 ;
171- size_t nuc_idx = 0 ;
156+ collect_cpgs (const std::string &s,
157+ std::unordered_map<std::size_t , std::size_t > &cpgs) {
158+ std::size_t cpg_idx = 0 ;
159+ std::size_t nuc_idx = 0 ;
172160 const auto lim = end (s) - (s.size () == 0 ? 0 : 1 );
173161 for (auto itr = begin (s); itr != lim; ++itr, ++nuc_idx)
174162 if (*itr == ' C' && *(itr + 1 ) == ' G' )
@@ -179,53 +167,58 @@ collect_cpgs(const string &s, unordered_map<size_t, size_t> &cpgs) {
179167 consecutive CpG sites. So there would be one fewer of them than the
180168 total CpGs in a chromosome. */
181169static void
182- convert_coordinates (const string &chrom, vector<MSite> &sites) {
183- unordered_map<size_t , size_t > cpgs;
170+ convert_coordinates (const std:: string &chrom, std:: vector<MSite> &sites) {
171+ std:: unordered_map<std:: size_t , std:: size_t > cpgs;
184172 collect_cpgs (chrom, cpgs);
185- for (size_t i = 0 ; i < sites.size (); ++i) {
173+ for (std:: size_t i = 0 ; i < sites.size (); ++i) {
186174 const auto pos_itr = cpgs.find (sites[i].pos );
187175 if (pos_itr == end (cpgs))
188- throw runtime_error (" failed converting site:\n " + sites[i].tostring ());
176+ throw std::runtime_error (" failed converting site:\n " +
177+ sites[i].tostring ());
189178 sites[i].pos = pos_itr->second ;
190179 }
191180}
192181
193182template <typename T>
194183void
195- add_cytosine (const string &chrom_name, const size_t start_cpg,
196- vector<PairStateCounter<T>> &counts, vector<MSite> &cytosines) {
184+ add_cytosine (const std::string &chrom_name, const std::size_t start_cpg,
185+ std::vector<PairStateCounter<T>> &counts,
186+ std::vector<MSite> &cytosines) {
197187 std::ostringstream s;
198188 s << counts[start_cpg].score () << " \t " << counts[start_cpg].total () << " \t "
199189 << counts[start_cpg].tostring ();
200- const string name (s.str ());
190+ const std:: string name (s.str ());
201191 cytosines.push_back (MSite (chrom_name, start_cpg, ' +' , name, 0 , 0 ));
202192}
203193
204194template <typename T>
205195void
206- process_chrom (const string &chrom_name, const vector<epiread> &epireads,
207- vector<MSite> &cytosines, vector<PairStateCounter<T>> &counts) {
208- for (size_t i = 0 ; i < epireads.size (); ++i)
196+ process_chrom (const std::string &chrom_name,
197+ const std::vector<epiread> &epireads,
198+ std::vector<MSite> &cytosines,
199+ std::vector<PairStateCounter<T>> &counts) {
200+ for (std::size_t i = 0 ; i < epireads.size (); ++i)
209201 fit_states (epireads[i], counts);
210- for (size_t i = 0 ; i < counts.size (); ++i)
202+ for (std:: size_t i = 0 ; i < counts.size (); ++i)
211203 add_cytosine (chrom_name, i, counts, cytosines);
212204}
213205
214206static void
215- update_chroms_seen (const string &chrom_name,
216- unordered_set<string> &chroms_seen) {
207+ update_chroms_seen (const std:: string &chrom_name,
208+ std:: unordered_set<std:: string> &chroms_seen) {
217209 const auto chr_itr = chroms_seen.find (chrom_name);
218210 if (chr_itr != end (chroms_seen))
219- throw runtime_error (" chroms out of order: " + chrom_name);
211+ throw std:: runtime_error (" chroms out of order: " + chrom_name);
220212 chroms_seen.insert (chrom_name);
221213}
222214
223215static void
224- verify_chroms_available (const string &chrom_name,
225- unordered_map<string, size_t > &chrom_lookup) {
216+ verify_chroms_available (
217+ const std::string &chrom_name,
218+ std::unordered_map<std::string, std::size_t > &chrom_lookup) {
226219 const auto chr_itr = chrom_lookup.find (chrom_name);
227220 if (chr_itr == end (chrom_lookup))
228- throw runtime_error (" chrom not found: " + chrom_name);
221+ throw std:: runtime_error (" chrom not found: " + chrom_name);
229222}
230223
231224int
@@ -236,8 +229,8 @@ computes probability of allele-specific methylation at each tuple of CpGs
236229)" ;
237230 bool VERBOSE = false ;
238231
239- string outfile;
240- string chroms_dir;
232+ std:: string outfile;
233+ std:: string chroms_dir;
241234 /* ***************** COMMAND LINE OPTIONS ********************/
242235 OptionParser opt_parse (argv[0 ], // NOLINT(*-pointer-arithmetic)
243236 description, " <epireads>" );
@@ -246,79 +239,79 @@ computes probability of allele-specific methylation at each tuple of CpGs
246239 opt_parse.add_opt (" chrom" , ' c' , " genome sequence file/directory" , true ,
247240 chroms_dir);
248241 opt_parse.add_opt (" verbose" , ' v' , " print more run info" , false , VERBOSE);
249- vector<string> leftover_args;
242+ std:: vector<std:: string> leftover_args;
250243 opt_parse.parse (argc, argv, leftover_args);
251244 if (argc == 1 || opt_parse.help_requested ()) {
252- cerr << opt_parse.help_message () << ' \n '
253- << opt_parse.about_message () << ' \n ' ;
245+ std:: cerr << opt_parse.help_message () << ' \n '
246+ << opt_parse.about_message () << ' \n ' ;
254247 return EXIT_SUCCESS;
255248 }
256249 if (opt_parse.about_requested ()) {
257- cerr << opt_parse.about_message () << ' \n ' ;
250+ std:: cerr << opt_parse.about_message () << ' \n ' ;
258251 return EXIT_SUCCESS;
259252 }
260253 if (opt_parse.option_missing ()) {
261- cerr << opt_parse.option_missing_message () << ' \n ' ;
254+ std:: cerr << opt_parse.option_missing_message () << ' \n ' ;
262255 return EXIT_SUCCESS;
263256 }
264257 if (leftover_args.size () != 1 ) {
265- cerr << opt_parse.help_message () << ' \n ' ;
258+ std:: cerr << opt_parse.help_message () << ' \n ' ;
266259 return EXIT_SUCCESS;
267260 }
268- const string epi_file (leftover_args.front ());
261+ const std:: string epi_file (leftover_args.front ());
269262 /* ***************** END COMMAND LINE OPTIONS *****************/
270263
271- vector<string> chrom_names;
272- vector<string> chroms;
264+ std:: vector<std:: string> chrom_names;
265+ std:: vector<std:: string> chroms;
273266 read_fasta_file_short_names (chroms_dir, chrom_names, chroms);
274267 for (auto &&i : chroms)
275268 transform (begin (i), end (i), begin (i),
276269 [](const char c) { return std::toupper (c); });
277270
278271 // lookup to map chrom names to chrom sequences
279- unordered_map<string, size_t > chrom_lookup;
280- for (size_t i = 0 ; i < chrom_names.size (); ++i)
272+ std:: unordered_map<std:: string, std:: size_t > chrom_lookup;
273+ for (std:: size_t i = 0 ; i < chrom_names.size (); ++i)
281274 chrom_lookup.insert (make_pair (chrom_names[i], i));
282275
283- unordered_map<string, size_t > chrom_sizes;
284- for (size_t i = 0 ; i < chrom_names.size (); ++i) {
285- size_t cpg_count = 0 ;
286- for (size_t j = 0 ; j < chroms[i].size () - 1 ; ++j)
276+ std:: unordered_map<std:: string, std:: size_t > chrom_sizes;
277+ for (std:: size_t i = 0 ; i < chrom_names.size (); ++i) {
278+ std:: size_t cpg_count = 0 ;
279+ for (std:: size_t j = 0 ; j < chroms[i].size () - 1 ; ++j)
287280 cpg_count += (chroms[i][j] == ' C' && chroms[i][j + 1 ] == ' G' );
288281 chrom_sizes.insert (make_pair (chrom_names[i], cpg_count));
289282 }
290283
291284 if (VERBOSE)
292- cerr << " number of chromosomes: " << chrom_sizes.size () << ' \n ' ;
285+ std:: cerr << " number of chromosomes: " << chrom_sizes.size () << ' \n ' ;
293286
294287 std::ifstream in (epi_file);
295288 if (!in)
296- throw runtime_error (" cannot open input file: " + epi_file);
289+ throw std:: runtime_error (" cannot open input file: " + epi_file);
297290
298- std::ofstream of;
299- if (!outfile.empty ())
300- of.open (outfile);
301- std::ostream out (outfile.empty () ? cout.rdbuf () : of.rdbuf ());
291+ std::ofstream out (outfile);
292+ if (!out)
293+ throw std::runtime_error (" failed to open output file: " + outfile);
302294
303- unordered_set<string> chroms_seen;
304- string chrom;
295+ std:: unordered_set<std:: string> chroms_seen;
296+ std:: string chrom;
305297 epiread er;
306- vector<epiread> epireads;
298+ std:: vector<epiread> epireads;
307299 while (in >> er) {
308300 if (er.chr != chrom) {
309301 update_chroms_seen (er.chr , chroms_seen);
310302 verify_chroms_available (er.chr , chrom_lookup);
311303
312304 if (VERBOSE)
313- cerr << " [processing " << er.chr << " ]" << ' \n ' ;
305+ std:: cerr << " [processing " << er.chr << " ]" << ' \n ' ;
314306
315307 if (!chrom.empty ()) {
316- vector<PairStateCounter<uint32_t >> counts (chrom_sizes[chrom] - 1 );
317- vector<MSite> cytosines;
308+ std::vector<PairStateCounter<std::uint32_t >> counts (
309+ chrom_sizes[chrom] - 1 );
310+ std::vector<MSite> cytosines;
318311 process_chrom (chrom, epireads, cytosines, counts);
319- const size_t chrom_idx = chrom_lookup[chrom];
312+ const std:: size_t chrom_idx = chrom_lookup[chrom];
320313 convert_coordinates (chroms[chrom_idx], cytosines);
321- for (size_t i = 0 ; i < cytosines.size () - 1 ; ++i) {
314+ for (std:: size_t i = 0 ; i < cytosines.size () - 1 ; ++i) {
322315 out << cytosines[i].chrom << " \t " << cytosines[i].pos
323316 << " \t +\t CpG\t " << cytosines[i].context << ' \n ' ;
324317 }
@@ -329,22 +322,23 @@ computes probability of allele-specific methylation at each tuple of CpGs
329322 chrom.swap (er.chr );
330323 }
331324 if (!chrom.empty ()) {
332- vector<PairStateCounter<uint32_t >> counts (chrom_sizes[chrom] - 1 );
333- vector<MSite> cytosines;
325+ std::vector<PairStateCounter<std::uint32_t >> counts (chrom_sizes[chrom] -
326+ 1 );
327+ std::vector<MSite> cytosines;
334328 process_chrom (chrom, epireads, cytosines, counts);
335- const size_t chrom_idx = chrom_lookup[chrom];
329+ const std:: size_t chrom_idx = chrom_lookup[chrom];
336330 convert_coordinates (chroms[chrom_idx], cytosines);
337- for (size_t i = 0 ; i < cytosines.size () - 1 ; ++i) {
331+ for (std:: size_t i = 0 ; i < cytosines.size () - 1 ; ++i) {
338332 out << cytosines[i].chrom << " \t " << cytosines[i].pos << " \t +\t CpG\t "
339333 << cytosines[i].context << ' \n ' ;
340334 }
341335 }
342336 }
343337 catch (const std::exception &e) {
344- cerr << e.what () << ' \n ' ;
338+ std:: cerr << e.what () << ' \n ' ;
345339 return EXIT_FAILURE;
346340 }
347341 return EXIT_SUCCESS;
348342}
349343
350- // NOLINTEND(*-avoid-magic-numbers,*-narrowing-conversions,*-prefer-member-initializer )
344+ // NOLINTEND(*-avoid-magic-numbers,*-narrowing-conversions)
0 commit comments