1- /* unxcounts: reverse the process of xcounts and generate the counts
2- * file, including sites not covered.
1+ /* Copyright (C) 2023 Andrew D. Smith
32 *
4- * Copyright (C) 2023 Andrew D. Smith
3+ * This program is free software: you can redistribute it and/or modify it
4+ * under the terms of the GNU General Public License as published by the Free
5+ * Software Foundation, either version 3 of the License, or (at your option)
6+ * any later version.
57 *
6- * Authors: Andrew D. Smith
7- *
8- * This program is free software: you can redistribute it and/or
9- * modify it under the terms of the GNU General Public License as
10- * published by the Free Software Foundation, either version 3 of the
11- * License, or (at your option) any later version.
12- *
13- * This program is distributed in the hope that it will be useful, but
14- * WITHOUT ANY WARRANTY; without even the implied warranty of
15- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16- * General Public License for more details.
8+ * This program is distributed in the hope that it will be useful, but WITHOUT
9+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
10+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
11+ * more details.
1712 */
1813
14+ [[maybe_unused]] static constexpr auto about = R"(
15+ unxcounts: reverse the process of xcounts and generate the counts file,
16+ including sites not covered.
17+ )" ;
18+
1919#include " OptionParser.hpp"
2020#include " bsutils.hpp"
2121#include " counts_header.hpp"
2828#include < htslib/sam.h>
2929
3030#include < algorithm>
31+ #include < array>
3132#include < cassert>
3233#include < cctype>
3334#include < charconv>
4546#include < utility>
4647#include < vector>
4748
48- // NOLINTBEGIN(*-avoid-c-arrays,*-avoid-magic-numbers,*-avoid-non-const-global-variables,*- narrowing-conversions,*-constant-array-index,*-pointer-arithmetic )
49+ // NOLINTBEGIN(*-narrowing-conversions)
4950
5051static void
5152read_fasta_file_short_names_uppercase (const std::string &chroms_file,
@@ -55,8 +56,8 @@ read_fasta_file_short_names_uppercase(const std::string &chroms_file,
5556 names.clear ();
5657 read_fasta_file_short_names (chroms_file, names, chroms);
5758 for (auto &i : chroms)
58- transform (std::cbegin (i), std::cend (i), begin (i),
59- [](const char c) { return std::toupper (c); });
59+ std:: transform (std::cbegin (i), std::cend (i), begin (i),
60+ [](const char c) { return std::toupper (c); });
6061}
6162
6263static void
@@ -83,7 +84,7 @@ verify_chrom_orders(
8384 throw std::runtime_error (" failed to acquire buffer" );
8485
8586 while (bamxx::getline (in, line)) {
86- if (std::isdigit (line.s [0 ]))
87+ if (std::isdigit (line.s [0 ])) // NOLINT(*-pointer-arithmetic)
8788 continue ;
8889 if (is_counts_header_line (line.s ))
8990 continue ;
@@ -110,20 +111,22 @@ verify_chrom_orders(
110111 std::cerr << " chrom orders are consistent" << " \n " ;
111112}
112113
// Context labels copied into each output record by write_missing and
// write_site (std::copy_n(tag_values[the_tag], tag_sizes[the_tag], ...)).
// The trailing "// N" comments give each label's index; tag_sizes holds the
// matching lengths in the same order.
113- static const char * tag_values[] = {
114+ static constexpr auto tag_values = std::array< const char *, 5 > {
114115 " CpG" , // 0
115116 " CHH" , // 1
116117 " CXG" , // 2
117118 " CCG" , // 3
118119 " N" // 4
119120};
120121
// Number of characters to copy for each entry of tag_values, in the same
// index order; passed as the count to std::copy_n when a label is written.
121- static const int tag_sizes[] = {3 , 3 , 3 , 3 , 1 };
122+ static constexpr auto tag_sizes = std::array<int , 5 >{
123+ 3 , 3 , 3 , 3 , 1 ,
124+ };
122125
123- // ADS: the values below allow for things like CHH where the is a N in
124- // the triplet; I'm allowing that for consistency with the weird logic
125- // from earlier versions.
126- const std:: uint32_t context_codes[] = {
126+ // ADS: the values below allow for things like CHH where there is an N in the
127+ // triplet; I'm allowing that for consistency with the weird logic from
128+ // earlier versions.
129+ static constexpr auto context_codes = std::array<std:: uint32_t , 25 > {
127130 /* CAA CHH*/ 1 ,
128131 /* CAC CHH*/ 1 ,
129132 /* CAG CXG*/ 2 ,
@@ -154,26 +157,28 @@ const std::uint32_t context_codes[] = {
// Return the context code for position `pos` of `s`, taken from the two
// bases that follow it: base2int(s[pos + 1]) * 5 + base2int(s[pos + 2]) is
// used directly as the index into the 25-entry context_codes table.
// NOTE(review): presumably base2int yields 0..4 so the index stays below 25
// -- confirm against its definition.
// NOTE(review): nothing here checks that pos + 2 is inside `s`; confirm that
// callers never pass a position within the last two characters, or guard it.
154157static inline std::uint32_t
155158get_tag_from_genome_c (const std::string &s, const size_t pos) {
156159 const auto val = base2int (s[pos + 1 ]) * 5 + base2int (s[pos + 2 ]);
157- return context_codes[val];
160+ return context_codes[val]; // NOLINT(*-constant-array-index)
158161}
159162
// Return the context code for position `pos` of `s`, taken from the
// complements of the two bases that precede it:
// base2int(complement(s[pos - 1])) * 5 + base2int(complement(s[pos - 2]))
// is used directly as the index into the 25-entry context_codes table.
// NOTE(review): `pos` is size_t, so pos - 1 / pos - 2 wrap around when
// pos < 2 and the reads land far outside `s`. write_all_sites starts
// write_missing at position 0, so a G in the first two bases of a chromosome
// looks reachable -- confirm and guard if so.
160163static inline std::uint32_t
161164get_tag_from_genome_g (const std::string &s, const size_t pos) {
162165 const auto val =
163166 base2int (complement (s[pos - 1 ])) * 5 + base2int (complement (s[pos - 2 ]));
164- return context_codes[val];
167+ return context_codes[val]; // NOLINT(*-constant-array-index)
165168}
166169
167170static bool
168171write_missing (const std::uint32_t name_size, const std::string &chrom,
169172 const std::uint64_t start_pos, const std::uint64_t end_pos,
170173 std::vector<char > &buf, bamxx::bgzf_file &out) {
171174 static constexpr auto zeros = " \t 0\t 0\n " ;
175+ static constexpr auto zeros_sz = 5 ;
172176 static constexpr auto pos_strand = " \t +\t " ;
173177 static constexpr auto neg_strand = " \t -\t " ;
174- const auto buf_end = buf.data () + size (buf);
178+ const auto buf_end =
179+ buf.data () + std::size (buf); // NOLINT(*-pointer-arithmetic)
175180 // chrom name is already in the buffer so move past it
176- auto cursor = buf.data () + name_size + 1 ;
181+ auto cursor = buf.data () + name_size + 1 ; // NOLINT(*-pointer-arithmetic)
177182 for (auto pos = start_pos; pos < end_pos; ++pos) {
178183 const char base = chrom[pos];
179184 if (is_cytosine (base) || is_guanine (base)) {
@@ -182,10 +187,12 @@ write_missing(const std::uint32_t name_size, const std::string &chrom,
182187 : get_tag_from_genome_g (chrom, pos);
183188#pragma GCC diagnostic push
184189#pragma GCC diagnostic error "-Wstringop-overflow=0"
190+ // NOLINTBEGIN(*-constant-array-index)
185191 auto [ptr, ec] = std::to_chars (cursor, buf_end, pos);
186192 ptr = std::copy_n (is_c ? pos_strand : neg_strand, 3 , ptr);
187193 ptr = std::copy_n (tag_values[the_tag], tag_sizes[the_tag], ptr);
188- ptr = std::copy_n (zeros, 5 , ptr);
194+ ptr = std::copy_n (zeros, zeros_sz, ptr);
195+ // NOLINTEND(*-constant-array-index)
189196 const auto sz = std::distance (buf.data (), ptr);
190197#pragma GCC diagnostic push
191198
@@ -201,21 +208,26 @@ write_missing_cpg(const std::uint32_t &name_size, const std::string &chrom,
201208 const std::uint64_t start_pos, const std::uint64_t end_pos,
202209 std::vector<char > &buf, bamxx::bgzf_file &out) {
203210 static constexpr auto zeros = " \t 0\t 0\n " ;
211+ static constexpr auto zeros_sz = 5 ;
204212 static constexpr auto pos_strand = " \t +\t " ;
205- const auto buf_end = buf.data () + size (buf);
213+ static constexpr auto pos_strand_sz = 3 ;
214+ const auto buf_end =
215+ buf.data () + std::size (buf); // NOLINT(*-pointer-arithmetic)
206216 // chrom name is already in the buffer so move past it
207- auto cursor = buf.data () + name_size + 1 ;
217+ auto cursor = buf.data () + name_size + 1 ; // NOLINT(*-pointer-arithmetic)
208218 for (auto pos = start_pos; pos < end_pos - 1 ; ++pos) {
209- // When this function is called, the "end_pos" is either the chrom
210- // size or the position of a base known to be a C. So we never
211- // have to allow pos+1 to equal end_pos.
219+ // When this function is called, the "end_pos" is either the chrom size or
220+ // the position of a base known to be a C. So we never have to allow pos+1
221+ // to equal end_pos.
212222 if (is_cytosine (chrom[pos]) && is_guanine (chrom[pos + 1 ])) {
213223#pragma GCC diagnostic push
214224#pragma GCC diagnostic error "-Wstringop-overflow=0"
225+ // NOLINTBEGIN(*-constant-array-index)
215226 auto [ptr, ec] = std::to_chars (cursor, buf_end, pos);
216- ptr = std::copy_n (pos_strand, 3 , ptr);
227+ ptr = std::copy_n (pos_strand, pos_strand_sz , ptr);
217228 ptr = std::copy_n (" CpG" , 3 , ptr);
218- ptr = std::copy_n (zeros, 5 , ptr);
229+ ptr = std::copy_n (zeros, zeros_sz, ptr);
230+ // NOLINTEND(*-constant-array-index)
219231 const auto sz = std::distance (buf.data (), ptr);
220232#pragma GCC diagnostic push
221233 if (bgzf_write (out.f , buf.data (), sz) != sz)
@@ -233,8 +245,11 @@ write_site(const std::uint32_t name_size, const std::string &chrom,
233245 static constexpr auto pos_strand = " \t +\t " ;
234246 static constexpr auto neg_strand = " \t -\t " ;
235247 static constexpr auto fmt = std::chars_format::general;
248+ // use default precision, 6, same as std::cout default
249+ static constexpr auto precision = 6 ;
236250
237- const auto buf_end = buf.data () + size (buf);
251+ const auto buf_end =
252+ buf.data () + std::size (buf); // NOLINT(*-pointer-arithmetic)
238253 const char base = chrom[pos];
239254 assert (is_cytosine (base) || is_guanine (base));
240255 const bool is_c = is_cytosine (base);
@@ -246,17 +261,17 @@ write_site(const std::uint32_t name_size, const std::string &chrom,
246261#pragma GCC diagnostic push
247262#pragma GCC diagnostic error "-Wstringop-overflow=0"
248263 // chrom name is already in the buffer so move past it
249- auto cursor = buf.data () + name_size + 1 ;
264+ auto cursor = buf.data () + name_size + 1 ; // NOLINT(*-pointer-arithmetic)
250265 {
251266 auto [ptr, ec] = std::to_chars (cursor, buf_end, pos);
252267 cursor = ptr;
253268 }
254269 cursor = std::copy_n (is_c ? pos_strand : neg_strand, 3 , cursor);
270+ // NOLINTNEXTLINE(*-constant-array-index)
255271 cursor = std::copy_n (tag_values[the_tag], tag_sizes[the_tag], cursor);
256272 *cursor++ = ' \t ' ;
257273 {
258- // use default precision, 6, same as std::cout default
259- auto [ptr, ec] = std::to_chars (cursor, buf_end, meth, fmt, 6 );
274+ auto [ptr, ec] = std::to_chars (cursor, buf_end, meth, fmt, precision);
260275 cursor = ptr;
261276 }
262277 *cursor++ = ' \t ' ;
@@ -318,11 +333,11 @@ get_lookups(const std::vector<std::string> &names,
318333 std::vector<std::uint64_t > &chrom_sizes) {
319334 chrom_lookup.clear ();
320335 name_to_id.clear ();
321- chrom_sizes = std::vector<std::uint64_t >(size (chroms), 0 );
322- for (size_t i = 0 ; i < size (chroms); ++i) {
336+ chrom_sizes = std::vector<std::uint64_t >(std:: size (chroms), 0 );
337+ for (size_t i = 0 ; i < std:: size (chroms); ++i) {
323338 chrom_lookup[names[i]] = std::cbegin (chroms) + i;
324339 name_to_id[names[i]] = i;
325- chrom_sizes[i] = size (chroms[i]);
340+ chrom_sizes[i] = std:: size (chroms[i]);
326341 }
327342}
328343
@@ -332,7 +347,8 @@ process_header_line(
332347 const std::vector<std::uint64_t > &chrom_sizes, const kstring_t &line,
333348 bamxx::bgzf_file &out) {
334349 std::string hdr_line{line.s };
335- if (size (hdr_line) > 1 && !verify_chrom (hdr_line, name_to_id, chrom_sizes))
350+ if (std::size (hdr_line) > 1 &&
351+ !verify_chrom (hdr_line, name_to_id, chrom_sizes))
336352 throw std::runtime_error{" failed to verify header for: " + hdr_line};
337353 if (!write_counts_header_line (hdr_line, out))
338354 throw std::runtime_error{" failed to write header line: " + hdr_line};
@@ -351,7 +367,8 @@ write_all_sites(const bool verbose, const std::uint32_t prev_chr_id,
351367 auto res =
352368 std::copy (std::cbegin (names[i]), std::cend (names[i]), buf.data ());
353369 *res = ' \t ' ;
354- write_missing (size (names[i]), chroms[i], 0u , size (chroms[i]), buf, out);
370+ write_missing (std::size (names[i]), chroms[i], 0u , std::size (chroms[i]), buf,
371+ out);
355372 }
356373}
357374
@@ -364,11 +381,11 @@ process_sites(const bool verbose, const bool add_missing_chroms,
364381 std::vector<std::string> chroms, names;
365382 read_fasta_file_short_names_uppercase (chroms_file, names, chroms);
366383 if (verbose)
367- std::cerr << " [n chroms in reference: " << chroms. size () << " ]" << " \n " ;
384+ std::cerr << " [n chroms in reference: " << std:: size (chroms ) << " ]" << " \n " ;
368385
369386 std::unordered_map<std::string, chrom_itr_t > chrom_lookup;
370387 std::unordered_map<std::string, std::int32_t > name_to_id;
371- std::vector<std::uint64_t > chrom_sizes (size (chroms), 0 );
388+ std::vector<std::uint64_t > chrom_sizes (std:: size (chroms), 0 );
372389 get_lookups (names, chroms, chrom_lookup, name_to_id, chrom_sizes);
373390
374391 if (add_missing_chroms)
@@ -412,15 +429,16 @@ process_sites(const bool verbose, const bool add_missing_chroms,
412429 while (getline (in, line)) {
413430 if (is_counts_header_line (line.s )) {
414431 process_header_line (name_to_id, chrom_sizes, line, out);
415- continue ; // ADS: early loop exit
432+ continue ; // ADS: just skip headers
416433 }
417434
418- if (!std::isdigit (line.s [0 ])) { // check if we have a chrom line
435+ // check if we have a chrom line
436+ if (!std::isdigit (line.s [0 ])) { // NOLINT(*-pointer-arithmetic)
419437 if (!require_covered && pos != std::numeric_limits<std::uint64_t >::max ())
420- write_missing (nm_sz, *ch_itr, pos + 1 , size (*ch_itr), buf, out);
438+ write_missing (nm_sz, *ch_itr, pos + 1 , std:: size (*ch_itr), buf, out);
421439
422440 chrom_name = std::string{line.s };
423- nm_sz = size (chrom_name);
441+ nm_sz = std:: size (chrom_name);
424442 const std::int32_t chr_id = get_chrom_id (name_to_id, chrom_name);
425443
426444 if (add_missing_chroms)
@@ -438,10 +456,12 @@ process_sites(const bool verbose, const bool add_missing_chroms,
438456 }
439457 else {
440458 std::uint32_t pos_step = 0 , n_meth = 0 , n_unmeth = 0 ;
459+ // NOLINTBEGIN(*-pointer-arithmetic)
441460 const auto end_line = line.s + line.l ;
442461 auto res = std::from_chars (line.s , end_line, pos_step);
443462 res = std::from_chars (res.ptr + 1 , end_line, n_meth);
444463 res = std::from_chars (res.ptr + 1 , end_line, n_unmeth);
464+ // NOLINTEND(*-pointer-arithmetic)
445465
446466 const auto curr_pos = pos + pos_step;
447467 if (!require_covered && pos + 1 < curr_pos)
@@ -452,9 +472,9 @@ process_sites(const bool verbose, const bool add_missing_chroms,
452472 }
453473 }
454474 if (!require_covered)
455- write_missing (nm_sz, *ch_itr, pos + 1 , size (*ch_itr), buf, out);
475+ write_missing (nm_sz, *ch_itr, pos + 1 , std:: size (*ch_itr), buf, out);
456476 if (add_missing_chroms)
457- write_all_sites (verbose, prev_chr_id, size (chroms), names, chroms, buf,
477+ write_all_sites (verbose, prev_chr_id, std:: size (chroms), names, chroms, buf,
458478 out);
459479}
460480
@@ -471,7 +491,8 @@ write_all_cpgs(const bool verbose, const std::uint32_t prev_chr_id,
471491 auto res =
472492 std::copy (std::cbegin (names[i]), std::cend (names[i]), buf.data ());
473493 *res = ' \t ' ;
474- write_missing_cpg (size (names[i]), chroms[i], 0u , size (chroms[i]), buf, out);
494+ write_missing_cpg (std::size (names[i]), chroms[i], 0u , std::size (chroms[i]),
495+ buf, out);
475496 }
476497}
477498
@@ -484,11 +505,11 @@ process_cpg_sites(const bool verbose, const bool add_missing_chroms,
484505 std::vector<std::string> chroms, names;
485506 read_fasta_file_short_names_uppercase (chroms_file, names, chroms);
486507 if (verbose)
487- std::cerr << " [n chroms in reference: " << chroms. size () << " ]" << " \n " ;
508+ std::cerr << " [n chroms in reference: " << std:: size (chroms ) << " ]" << " \n " ;
488509
489510 std::unordered_map<std::string, chrom_itr_t > chrom_lookup;
490511 std::unordered_map<std::string, std::int32_t > name_to_id;
491- std::vector<std::uint64_t > chrom_sizes (size (chroms), 0 );
512+ std::vector<std::uint64_t > chrom_sizes (std:: size (chroms), 0 );
492513 get_lookups (names, chroms, chrom_lookup, name_to_id, chrom_sizes);
493514
494515 if (add_missing_chroms)
@@ -535,12 +556,14 @@ process_cpg_sites(const bool verbose, const bool add_missing_chroms,
535556 continue ; // ADS: early loop exit
536557 }
537558
538- if (!std::isdigit (line.s [0 ])) { // check if we have a chrom line
559+ // check if we have a chrom line
560+ if (!std::isdigit (line.s [0 ])) { // NOLINT(*-pointer-arithmetic)
539561 if (!require_covered && pos != std::numeric_limits<std::uint64_t >::max ())
540- write_missing_cpg (nm_sz, *ch_itr, pos + 1 , size (*ch_itr), buf, out);
562+ write_missing_cpg (nm_sz, *ch_itr, pos + 1 , std::size (*ch_itr), buf,
563+ out);
541564
542565 chrom_name = std::string{line.s };
543- nm_sz = size (chrom_name);
566+ nm_sz = std:: size (chrom_name);
544567 const std::int32_t chr_id = get_chrom_id (name_to_id, chrom_name);
545568
546569 if (add_missing_chroms)
@@ -558,10 +581,12 @@ process_cpg_sites(const bool verbose, const bool add_missing_chroms,
558581 }
559582 else {
560583 std::uint32_t pos_step = 0 , n_meth = 0 , n_unmeth = 0 ;
584+ // NOLINTBEGIN(*-pointer-arithmetic)
561585 const auto end_line = line.s + line.l ;
562586 auto res = std::from_chars (line.s , end_line, pos_step);
563587 res = std::from_chars (res.ptr + 1 , end_line, n_meth);
564588 res = std::from_chars (res.ptr + 1 , end_line, n_unmeth);
589+ // NOLINTEND(*-pointer-arithmetic)
565590
566591 const auto curr_pos = pos + pos_step;
567592 if (!require_covered && pos + 1 < curr_pos)
@@ -572,9 +597,10 @@ process_cpg_sites(const bool verbose, const bool add_missing_chroms,
572597 }
573598 }
574599 if (!require_covered)
575- write_missing_cpg (nm_sz, *ch_itr, pos + 1 , size (*ch_itr), buf, out);
600+ write_missing_cpg (nm_sz, *ch_itr, pos + 1 , std:: size (*ch_itr), buf, out);
576601 if (add_missing_chroms)
577- write_all_cpgs (verbose, prev_chr_id, size (chroms), names, chroms, buf, out);
602+ write_all_cpgs (verbose, prev_chr_id, std::size (chroms), names, chroms, buf,
603+ out);
578604}
579605
580606int
@@ -622,7 +648,7 @@ main_unxcounts(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays)
622648 std::cerr << opt_parse.option_missing_message () << " \n " ;
623649 return EXIT_SUCCESS;
624650 }
625- if (leftover_args. size () != 1 ) {
651+ if (std:: size (leftover_args ) != 1 ) {
626652 std::cerr << opt_parse.help_message () << " \n " ;
627653 return EXIT_SUCCESS;
628654 }
@@ -653,4 +679,4 @@ main_unxcounts(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays)
653679 return EXIT_SUCCESS;
654680}
655681
656- // NOLINTEND(*-avoid-c-arrays,*-avoid-magic-numbers,*-avoid-non-const-global-variables,*- narrowing-conversions,*-constant-array-index,*-pointer-arithmetic )
682+ // NOLINTEND(*-narrowing-conversions)
0 commit comments