diff --git a/Makefile.am b/Makefile.am index 19871763..a963cc47 100644 --- a/Makefile.am +++ b/Makefile.am @@ -153,6 +153,7 @@ test_scripts/test_unxcounts.log: test_scripts/test_xcounts.log noinst_LIBRARIES = libdnmtools.a libdnmtools_a_SOURCES = \ + src/common/file_progress.cpp \ src/common/Interval.cpp \ src/common/Interval6.cpp \ src/common/dnmtools_gaussinv.cpp \ @@ -175,6 +176,7 @@ libdnmtools_a_SOURCES = \ libdnmtools_a_SOURCES += \ src/common/dnmtools_lgamma.hpp \ + src/common/file_progress.hpp \ src/common/Interval.hpp \ src/common/Interval6.hpp \ src/common/dnmtools_gaussinv.hpp \ diff --git a/src/amrfinder/amrtester.cpp b/src/amrfinder/amrtester.cpp index d76ecfac..247ed2e2 100644 --- a/src/amrfinder/amrtester.cpp +++ b/src/amrfinder/amrtester.cpp @@ -48,11 +48,11 @@ methylation static void backup_to_start_of_current_record(std::ifstream &in) { static constexpr std::size_t assumed_max_valid_line_width = 10000; - std::size_t count = 0; + std::size_t byte_count = 0; while (in.tellg() > 0 && in.peek() != '\n' && in.peek() != '\r' && - count++ < assumed_max_valid_line_width) + byte_count++ < assumed_max_valid_line_width) in.seekg(-1, std::ios_base::cur); - if (count > assumed_max_valid_line_width) + if (byte_count > assumed_max_valid_line_width) throw std::runtime_error("file contains a line longer than " + std::to_string(assumed_max_valid_line_width)); } diff --git a/src/analysis/multimethstat.cpp b/src/analysis/multimethstat.cpp index 3b4d1051..02b55654 100644 --- a/src/analysis/multimethstat.cpp +++ b/src/analysis/multimethstat.cpp @@ -150,7 +150,8 @@ process_with_cpgs_loaded(const bool verbose, throw std::runtime_error("cannot open file: " + cpgs_file); std::string header; - getline(in, header); + if (!getline(in, header)) + throw std::runtime_error("failed to read header"); std::istringstream iss(header); std::string col_name; diff --git a/src/analysis/pmd.cpp b/src/analysis/pmd.cpp index 02c6e0ba..a084e531 100644 --- a/src/analysis/pmd.cpp +++ b/src/analysis/pmd.cpp @@ -712,11 +712,13 @@ check_if_array_data(const std::string &infile) { skip_counts_header(in); std::string line; - getline(in, line); + if (!getline(in, line)) + throw std::runtime_error("failed to read line from file: " + infile); + std::istringstream iss(line); std::string chrom, pos, strand, seq, meth, cov; iss >> chrom >> pos >> strand >> seq >> meth; - return (!(iss >> cov)); + return !(iss >> cov); } static void @@ -1205,8 +1207,7 @@ main_pmd(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) double confidence_interval = default_confidence_interval; double prop_accept = default_prop_accept; for (std::size_t i = 0; i < n_replicates && !insufficient_data; ++i) { - const bool arrayData = check_if_array_data(cpgs_file[i]); - if (!arrayData) { + if (!check_if_array_data(cpgs_file[i])) { bin_size = binsize_selection(resolution, min_bin_size, max_bin_size, confidence_interval, prop_accept, cpgs_file[i]); diff --git a/src/analysis/roimethstat.cpp b/src/analysis/roimethstat.cpp index 560e5b41..e6b3126a 100644 --- a/src/analysis/roimethstat.cpp +++ b/src/analysis/roimethstat.cpp @@ -405,8 +405,9 @@ get_bed_columns(const std::string ®ions_file) { return n_columns; } -static auto +[[nodiscard]] static auto get_intervals(const std::string &filename) -> std::vector { + static constexpr auto n_bed6_cols = 6; const auto field_count = [&] { bamxx::bgzf_file in(filename, "r"); if (!in) @@ -419,13 +420,16 @@ get_intervals(const std::string &filename) -> std::vector { (std::istream_iterator())); return std::size(v); }(); - if (field_count >= 6) + if (field_count >= n_bed6_cols) return read_intervals6(filename); - auto without_names = read_intervals(filename); - std::vector intervals; - for (const auto &w : without_names) - intervals.emplace_back(w.chrom, w.start, w.stop, std::string{}, 0, '+'); + const auto without_names = read_intervals(filename); + std::vector intervals(std::size(without_names)); + std::transform(std::cbegin(without_names), std::cend(without_names), + std::begin(intervals), [&](const auto &w) { + return Interval6(w.chrom, w.start, w.stop, std::string{}, 0, + '+'); + }); return intervals; } diff --git a/src/common/file_progress.cpp b/src/common/file_progress.cpp new file mode 100644 index 00000000..4891e534 --- /dev/null +++ b/src/common/file_progress.cpp @@ -0,0 +1,48 @@ +/* Copyright (C) 2025 Andrew D Smith + * + * This program is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include "file_progress.hpp" + +#include +#include +#include +#include +#include + +// NOLINTBEGIN(*-narrowing-conversions) + +file_progress::file_progress(const std::string &filename) : + one_thousand_over_filesize{ + static_cast(one_thousand) / + static_cast(std::filesystem::file_size(filename))} {} + +void +file_progress::operator()( + std::ifstream &in) { // cppcheck-suppress constParameterReference + static constexpr auto ten = 10.0; + const std::size_t curr_offset = + in.eof() ? one_thousand : in.tellg() * one_thousand_over_filesize; + if (curr_offset <= prev_offset) + return; + std::ios old_state(nullptr); + old_state.copyfmt(std::cerr); + std::cerr << "\r[progress: " << std::setw(5) // NOLINT(*-avoid-magic-numbers) + << std::fixed << std::setprecision(1) << (curr_offset / ten) + << (curr_offset == one_thousand ? "%]\n" : "%]"); + std::cerr.copyfmt(old_state); + prev_offset = curr_offset == one_thousand + ? std::numeric_limits::max() + : curr_offset; +} + +// NOLINTEND(*-narrowing-conversions) diff --git a/src/common/file_progress.hpp b/src/common/file_progress.hpp new file mode 100644 index 00000000..d2531fba --- /dev/null +++ b/src/common/file_progress.hpp @@ -0,0 +1,32 @@ +/* Copyright (C) 2025 Andrew D Smith + * + * Author: Andrew D. Smith + * + * This program is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef SRC_COMMON_FILE_PROGRESS_HPP_ +#define SRC_COMMON_FILE_PROGRESS_HPP_ + +#include +#include +#include + +struct file_progress { + static constexpr auto one_thousand{1000ul}; + double one_thousand_over_filesize{}; + std::size_t prev_offset{}; + explicit file_progress(const std::string &filename); + void + operator()(std::ifstream &in); // cppcheck-suppress constParameterReference +}; + +#endif // SRC_COMMON_FILE_PROGRESS_HPP_ diff --git a/src/radmeth/radmeth.cpp b/src/radmeth/radmeth.cpp index 4f5e3a0a..b93369ac 100644 --- a/src/radmeth/radmeth.cpp +++ b/src/radmeth/radmeth.cpp @@ -1,7 +1,4 @@ -/* Copyright (C) 2013-2025 Andrew D Smith - * - * Author: Andrew D. Smith - * Contributors: Egor Dolzhenko and Guilherme Sena +/* Copyright (C) 2013-2025 Andrew D Smith, Egor Dolzhenko and Guilherme Sena * * This program is free software: you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -14,6 +11,7 @@ * more details. */ +#include "file_progress.hpp" #include "radmeth_design.hpp" #include "radmeth_model.hpp" #include "radmeth_optimize_params.hpp" diff --git a/src/radmeth/radmeth_nano.cpp b/src/radmeth/radmeth_nano.cpp index 5b76f495..5eeffffc 100644 --- a/src/radmeth/radmeth_nano.cpp +++ b/src/radmeth/radmeth_nano.cpp @@ -1,6 +1,4 @@ /* Copyright (C) 2025 Andrew D Smith - * - * Author: Andrew D. Smith * * This program is free software: you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the Free @@ -13,6 +11,7 @@ * more details. */ +#include "file_progress.hpp" #include "radmeth_design.hpp" #include "radmeth_model.hpp" #include "radmeth_optimize_gamma.hpp" @@ -36,8 +35,9 @@ #include #include +template [[nodiscard]] static bool -has_low_coverage(const Regression ®, const std::size_t test_factor) { +has_low_coverage(const RegressionType ®, const std::size_t test_factor) { bool cvrd_in_test_fact_smpls = false; const auto &tcol = reg.design.tmatrix[test_factor]; for (std::size_t i = 0; i < reg.n_samples() && !cvrd_in_test_fact_smpls; ++i) @@ -50,8 +50,9 @@ has_low_coverage(const Regression ®, const std::size_t test_factor) { return !cvrd_in_test_fact_smpls || !cvrd_in_other_smpls; } +template [[nodiscard]] static bool -has_extreme_counts(const Regression ®) { +has_extreme_counts(const RegressionType ®) { const auto &mc = reg.mc; bool full_meth = true; diff --git a/src/radmeth/radmeth_utils.cpp b/src/radmeth/radmeth_utils.cpp index 530199b6..53e950f8 100644 --- a/src/radmeth/radmeth_utils.cpp +++ b/src/radmeth/radmeth_utils.cpp @@ -17,15 +17,14 @@ #include #include +#include #include -#include #include -#include #include #include #include -// NOLINTBEGIN(*-narrowing-conversions,*-avoid-magic-numbers) +// NOLINTBEGIN(*-narrowing-conversions) [[nodiscard]] std::string format_duration(const std::chrono::duration elapsed) { @@ -39,32 +38,14 @@ format_duration(const std::chrono::duration elapsed) { const double seconds = tot_s - (hours * s_per_h) - (minutes * s_per_m); std::ostringstream oss; + // NOLINTBEGIN(*-avoid-magic-numbers) oss << std::setfill('0') << std::setw(2) << hours << ":" << std::setfill('0') << std::setw(2) << minutes << ":" << std::fixed << std::setprecision(2) << std::setw(5) << seconds; + // NOLINTEND(*-avoid-magic-numbers) return oss.str(); } -file_progress::file_progress(const std::string &filename) : - one_thousand_over_filesize{1000.0 / std::filesystem::file_size(filename)} {} - -void -file_progress::operator()( - std::ifstream &in) { // cppcheck-suppress constParameterReference - const std::size_t curr_offset = - in.eof() ? 1000 : in.tellg() * one_thousand_over_filesize; - if (curr_offset <= prev_offset) - return; - std::ios old_state(nullptr); - old_state.copyfmt(std::cerr); - std::cerr << "\r[progress: " << std::setw(5) << std::fixed - << std::setprecision(1) << (curr_offset / 10.0) - << (curr_offset == 1000 ? "%]\n" : "%]"); - std::cerr.copyfmt(old_state); - prev_offset = (curr_offset == 1000) ? std::numeric_limits::max() - : curr_offset; -} - // Series representation for the lower incomplete gamma P(a,x) [[nodiscard]] static double gamma_p_series(const double a, const double x) { @@ -113,19 +94,15 @@ gamma_q_contfrac(const double a, const double x) { // Regularized lower incomplete gamma P(a,x) [[nodiscard]] static double gamma_p(const double a, const double x) { - if (x < 0 || a <= 0) - return 0.0; - if (x == 0) - return 0.0; - if (x < a + 1.0) - return gamma_p_series(a, x); - return 1.0 - gamma_q_contfrac(a, x); + return x <= 0.0 || a <= 0 ? 0.0 + : (x < a + 1.0 ? gamma_p_series(a, x) + : 1.0 - gamma_q_contfrac(a, x)); } // chi-square CDF: P(k/2, x/2) [[nodiscard]] static double chi_square_cdf(const double x, const double k) { - return gamma_p(k * 0.5, x * 0.5); + return gamma_p(k / 2, x / 2); } // Given the maximum likelihood estimates of the full and reduced models, the @@ -148,4 +125,4 @@ llr_test(const double null_loglik, const double full_loglik) { return p_value; } -// NOLINTEND(*-narrowing-conversions,*-avoid-magic-numbers) +// NOLINTEND(*-narrowing-conversions) diff --git a/src/radmeth/radmeth_utils.hpp b/src/radmeth/radmeth_utils.hpp index 4522fe55..9245072b 100644 --- a/src/radmeth/radmeth_utils.hpp +++ b/src/radmeth/radmeth_utils.hpp @@ -17,22 +17,12 @@ #define RADMETH_UTILS_HPP #include -#include #include -#include #include [[nodiscard]] std::string format_duration(const std::chrono::duration elapsed); -struct file_progress { - double one_thousand_over_filesize{}; - std::size_t prev_offset{}; - explicit file_progress(const std::string &filename); - void - operator()(std::ifstream &in); // cppcheck-suppress constParameterReference -}; - [[nodiscard]] double llr_test(const double null_loglik, const double full_loglik); diff --git a/src/utils/merge-methcounts.cpp b/src/utils/merge-methcounts.cpp index 9eb3451c..df8f0f6a 100644 --- a/src/utils/merge-methcounts.cpp +++ b/src/utils/merge-methcounts.cpp @@ -1,21 +1,39 @@ -/* merge-methcounts: a program for merging methcounts files +/* Copyright (C) 2011-2025 Andrew D. Smith * - * Copyright (C) 2011-2025 University of Southern California and - * Andrew D. Smith + * This program is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. * - * Authors: Andrew D Smith - * - * This program is free software: you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation, either version 3 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. */ +[[maybe_unused]] static constexpr auto about = R"( +merge multiple counts format files into a table or single methylome. +)"; + +[[maybe_unused]] static constexpr auto description = R"( +This utility does two things, and they are grouped together here because of +how they are done, not because the uses are related. (1) merge-methcounts can +take a set of methcounts output files and combine them into one. There are +several reasons a user might want to do this. An example is when technical +replicates are performed, and analyzed separately to understand technical +variance (e.g., between sequencing runs or library preps). After examining the +technical variation, subsequent analyses might be best conducted on all the +data together. So all the methcounts files can be combined into one using +merge-methcounts. In this case, the coverage at any site is the sum of the +coverages in the original methcounts files, and the methylation level at any +site is the weighted mean. (2) merge-methcounts can take a set of methcounts +output files, and create a table that contains all the same information. The +table format is helpful if subsequent analyses are best done using a data +table, for example in R. When producing a tabular format, merge-methcounts +allows the user to select whether the desired output is in counts or +fractions. +)"; + #include "MSite.hpp" #include "OptionParser.hpp" @@ -27,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -41,14 +60,14 @@ #include #include -// NOLINTBEGIN(*-owning-memory,*-avoid-magic-numbers,*-narrowing-conversions,*-pointer-arithmetic) +// NOLINTBEGIN(*-narrowing-conversions) static void set_invalid(MSite &s) { s.pos = std::numeric_limits::max(); } -static bool +[[nodiscard]] static bool is_valid(const MSite &s) { return s.pos != std::numeric_limits::max(); } @@ -66,7 +85,7 @@ any_sites_unprocessed( outdated[i] = false; MSite tmp_site; if (read_site(*infiles[i], tmp_site)) { - // ADS: chrom order within a file already tested + // ADS: assume chrom order within a file already tested if (tmp_site.pos <= sites[i].pos && tmp_site.chrom == sites[i].chrom) throw std::runtime_error("sites not sorted in " + filenames[i]); sites_remain = true; @@ -111,7 +130,7 @@ find_minimum_site( ms_id = i; if (ms_id == std::numeric_limits::max()) - throw std::runtime_error("failed in a next site to print"); + throw std::runtime_error("failed to find next site to print"); // now find the earliest site to print among those that could be for (std::size_t i = 0; i < n_files; ++i) @@ -122,7 +141,7 @@ find_minimum_site( return ms_id; } -[[nodiscard]] static bool +[[nodiscard]] static inline bool same_location(const MSite &a, const MSite &b) { return a.chrom == b.chrom && a.pos == b.pos; } @@ -133,15 +152,11 @@ collect_sites_to_print( const std::unordered_map &chroms_order, const std::vector &outdated, std::vector &to_print) { const std::size_t n_files = std::size(sites); - const std::size_t min_site_idx = find_minimum_site(sites, chroms_order, outdated); - + const auto &min_site = sites[min_site_idx]; for (std::size_t i = 0; i < n_files; ++i) - // condition below covers "is_valid(sites[i])" - if (same_location(sites[min_site_idx], sites[i])) - to_print[i] = true; - + to_print[i] = same_location(min_site, sites[i]); return min_site_idx; } @@ -169,7 +184,7 @@ write_line_for_tabular(std::array &buffer, min_site.set_mutated(); // ADS: is this the format we want for the row names? - auto cursor = buffer.data(); + auto cursor = std::begin(buffer); auto bytes_left = buffer_size; { const auto n_bytes = @@ -177,7 +192,7 @@ write_line_for_tabular(std::array &buffer, min_site.pos, min_site.strand, min_site.context.data()); if (n_bytes < 0 || n_bytes == static_cast(bytes_left)) throw std::runtime_error("failed to write output line"); - cursor += n_bytes; + cursor += n_bytes; // NOLINT(*-pointer-arithmetic) bytes_left -= n_bytes; } @@ -185,13 +200,13 @@ write_line_for_tabular(std::array &buffer, for (std::size_t i = 0; i < n_files; ++i) { const std::size_t r = sites[i].n_reads; const auto n_bytes = [&] { - return (to_print[i] && r >= min_reads) + return to_print[i] && r >= min_reads ? std::snprintf(cursor, bytes_left, "\t%.6g", sites[i].meth) : std::snprintf(cursor, bytes_left, "\tNA"); }(); if (n_bytes < 0 || n_bytes == static_cast(bytes_left)) throw std::runtime_error("failed to write output line"); - cursor += n_bytes; + cursor += n_bytes; // NOLINT(*-pointer-arithmetic) bytes_left -= n_bytes; } } @@ -205,16 +220,16 @@ write_line_for_tabular(std::array &buffer, }(); if (n_bytes < 0 || n_bytes == static_cast(bytes_left)) throw std::runtime_error("failed to write output line"); - cursor += n_bytes; + cursor += n_bytes; // NOLINT(*-pointer-arithmetic) bytes_left -= n_bytes; } if (static_cast(buffer_size) <= - std::distance(buffer.data(), cursor)) + std::distance(std::begin(buffer), cursor)) throw std::runtime_error("failed to write output line"); *cursor++ = '\n'; - out.write(buffer.data(), std::distance(buffer.data(), cursor)); + out.write(std::cbegin(buffer), std::distance(std::begin(buffer), cursor)); } static void @@ -335,29 +350,12 @@ get_chroms_order(const std::vector &filenames, throw std::runtime_error("inconsistent order of chroms between files"); } -/* - This utility does two things, and they are grouped together here because of - how they are done, not because the uses are related. (1) merge-methcounts - can take a set of methcounts output files and combine them into one. There - are several reasons a user might want to do this. An example is when - technical replicates are performed, and analyzed separately to understand - technical variance (e.g., between sequencing runs or library preps). After - examining the technical variation, subsequent analyses might be best - conducted on all the data together. So all the methcounts files can be - combined into one using merge-methcounts. In this case, the coverage at any - site is the sum of the coverages in the original methcounts files, and the - methylation level at any site is the weighted mean. (2) merge-methcounts can - take a set of methcounts output files, and create a table that contains all - the same information. The table format is helpful if subsequent analyses are - best done using a data table, for example in R. When producing a tabular - format, merge-methcounts allows the user to select whether the desired - output is in counts or fractions. - */ int main_merge_methcounts(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) static constexpr auto buffer_size = 65536; try { - static const std::string description = "merge multiple methcounts files"; + static const std::string brief_description = + "merge multiple methcounts files"; std::string outfile; bool verbose = false; @@ -376,7 +374,7 @@ main_merge_methcounts(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) /****************** COMMAND LINE OPTIONS ********************/ OptionParser opt_parse(argv[0], // NOLINT(*-pointer-arithmetic) - description, ""); + brief_description, ""); opt_parse.add_opt("output", 'o', "output file name (default: stdout)", true, outfile); opt_parse.add_opt("header", 'H', "header to print (ignored for tabular)", @@ -526,11 +524,11 @@ main_merge_methcounts(int argc, char *argv[]) { // NOLINT(*-avoid-c-arrays) std::swap(outdated, sites_to_print); } } - catch (const std::runtime_error &e) { + catch (const std::exception &e) { std::cerr << e.what() << '\n'; return EXIT_FAILURE; } return EXIT_SUCCESS; } -// NOLINTEND(*-owning-memory,*-avoid-magic-numbers,*-narrowing-conversions,*-pointer-arithmetic) +// NOLINTEND(*-narrowing-conversions)