2222#include < iostream>
2323#include < fstream>
2424#include < stdexcept>
25-
25+ #include < unordered_set>
26+ #include < filesystem>
2627#include < bamxx.hpp>
2728
2829// from smithlab_cpp
@@ -36,56 +37,70 @@ using std::string;
3637using std::cout;
3738using std::cerr;
3839using std::endl;
40+ using std::unordered_set;
3941
4042using bamxx::bgzf_file;
4143
42- inline bool
44+ static inline bool
4345found_symmetric (const MSite &prev_cpg, const MSite &curr_cpg) {
4446 // assumes check for CpG already done
45- return (prev_cpg.strand == ' +' &&
46- curr_cpg.strand == ' -' &&
47+ return (prev_cpg.strand == ' +' && curr_cpg.strand == ' -' &&
4748 prev_cpg.pos + 1 == curr_cpg.pos );
4849}
4950
50- template <class T >
51- static void
51+ template <class T > static bool
5252process_sites (bgzf_file &in, T &out) {
53-
5453 MSite prev_site, curr_site;
5554 bool prev_is_cpg = false ;
5655 if (read_site (in, prev_site))
57- if (prev_site.is_cpg ())
58- prev_is_cpg = true ;
56+ if (prev_site.is_cpg ()) prev_is_cpg = true ;
57+
58+ unordered_set<string> chroms_seen;
59+ bool sites_are_sorted = true ;
5960
6061 while (read_site (in, curr_site)) {
6162 if (curr_site.is_cpg ()) {
63+ if (curr_site.chrom != prev_site.chrom ) {
64+ const auto chrom_itr = chroms_seen.find (curr_site.chrom );
65+ if (chrom_itr != cend (chroms_seen)) {
66+ sites_are_sorted = false ;
67+ break ;
68+ }
69+ else
70+ chroms_seen.insert (curr_site.chrom );
71+ }
72+ else if (curr_site.pos <= prev_site.pos ) {
73+ sites_are_sorted = false ;
74+ break ;
75+ }
6276 if (prev_is_cpg && found_symmetric (prev_site, curr_site)) {
6377 prev_site.add (curr_site);
6478 write_site (out, prev_site);
6579 }
6680 prev_is_cpg = true ;
6781 }
68- else prev_is_cpg = false ;
82+ else
83+ prev_is_cpg = false ;
6984 std::swap (prev_site, curr_site);
7085 }
86+
87+ return sites_are_sorted;
7188}
7289
7390int
7491main_symmetric_cpgs (int argc, const char **argv) {
75-
7692 try {
77-
7893 string outfile;
7994 // (not used) bool VERBOSE = false;
8095
8196 const string description =
8297 " Get CpG sites and make methylation levels symmetric." ;
8398
8499 /* ***************** COMMAND LINE OPTIONS ********************/
85- OptionParser opt_parse (strip_path (argv[0 ]),
86- description, " <methcounts-file>" );
87- opt_parse.add_opt (" output" , ' o' , " output file (default: stdout)" ,
88- false , outfile);
100+ OptionParser opt_parse (strip_path (argv[0 ]), description,
101+ " <methcounts-file>" );
102+ opt_parse.add_opt (" output" , ' o' , " output file (default: stdout)" , false ,
103+ outfile);
89104 // opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE);
90105 std::vector<string> leftover_args;
91106 opt_parse.parse (argc, argv, leftover_args);
@@ -112,18 +127,27 @@ main_symmetric_cpgs(int argc, const char **argv) {
112127 bgzf_file in (filename, " r" );
113128 if (!in) throw std::runtime_error (" could not open file: " + filename);
114129
130+ bool sites_are_sorted = true ;
115131 if (outfile.empty () || !has_gz_ext (outfile)) {
116132 std::ofstream of;
117133 if (!outfile.empty ()) of.open (outfile.c_str ());
118134 std::ostream out (outfile.empty () ? std::cout.rdbuf () : of.rdbuf ());
119- process_sites (in, out);
135+ sites_are_sorted = process_sites (in, out);
120136 }
121137 else {
122138 bgzf_file out (outfile, " w" );
123- process_sites (in, out);
139+ sites_are_sorted = process_sites (in, out);
140+ }
141+
142+ if (!sites_are_sorted) {
143+ cerr << " sites are not sorted in: " << filename << endl;
144+ namespace fs = std::filesystem;
145+ const fs::path outpath{outfile};
146+ if (fs::exists (outpath)) fs::remove (outpath);
147+ return EXIT_FAILURE;
124148 }
125149 }
126- catch (const std::runtime_error &e) {
150+ catch (const std::runtime_error &e) {
127151 cerr << e.what () << endl;
128152 return EXIT_FAILURE;
129153 }
0 commit comments