2121#include < sstream>
2222#include < stdexcept>
2323#include < regex>
24+ #include < charconv>
2425
2526#include " smithlab_utils.hpp"
2627
2728using std::string;
2829using std::runtime_error;
2930using std::regex_match;
31+ using std::from_chars;
32+ using std::find_first_of;
33+ using std::find_if;
34+ using std::cbegin;
35+ using std::cend;
36+ using std::end;
3037
3138MSite::MSite (const string &line) {
32- /* GS: this is faster but seems to be genenerating issues when
33- * compiled with clang
34- std::istringstream iss;
35- iss.rdbuf()->pubsetbuf(const_cast<char*>(line.c_str()), line.length());
36- */
37- std::istringstream iss (line);
38- string strand_tmp;
39- if (!(iss >> chrom >> pos >> strand_tmp >> context >> meth >> n_reads))
40- throw std::runtime_error (" bad line: \" " + line + " \" " );
41- strand = strand_tmp[0 ];
42- if (strand != ' -' && strand != ' +' )
43- throw std::runtime_error (" bad line: \" " + line + " \" " );
44- }
39+ constexpr auto is_sep = [](const char x) { return x == ' ' || x == ' \t ' ; };
40+ constexpr auto not_sep = [](const char x) { return x != ' ' && x != ' \t ' ; };
41+
42+ bool failed = false ;
43+
44+ const auto c = line.data ();
45+ const auto c_end = c + line.size ();
46+
47+ auto field_s = c;
48+ auto field_e = find_if (field_s + 1 , c_end, is_sep);
49+ if (field_e == c_end) failed = true ;
50+
51+ {
52+ const uint32_t d = std::distance (field_s, field_e);
53+ chrom = string{field_s, d};
54+ }
55+
56+ field_s = find_if (field_e + 1 , c_end, not_sep);
57+ field_e = find_if (field_s + 1 , c_end, is_sep);
58+ failed = failed || (field_e == c_end);
59+
60+ {
61+ const auto [ptr, ec] = from_chars (field_s, field_e, pos);
62+ failed = failed || (ptr == field_s);
63+ }
64+
65+ field_s = find_if (field_e + 1 , c_end, not_sep);
66+ field_e = find_if (field_s + 1 , c_end, is_sep);
67+ failed = failed || (field_e != field_s + 1 || field_e == c_end);
68+
69+ strand = *field_s;
70+ failed = failed || (strand != ' -' && strand != ' +' );
71+
72+ field_s = find_if (field_e + 1 , c_end, not_sep);
73+ field_e = find_if (field_s + 1 , c_end, is_sep);
74+ failed = failed || (field_e == c_end);
75+
76+ {
77+ const uint32_t d = std::distance (field_s, field_e);
78+ context = string{field_s, d};
79+ }
4580
81+ field_s = find_if (field_e + 1 , c_end, not_sep);
82+ field_e = find_if (field_s + 1 , c_end, is_sep);
83+ failed = failed || (field_e == c_end);
84+
85+ {
86+ const auto [ptr, ec] = from_chars (field_s, field_e, meth);
87+ failed = failed || (ptr == field_s);
88+ }
89+
90+ field_s = find_if (field_e + 1 , c_end, not_sep);
91+
92+ {
93+ const auto [ptr, ec] = from_chars (field_s, c_end, n_reads);
94+ failed = failed || (ptr != c_end);
95+ }
96+
97+ if (failed) {
98+ throw runtime_error (" bad count line: " + line);
99+ // ADS: the value below would work for a flag
100+ // pos = std::numeric_limits<decltype(pos)>::max();
101+ }
102+ }
46103
47104string
48105MSite::tostring () const {
@@ -56,15 +113,13 @@ MSite::tostring() const {
56113 return oss.str ();
57114}
58115
59-
60116size_t
61117distance (const MSite &a, const MSite &b) {
62118 return a.chrom == b.chrom ?
63119 std::max (a.pos , b.pos ) - std::min (a.pos , b.pos ) :
64120 std::numeric_limits<size_t >::max ();
65121}
66122
67-
68123using std::ifstream;
69124using std::ios_base;
70125
@@ -155,45 +210,43 @@ find_offset_for_msite(const std::string &chr,
155210 }
156211}
157212
158-
/// Check whether a line of text has the layout of a counts-file record.
///
/// The line is split on whitespace and must contain exactly six tokens:
///   1. chrom   - any non-empty token
///   2. pos     - an integer, not negative
///   3. strand  - "+" or "-"
///   4. context - any non-empty token
///   5. level   - a number in the closed interval [0, 1]
///   6. n_reads - an integer, not negative
///
/// @param line  the text to examine
/// @return true when all six tokens are present and valid and nothing
///         follows them; false otherwise
bool
is_msite_line(const string &line) {
  std::istringstream iss(line);

  string chrom;
  if (!(iss >> chrom)) return false;

  // The sign test must sit outside the negated extraction; inside it, a
  // successful read short-circuits the test and negatives are accepted.
  int64_t pos = 0;
  if (!(iss >> pos) || pos < 0) return false;

  string strand;
  if (!(iss >> strand) || (strand != "+" && strand != "-"))
    return false;

  string context;
  // ADS: below, allowing any context
  // std::regex pattern("^C[pHWX][GH]$");
  if (!(iss >> context)) return false;

  double level = 0.0;
  if (!(iss >> level) || level < 0.0 || level > 1.0) return false;

  int64_t n_reads = 0;
  if (!(iss >> n_reads) || n_reads < 0) return false;

  // any further token means this is not a six-column record
  string temp;
  return !(iss >> temp);
}
191245
192246bool
193247is_msite_file (const string &file) {
194248 bamxx::bgzf_file in (file, " r" );
195- if (!in)
196- throw runtime_error (" cannot open file: " + file);
249+ if (!in) throw runtime_error (" cannot open file: " + file);
197250
198251 string line;
199252 if (!getline (in, line)) return false ;
0 commit comments