Skip to content

Commit 461ddc1

Browse files
Merge pull request #130 from smithlabcode/faster-msite-parsing
MSite.cpp: faster parsing of string format for MSite
2 parents f3d62d5 + 9eca725 commit 461ddc1

File tree

1 file changed

+84
-31
lines changed

1 file changed

+84
-31
lines changed

src/common/MSite.cpp

Lines changed: 84 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -21,28 +21,85 @@
2121
#include <sstream>
2222
#include <stdexcept>
2323
#include <regex>
24+
#include <charconv>
2425

2526
#include "smithlab_utils.hpp"
2627

2728
using std::string;
2829
using std::runtime_error;
2930
using std::regex_match;
31+
using std::from_chars;
32+
using std::find_first_of;
33+
using std::find_if;
34+
using std::cbegin;
35+
using std::cend;
36+
using std::end;
3037

3138
// Construct an MSite by parsing one text line with six fields separated
// by runs of spaces and/or tabs, in this order:
//
//   chrom  pos  strand  context  meth  n_reads
//
// The strand field must be exactly one character, '+' or '-'. The
// numeric fields (pos, meth, n_reads) are parsed with from_chars and
// must be consumed entirely; nothing may follow n_reads.
//
// Throws runtime_error("bad count line: " + line) if the line cannot be
// parsed. Parsing deliberately keeps going after the first problem and
// reports once at the end, so every step below must stay inside
// [line.data(), line.data() + line.size()] even for malformed input.
MSite::MSite(const string &line) {
  constexpr auto is_sep = [](const char x) { return x == ' ' || x == '\t'; };
  constexpr auto not_sep = [](const char x) { return x != ' ' && x != '\t'; };

  const auto c = line.data();
  const auto c_end = c + line.size();

  // Advance by one character but never beyond the end of the line. A
  // line that is empty or has too few fields leaves the cursor at
  // c_end; stepping past that would make the searches below start after
  // their own end and scan memory outside the string.
  const auto next = [c_end](const char *p) { return p == c_end ? p : p + 1; };

  bool failed = false;

  // chrom: from the start of the line up to the first separator
  auto field_s = c;
  auto field_e = find_if(next(field_s), c_end, is_sep);
  failed = failed || (field_e == c_end);
  chrom = string(field_s, field_e);

  // pos
  field_s = find_if(next(field_e), c_end, not_sep);
  field_e = find_if(next(field_s), c_end, is_sep);
  failed = failed || (field_e == c_end);
  {
    const auto [ptr, ec] = from_chars(field_s, field_e, pos);
    // ec reports empty, non-numeric and out-of-range input (in which
    // case pos is left untouched); ptr != field_e means the field has
    // trailing characters that are not part of the number
    failed = failed || (ec != std::errc{}) || (ptr != field_e);
  }

  // strand: exactly one character, either '+' or '-'
  field_s = find_if(next(field_e), c_end, not_sep);
  field_e = find_if(next(field_s), c_end, is_sep);
  failed = failed || (std::distance(field_s, field_e) != 1) ||
           (field_e == c_end);
  if (field_s != c_end) strand = *field_s;
  // when field_s == c_end the length check above has already set
  // 'failed', so the short-circuit keeps 'strand' from being inspected
  failed = failed || (strand != '-' && strand != '+');

  // context: copied as-is, any non-empty token is accepted
  field_s = find_if(next(field_e), c_end, not_sep);
  field_e = find_if(next(field_s), c_end, is_sep);
  failed = failed || (field_e == c_end);
  context = string(field_s, field_e);

  // meth
  field_s = find_if(next(field_e), c_end, not_sep);
  field_e = find_if(next(field_s), c_end, is_sep);
  failed = failed || (field_e == c_end);
  {
    const auto [ptr, ec] = from_chars(field_s, field_e, meth);
    failed = failed || (ec != std::errc{}) || (ptr != field_e);
  }

  // n_reads: last field, must extend to the very end of the line
  field_s = find_if(next(field_e), c_end, not_sep);
  {
    const auto [ptr, ec] = from_chars(field_s, c_end, n_reads);
    // the ec test also catches a missing final field, for which
    // ptr == field_s == c_end and the ptr test alone would pass
    failed = failed || (ec != std::errc{}) || (ptr != c_end);
  }

  if (failed) {
    throw runtime_error("bad count line: " + line);
    // ADS: the value below would work for a flag
    // pos = std::numeric_limits<decltype(pos)>::max();
  }
}
46103

47104
string
48105
MSite::tostring() const {
@@ -56,15 +113,13 @@ MSite::tostring() const {
56113
return oss.str();
57114
}
58115

59-
60116
size_t
61117
distance(const MSite &a, const MSite &b) {
62118
return a.chrom == b.chrom ?
63119
std::max(a.pos, b.pos) - std::min(a.pos, b.pos) :
64120
std::numeric_limits<size_t>::max();
65121
}
66122

67-
68123
using std::ifstream;
69124
using std::ios_base;
70125

@@ -155,45 +210,43 @@ find_offset_for_msite(const std::string &chr,
155210
}
156211
}
157212

158-
159213
// Decide whether a text line looks like a valid MSite (counts) record.
//
// The line must hold exactly six whitespace-separated fields:
//   chrom    any token
//   pos      integer, not negative
//   strand   the single character "+" or "-"
//   context  any token (the context pattern is intentionally not checked)
//   level    floating point value within [0, 1]
//   n_reads  integer, not negative
//
// Returns true only if every field is present and valid and nothing
// follows the sixth field.
bool
is_msite_line(const std::string &line) {
  std::istringstream iss(line);

  std::string chrom;
  if (!(iss >> chrom)) return false;

  // The sign test must sit outside the negated extraction: written as
  // !(iss >> pos || pos < 0) it is skipped whenever the read succeeds,
  // so negative values would be accepted.
  int64_t pos = 0;
  if (!(iss >> pos) || pos < 0) return false;

  std::string strand;
  if (!(iss >> strand) || (strand.size() != 1) ||
      ((strand != "+") && (strand != "-")))
    return false;

  std::string context;
  // ADS: below, allowing any context
  // std::regex pattern("^C[pHWX][GH]$");
  if (!(iss >> context)) return false;

  double level = 0.0;
  if (!(iss >> level) || level < 0.0 || level > 1.0) return false;

  int64_t n_reads = 0;
  if (!(iss >> n_reads) || n_reads < 0) return false;

  // any further token means the line has too many fields
  std::string temp;
  return !(iss >> temp);
}
191245

192246
bool
193247
is_msite_file(const string &file) {
194248
bamxx::bgzf_file in(file, "r");
195-
if (!in)
196-
throw runtime_error("cannot open file: " + file);
249+
if (!in) throw runtime_error("cannot open file: " + file);
197250

198251
string line;
199252
if (!getline(in, line)) return false;

0 commit comments

Comments
 (0)