Skip to content

Commit ea885a2

Browse files
GenomicRegion: updates to accommodate gzip functionality and modernization
1 parent 168cb24 commit ea885a2

File tree

2 files changed

+139
-181
lines changed

2 files changed

+139
-181
lines changed

GenomicRegion.cpp

Lines changed: 88 additions & 143 deletions
Original file line numberDiff line numberDiff line change
@@ -63,28 +63,28 @@ SimpleGenomicRegion::retrieve_chrom(chrom_id_type i) {
6363
SimpleGenomicRegion::SimpleGenomicRegion(const GenomicRegion &r) :
6464
chrom(assign_chrom(r.get_chrom())), start(r.get_start()), end(r.get_end()) {}
6565

66-
SimpleGenomicRegion::SimpleGenomicRegion(string string_representation) {
67-
vector<string> parts = smithlab::split_whitespace_quoted(string_representation);
68-
69-
// make sure there is the minimal required info
70-
if (parts.size() < 3)
71-
throw GenomicRegionException("Invalid string representation: " +
72-
string_representation);
73-
// set the chromosome name
74-
chrom = assign_chrom(parts[0]);
75-
76-
// set the start position
77-
const int checkChromStart = atoi(parts[1].c_str());
78-
if (checkChromStart < 0)
79-
throw GenomicRegionException("Invalid start: " + parts[1]);
80-
else start = static_cast<size_t>(checkChromStart);
81-
82-
// set the end position
83-
const int checkChromEnd = atoi(parts[2].c_str());
84-
if (checkChromEnd < 0)
85-
throw GenomicRegionException("Invalid end: " + parts[2]);
86-
else end = static_cast<size_t>(checkChromEnd);
87-
}
66+
// SimpleGenomicRegion::SimpleGenomicRegion(string string_representation) {
67+
// vector<string> parts = smithlab::split_whitespace_quoted(string_representation);
68+
69+
// // make sure there is the minimal required info
70+
// if (parts.size() < 3)
71+
// throw runtime_error("Invalid string representation: " +
72+
// string_representation);
73+
// // set the chromosome name
74+
// chrom = assign_chrom(parts[0]);
75+
76+
// // set the start position
77+
// const int checkChromStart = atoi(parts[1].c_str());
78+
// if (checkChromStart < 0)
79+
// throw runtime_error("Invalid start: " + parts[1]);
80+
// else start = static_cast<size_t>(checkChromStart);
81+
82+
// // set the end position
83+
// const int checkChromEnd = atoi(parts[2].c_str());
84+
// if (checkChromEnd < 0)
85+
// throw runtime_error("Invalid end: " + parts[2]);
86+
// else end = static_cast<size_t>(checkChromEnd);
87+
// }
8888

8989
SimpleGenomicRegion::SimpleGenomicRegion(const char *s, const size_t len) {
9090
size_t i = 0;
@@ -116,33 +116,6 @@ SimpleGenomicRegion::tostring() const {
116116
}
117117

118118

119-
std::ostream&
120-
operator<<(std::ostream& s, const SimpleGenomicRegion& region) {
121-
return s << region.tostring();
122-
}
123-
124-
std::istream&
125-
operator>>(std::istream& s, SimpleGenomicRegion& region) {
126-
string chrom;
127-
size_t start = 0ul, end = 0ul;
128-
if (s >> chrom >> start >> end) {
129-
region = SimpleGenomicRegion(chrom, start, end);
130-
// else region = SimpleGenomicRegion();
131-
}
132-
else s.setstate(std::ios::badbit);
133-
134-
char c;
135-
while ((c = s.get()) != '\n' && s);
136-
if (c != '\n')
137-
s.setstate(std::ios::badbit);
138-
139-
if (s.eof())
140-
s.setstate(std::ios::badbit);
141-
142-
return s;
143-
}
144-
145-
146119
bool
147120
SimpleGenomicRegion::contains(const SimpleGenomicRegion& other) const {
148121
return chrom == other.chrom && start <= other.start && other.end <= end;
@@ -229,38 +202,37 @@ GenomicRegion::retrieve_chrom(chrom_id_type i) {
229202
}
230203

231204

232-
GenomicRegion::GenomicRegion(string string_representation) : strand('+') {
233-
vector<string> parts(smithlab::split_whitespace_quoted(string_representation));
205+
// GenomicRegion::GenomicRegion(string string_representation) : strand('+') {
206+
// vector<string> parts(smithlab::split_whitespace_quoted(string_representation));
234207

235-
// make sure there is the minimal required info
236-
if (parts.size() < 3)
237-
throw GenomicRegionException("Invalid string representation: " +
238-
string_representation);
239-
// set the chromosome name
240-
chrom = assign_chrom(parts[0]);
208+
// // make sure there is the minimal required info
209+
// if (parts.size() < 3)
210+
// throw runtime_error("Invalid string representation: " +
211+
// string_representation);
212+
// // set the chromosome name
213+
// chrom = assign_chrom(parts[0]);
241214

242-
// set the start position
243-
const int checkChromStart = atoi(parts[1].c_str());
244-
if (checkChromStart < 0)
245-
throw GenomicRegionException("Invalid start: " + parts[1]);
246-
else start = static_cast<size_t>(checkChromStart);
215+
// // set the start position
216+
// const int checkChromStart = atoi(parts[1].c_str());
217+
// if (checkChromStart < 0)
218+
// throw runtime_error("Invalid start: " + parts[1]);
219+
// else start = static_cast<size_t>(checkChromStart);
247220

248-
// set the end position
249-
const int checkChromEnd = atoi(parts[2].c_str());
250-
if (checkChromEnd < 0)
251-
throw GenomicRegionException("Invalid end: " + parts[2]);
252-
else end = static_cast<size_t>(checkChromEnd);
221+
// // set the end position
222+
// const int checkChromEnd = atoi(parts[2].c_str());
223+
// if (checkChromEnd < 0)
224+
// throw runtime_error("Invalid end: " + parts[2]);
225+
// else end = static_cast<size_t>(checkChromEnd);
253226

254-
if (parts.size() > 3)
255-
name = parts[3];
227+
// if (parts.size() > 3)
228+
// name = parts[3];
256229

257-
if (parts.size() > 4)
258-
score = atof(parts[4].c_str());
259-
260-
if (parts.size() > 5)
261-
strand = parts[5][0];
262-
}
230+
// if (parts.size() > 4)
231+
// score = atof(parts[4].c_str());
263232

233+
// if (parts.size() > 5)
234+
// strand = parts[5][0];
235+
// }
264236

265237
GenomicRegion::GenomicRegion(const char *s, const size_t len) {
266238
size_t i = 0;
@@ -316,33 +288,33 @@ GenomicRegion::tostring() const {
316288
return s.str();
317289
}
318290

319-
std::ostream&
320-
operator<<(std::ostream& s, const GenomicRegion& region) {
321-
return s << region.tostring();
322-
}
291+
// std::ostream&
292+
// operator<<(std::ostream& s, const GenomicRegion& region) {
293+
// return s << region.tostring();
294+
// }
323295

324-
std::istream&
325-
operator>>(std::istream& s, GenomicRegion& region) {
326-
string chrom, name;
327-
size_t start = 0ul, end = 0ul;
328-
double score = 0.0;
329-
char strand = '\0';
296+
// std::istream&
297+
// operator>>(std::istream& s, GenomicRegion& region) {
298+
// string chrom, name;
299+
// size_t start = 0ul, end = 0ul;
300+
// double score = 0.0;
301+
// char strand = '\0';
330302

331-
if (s >> chrom >> start >> end >> name >> score >> strand)
332-
region = GenomicRegion(chrom, start, end, name, score, strand);
333-
else region = GenomicRegion();
303+
// if (s >> chrom >> start >> end >> name >> score >> strand)
304+
// region = GenomicRegion(chrom, start, end, name, score, strand);
305+
// else region = GenomicRegion();
334306

335-
char c;
336-
while ((c = s.get()) != '\n' && s);
307+
// char c;
308+
// while ((c = s.get()) != '\n' && s);
337309

338-
if (c != '\n')
339-
s.setstate(std::ios::badbit);
310+
// if (c != '\n')
311+
// s.setstate(std::ios::badbit);
340312

341-
if (s.eof())
342-
s.setstate(std::ios::badbit);
313+
// if (s.eof())
314+
// s.setstate(std::ios::badbit);
343315

344-
return s;
345-
}
316+
// return s;
317+
// }
346318

347319
bool
348320
GenomicRegion::contains(const GenomicRegion& other) const {
@@ -468,7 +440,7 @@ is_header_line(const string& line) {
468440

469441

470442
static bool
471-
is_track_line(const char *line) {
443+
is_track_line(const string &line) {
472444
static const char *track_label = "track";
473445
static const size_t track_label_len = 5;
474446
for (size_t i = 0; i < track_label_len; ++i)
@@ -479,57 +451,30 @@ is_track_line(const char *line) {
479451

480452

481453
void
482-
ReadBEDFile(string filename, vector<GenomicRegion> &the_regions) {
483-
static const size_t buffer_size = 10000; // Magic
454+
ReadBEDFile(const string &filename, vector<GenomicRegion> &the_regions) {
484455

485-
// open and check the file
486-
std::ifstream in(filename.c_str());
456+
std::ifstream in(filename);
487457
if (!in)
488-
throw BEDFileException("cannot open input file " + filename);
489-
while (!in.eof()) {
490-
char buffer[buffer_size];
491-
in.getline(buffer, buffer_size);
492-
if (in.gcount() == buffer_size - 1)
493-
throw BEDFileException("Line too long in file: " + filename);
494-
if (!is_header_line(buffer) && !is_track_line(buffer)) {
495-
// const string line(buffer);
496-
the_regions.push_back(GenomicRegion(buffer));
497-
}
498-
in.peek();
499-
}
500-
in.close();
458+
throw runtime_error("cannot open input file " + filename);
459+
460+
string line;
461+
while (getline(in, line))
462+
if (!is_header_line(line) && !is_track_line(line))
463+
the_regions.push_back(GenomicRegion(line));
501464
}
502465

503466

504467
void
505-
ReadBEDFile(string filename, vector<SimpleGenomicRegion> &the_regions) {
506-
std::ifstream in(filename.c_str());
507-
if (!in.good())
508-
throw BEDFileException("cannot open input file " + filename);
509-
size_t begin_pos = in.tellg();
510-
in.seekg(0, std::ios_base::end);
511-
size_t end_pos = in.tellg();
512-
in.seekg(0, std::ios_base::beg);
513-
514-
size_t filesize = end_pos - begin_pos;
515-
char *buffer = new char[filesize + 1];
516-
517-
in.read(buffer, filesize);
518-
in.close();
519-
520-
the_regions.reserve(std::count(buffer, buffer + filesize, '\n'));
521-
522-
char *buffer_end = buffer + filesize;
523-
char *c = buffer;
524-
while (c != buffer_end) {
525-
char *next = std::find(c, buffer_end, '\n');
526-
if (next == c)
527-
return;
528-
if (*c != 't')
529-
the_regions.push_back(SimpleGenomicRegion(c, next - c));
530-
c = next + 1;
531-
}
532-
delete[] buffer;
468+
ReadBEDFile(const string &filename, vector<SimpleGenomicRegion> &the_regions) {
469+
470+
std::ifstream in(filename);
471+
if (!in)
472+
throw runtime_error("cannot open input file " + filename);
473+
474+
string line;
475+
while (getline(in, line))
476+
if (!is_header_line(line) && !is_track_line(line))
477+
the_regions.push_back(SimpleGenomicRegion(line));
533478
}
534479

535480

@@ -597,7 +542,7 @@ extract_regions_chrom_fasta(const string &chrom_name,
597542
if (!in)
598543
throw runtime_error("cannot read file: " + filename);
599544

600-
for (auto i = regions.begin(); i != regions.end(); ++i) {
545+
for (auto i = begin(regions); i != end(regions); ++i) {
601546

602547
const size_t orig_start_pos = i->get_start();
603548
const size_t orig_end_pos = i->get_end();

0 commit comments

Comments
 (0)