Skip to content

Commit f80f5f6

Browse files
Merge branch 'master' of github.com:smithlabcode/smithlab_cpp
2 parents 4f381dc + acfde8d commit f80f5f6

File tree

4 files changed

+62
-76
lines changed

4 files changed

+62
-76
lines changed

chromosome_utils.cpp

Lines changed: 35 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ parse_region_name(string region_name,
6464

6565
static size_t
6666
adjust_start_pos(const size_t orig_start, const string &chrom_name) {
67-
static const double LINE_WIDTH = 50.0;
67+
static const double LINE_WIDTH = 50.0; // ADS: dangerous; often this is 80
6868
const size_t name_offset = chrom_name.length() + 2; // For the '>' and '\n';
6969
const size_t preceding_newlines =
7070
static_cast<size_t>(std::floor(orig_start / LINE_WIDTH));
@@ -76,92 +76,62 @@ static size_t
7676
adjust_region_size(const size_t orig_start,
7777
const string &chrom_name, // ADS: remove this soon
7878
const size_t orig_size) {
79-
static const double LINE_WIDTH = 50.0;
79+
static const double LINE_WIDTH = 50.0; // ADS: dangerous; often this is 80
8080
const size_t preceding_newlines_start =
8181
static_cast<size_t>(std::floor(orig_start / LINE_WIDTH));
8282
const size_t preceding_newlines_end =
8383
static_cast<size_t>(std::floor((orig_start + orig_size) / LINE_WIDTH));
8484
return (orig_size + (preceding_newlines_end - preceding_newlines_start));
8585
}
8686

87-
87+
template <class T>
8888
void
89-
extract_regions_chrom_fasta(const string &chrom_name,
90-
const string &filename,
91-
const vector<SimpleGenomicRegion> &regions,
92-
vector<string> &sequences) {
93-
94-
std::ifstream in(filename.c_str());
95-
for (vector<SimpleGenomicRegion>::const_iterator i(regions.begin());
96-
i != regions.end(); ++i) {
97-
98-
const size_t orig_start_pos = i->get_start();
99-
const size_t orig_end_pos = i->get_end();
100-
const size_t orig_region_size = orig_end_pos - orig_start_pos;
101-
102-
const size_t start_pos = adjust_start_pos(orig_start_pos, chrom_name);
103-
const size_t region_size = adjust_region_size(
104-
orig_start_pos, chrom_name, orig_region_size);
89+
extract_regions_chrom_fasta_impl(const string &chrom_name,
90+
const string &filename,
91+
const vector<T> &regions,
92+
vector<string> &sequences) {
93+
94+
std::ifstream in(filename);
95+
if (!in) throw runtime_error("failed to open file: " + filename);
96+
97+
for (auto &i : regions) {
98+
const auto orig_start_pos = i.get_start();
99+
const auto orig_end_pos = i.get_end();
100+
const auto orig_region_size = orig_end_pos - orig_start_pos;
101+
102+
const auto start_pos = adjust_start_pos(orig_start_pos, chrom_name);
103+
const auto region_size =
104+
adjust_region_size(orig_start_pos, chrom_name, orig_region_size);
105105
assert(start_pos >= 0);
106106

107107
in.seekg(start_pos);
108-
char buffer[region_size + 1];
109-
buffer[region_size] = '\0';
110-
in.read(buffer, region_size);
111-
112-
std::remove_if(buffer, buffer + region_size,
113-
[](const char x) {return x == '\n';});
114-
buffer[orig_region_size] = '\0';
115-
116-
sequences.push_back(buffer);
117-
std::transform(sequences.back().begin(), sequences.back().end(),
118-
sequences.back().begin(), [](const char x) {return toupper(x);});
119-
assert(i->get_width() == sequences.back().length());
108+
string buffer(region_size, '\0');
109+
in.read(buffer.data(), region_size);
110+
111+
buffer.erase(remove(begin(buffer), end(buffer), '\n'));
112+
transform(cbegin(buffer), cend(buffer), begin(buffer),
113+
[](const char x) {return toupper(x);});
114+
sequences.push_back(move(buffer));
115+
assert(i.get_width() == sequences.back().size());
120116
}
121-
in.close();
122117
}
123118

119+
void
120+
extract_regions_chrom_fasta(const string &chrom_name,
121+
const string &filename,
122+
const vector<SimpleGenomicRegion> &regions,
123+
vector<string> &sequences) {
124+
extract_regions_chrom_fasta_impl(chrom_name, filename, regions, sequences);
125+
}
124126

125127
void
126128
extract_regions_chrom_fasta(const string &chrom_name,
127129
const string &filename,
128130
const vector<GenomicRegion> &regions,
129131
vector<string> &sequences) {
130-
131-
std::ifstream in(filename.c_str());
132-
for (vector<GenomicRegion>::const_iterator i(regions.begin());
133-
i != regions.end(); ++i) {
134-
135-
const size_t orig_start_pos = i->get_start();
136-
const size_t orig_end_pos = i->get_end();
137-
const size_t orig_region_size = orig_end_pos - orig_start_pos;
138-
139-
const size_t start_pos = adjust_start_pos(orig_start_pos, chrom_name);
140-
const size_t region_size = adjust_region_size(
141-
orig_start_pos, chrom_name, orig_region_size);
142-
assert(start_pos >= 0);
143-
144-
in.seekg(start_pos);
145-
char buffer[region_size + 1];
146-
buffer[region_size] = '\0';
147-
in.read(buffer, region_size);
148-
149-
std::remove_if(buffer, buffer + region_size,
150-
[](const char x) {return x == '\n';});
151-
buffer[orig_region_size] = '\0';
152-
153-
sequences.push_back(buffer);
154-
std::transform(sequences.back().begin(), sequences.back().end(),
155-
sequences.back().begin(), [](const char x) {return toupper(x);});
156-
157-
if (i->neg_strand())
158-
revcomp_inplace(sequences.back());
159-
assert(i->get_width() == sequences.back().length());
160-
}
161-
in.close();
132+
extract_regions_chrom_fasta_impl(chrom_name, filename, regions, sequences);
162133
}
163134

164-
165135
void
166136
extract_regions_fasta(const string &dirname,
167137
const vector<GenomicRegion> &regions_in,

smithlab_os.cpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,11 @@ read_fastq_file(const char *filename,
232232
vector<char> scr;
233233
vector<vector<char> > scrs;
234234
bool first_line = true;
235+
// ADS: preprocessor stuff below is because is_sequence_line is only
236+
// used with asserts; consider removing variable
237+
#ifndef NDEBUG
235238
bool is_sequence_line = false;
239+
#endif
236240
size_t line_count = 0;
237241
while (!in.eof()) {
238242
char buffer[INPUT_BUFFER_SIZE + 1];
@@ -261,12 +265,16 @@ read_fastq_file(const char *filename,
261265
name = name.substr(name.find_first_not_of("@ "));
262266
s = "";
263267
scr.clear();
268+
#ifndef NDEBUG
264269
is_sequence_line = true;
270+
#endif
265271
}
266272
if (is_fastq_sequence_line(line_count)) {
267273
assert(is_sequence_line);
268274
s += buffer;
275+
#ifndef NDEBUG
269276
is_sequence_line = false;
277+
#endif
270278
}
271279
if (is_fastq_score_name_line(line_count)) {
272280
if (buffer[0] != '+')
@@ -319,7 +327,11 @@ void read_fastq_file(const char *filename, vector<string> &names,
319327

320328
string s, name, scr;
321329
bool first_line = true;
330+
// ADS: preprocessor stuff below is because is_sequence_line is only
331+
// used with asserts; consider removing variable
332+
#ifndef NDEBUG
322333
bool is_sequence_line = false;
334+
#endif
323335
size_t line_count = 0;
324336
while (!in.eof()) {
325337
char buffer[INPUT_BUFFER_SIZE + 1];
@@ -346,12 +358,16 @@ void read_fastq_file(const char *filename, vector<string> &names,
346358
first_line = false;
347359
name = buffer;
348360
name = name.substr(name.find_first_not_of("@ "));
361+
#ifndef NDEBUG
349362
is_sequence_line = true;
363+
#endif
350364
}
351365
if (is_fastq_sequence_line(line_count)) {
352366
assert(is_sequence_line);
353367
s = buffer;
368+
#ifndef NDEBUG
354369
is_sequence_line = false;
370+
#endif
355371
}
356372
if (is_fastq_score_name_line(line_count)) {
357373
if (buffer[0] != '+')

smithlab_utils.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,7 @@ smithlab::squash(const std::vector<std::string>& v) {
226226

227227
void
228228
ProgressBar::report(std::ostream &out, const size_t i) {
229-
prev = std::round(std::min(i, total)/total);
229+
prev = std::round((100.0*std::min(i, total))/total);
230230
const size_t x =
231231
std::min(static_cast<size_t>(bar_width*(prev/100.0)), bar_width);
232232
fill_n(begin(bar), x, '=');

smithlab_utils.hpp

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -380,13 +380,13 @@ kmer_counts(const std::vector<std::string> &seqs,
380380
counts.resize(nwords, 0);
381381
size_t total = 0;
382382
for (size_t i = 0; i < seqs.size(); ++i) {
383-
char seq[seqs[i].length() + 1];
384-
seq[seqs[i].length()] = '\0';
385-
copy(seqs[i].begin(), seqs[i].end(), seq);
383+
std::vector<char> seq(seqs[i].length() + 1, '\0');
384+
auto seq_data = seq.data();
385+
copy(cbegin(seqs[i]), cend(seqs[i]), seq_data);
386386
for (size_t j = 0; j < seqs[i].length() - k + 1; ++j)
387-
if (std::count_if(seq + j, seq + j + k, &valid_base) ==
387+
if (std::count_if(seq_data + j, seq_data + j + k, &valid_base) ==
388388
static_cast<int>(k)) {
389-
counts[mer2index(seq + j, k)]++;
389+
counts[mer2index(seq_data + j, k)]++;
390390
++total;
391391
}
392392
}
@@ -412,21 +412,21 @@ kmer_counts(const std::vector<std::string> &seqs,
412412
class ProgressBar {
413413
public:
414414
ProgressBar(const size_t x, const std::string message = "completion") :
415-
total(x/100.0), prev(0), mid_tag(message) {
415+
total(x), prev(0), mid_tag(message) {
416416
bar_width = max_bar_width - message.length() - 3 - 5;
417417
bar = std::string(bar_width, ' ');
418418
}
419419
bool time_to_report(const size_t i) const {
420-
return std::round(std::min(i, total)/total) > prev;
420+
return std::round((100.0*std::min(i, total))/total) > prev;
421421
}
422422
void
423423
report(std::ostream &out, const size_t i);
424424

425425
private:
426426

427-
size_t total{};
428-
size_t prev{};
429-
size_t bar_width{};
427+
size_t total;
428+
size_t prev;
429+
size_t bar_width;
430430
std::string left_tag = "\r[";
431431
std::string mid_tag;
432432
std::string bar;

0 commit comments

Comments
 (0)