Skip to content

Commit acfde8d

Browse files
chromosome_utils.cpp: consolidating the implementation of extract_regions_chrom_fasta for GenomicRegion and SimpleGenomicRegion. Removing variable length array by replacing with a string and reading raw data into the dot data member of that string. This function is not tested and is currently not used in abismal or dnmtools but might still be used in rmap
1 parent 07da0a4 commit acfde8d

File tree

1 file changed

+35
-65
lines changed

1 file changed

+35
-65
lines changed

chromosome_utils.cpp

Lines changed: 35 additions & 65 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ parse_region_name(string region_name,
6464

6565
static size_t
6666
adjust_start_pos(const size_t orig_start, const string &chrom_name) {
67-
static const double LINE_WIDTH = 50.0;
67+
static const double LINE_WIDTH = 50.0; // ADS: dangerous; often this is 80
6868
const size_t name_offset = chrom_name.length() + 2; // For the '>' and '\n';
6969
const size_t preceding_newlines =
7070
static_cast<size_t>(std::floor(orig_start / LINE_WIDTH));
@@ -76,92 +76,62 @@ static size_t
7676
adjust_region_size(const size_t orig_start,
7777
const string &chrom_name, // ADS: remove this soon
7878
const size_t orig_size) {
79-
static const double LINE_WIDTH = 50.0;
79+
static const double LINE_WIDTH = 50.0; // ADS: dangerous; often this is 80
8080
const size_t preceding_newlines_start =
8181
static_cast<size_t>(std::floor(orig_start / LINE_WIDTH));
8282
const size_t preceding_newlines_end =
8383
static_cast<size_t>(std::floor((orig_start + orig_size) / LINE_WIDTH));
8484
return (orig_size + (preceding_newlines_end - preceding_newlines_start));
8585
}
8686

87-
87+
template <class T>
8888
void
89-
extract_regions_chrom_fasta(const string &chrom_name,
90-
const string &filename,
91-
const vector<SimpleGenomicRegion> &regions,
92-
vector<string> &sequences) {
93-
94-
std::ifstream in(filename.c_str());
95-
for (vector<SimpleGenomicRegion>::const_iterator i(regions.begin());
96-
i != regions.end(); ++i) {
97-
98-
const size_t orig_start_pos = i->get_start();
99-
const size_t orig_end_pos = i->get_end();
100-
const size_t orig_region_size = orig_end_pos - orig_start_pos;
101-
102-
const size_t start_pos = adjust_start_pos(orig_start_pos, chrom_name);
103-
const size_t region_size = adjust_region_size(
104-
orig_start_pos, chrom_name, orig_region_size);
89+
extract_regions_chrom_fasta_impl(const string &chrom_name,
90+
const string &filename,
91+
const vector<T> &regions,
92+
vector<string> &sequences) {
93+
94+
std::ifstream in(filename);
95+
if (!in) throw runtime_error("failed to open file: " + filename);
96+
97+
for (auto &i : regions) {
98+
const auto orig_start_pos = i.get_start();
99+
const auto orig_end_pos = i.get_end();
100+
const auto orig_region_size = orig_end_pos - orig_start_pos;
101+
102+
const auto start_pos = adjust_start_pos(orig_start_pos, chrom_name);
103+
const auto region_size =
104+
adjust_region_size(orig_start_pos, chrom_name, orig_region_size);
105105
assert(start_pos >= 0);
106106

107107
in.seekg(start_pos);
108-
char buffer[region_size + 1];
109-
buffer[region_size] = '\0';
110-
in.read(buffer, region_size);
111-
112-
std::remove_if(buffer, buffer + region_size,
113-
[](const char x) {return x == '\n';});
114-
buffer[orig_region_size] = '\0';
115-
116-
sequences.push_back(buffer);
117-
std::transform(sequences.back().begin(), sequences.back().end(),
118-
sequences.back().begin(), [](const char x) {return toupper(x);});
119-
assert(i->get_width() == sequences.back().length());
108+
string buffer(region_size, '\0');
109+
in.read(buffer.data(), region_size);
110+
111+
buffer.erase(remove(begin(buffer), end(buffer), '\n'));
112+
transform(cbegin(buffer), cend(buffer), begin(buffer),
113+
[](const char x) {return toupper(x);});
114+
sequences.push_back(move(buffer));
115+
assert(i.get_width() == sequences.back().size());
120116
}
121-
in.close();
122117
}
123118

119+
void
120+
extract_regions_chrom_fasta(const string &chrom_name,
121+
const string &filename,
122+
const vector<SimpleGenomicRegion> &regions,
123+
vector<string> &sequences) {
124+
extract_regions_chrom_fasta_impl(chrom_name, filename, regions, sequences);
125+
}
124126

125127
void
126128
extract_regions_chrom_fasta(const string &chrom_name,
127129
const string &filename,
128130
const vector<GenomicRegion> &regions,
129131
vector<string> &sequences) {
130-
131-
std::ifstream in(filename.c_str());
132-
for (vector<GenomicRegion>::const_iterator i(regions.begin());
133-
i != regions.end(); ++i) {
134-
135-
const size_t orig_start_pos = i->get_start();
136-
const size_t orig_end_pos = i->get_end();
137-
const size_t orig_region_size = orig_end_pos - orig_start_pos;
138-
139-
const size_t start_pos = adjust_start_pos(orig_start_pos, chrom_name);
140-
const size_t region_size = adjust_region_size(
141-
orig_start_pos, chrom_name, orig_region_size);
142-
assert(start_pos >= 0);
143-
144-
in.seekg(start_pos);
145-
char buffer[region_size + 1];
146-
buffer[region_size] = '\0';
147-
in.read(buffer, region_size);
148-
149-
std::remove_if(buffer, buffer + region_size,
150-
[](const char x) {return x == '\n';});
151-
buffer[orig_region_size] = '\0';
152-
153-
sequences.push_back(buffer);
154-
std::transform(sequences.back().begin(), sequences.back().end(),
155-
sequences.back().begin(), [](const char x) {return toupper(x);});
156-
157-
if (i->neg_strand())
158-
revcomp_inplace(sequences.back());
159-
assert(i->get_width() == sequences.back().length());
160-
}
161-
in.close();
132+
extract_regions_chrom_fasta_impl(chrom_name, filename, regions, sequences);
162133
}
163134

164-
165135
void
166136
extract_regions_fasta(const string &dirname,
167137
const vector<GenomicRegion> &regions_in,

0 commit comments

Comments
 (0)