Skip to content

Commit 98d6bec

Browse files
states: allowing output in gzip format
1 parent d0ceb1f commit 98d6bec

File tree

1 file changed

+99
-107
lines changed

1 file changed

+99
-107
lines changed

src/analysis/methstates.cpp

Lines changed: 99 additions & 107 deletions
Original file line numberDiff line numberDiff line change
@@ -17,110 +17,109 @@
1717
* General Public License for more details.
1818
*/
1919

20-
#include <string>
21-
#include <vector>
22-
#include <iostream>
2320
#include <algorithm>
21+
#include <iostream>
2422
#include <numeric>
2523
#include <stdexcept>
24+
#include <string>
2625
#include <unordered_set>
26+
#include <vector>
2727

2828
#include "OptionParser.hpp"
29-
#include "smithlab_utils.hpp"
30-
#include "smithlab_os.hpp"
31-
#include "htslib_wrapper.hpp"
32-
#include "sam_record.hpp"
29+
#include "bam_record_utils.hpp"
3330
#include "cigar_utils.hpp"
3431
#include "dnmt_error.hpp"
32+
#include "htslib_wrapper.hpp"
33+
#include "sam_record.hpp"
34+
#include "smithlab_os.hpp"
35+
#include "smithlab_utils.hpp"
3536

36-
#include "bam_record_utils.hpp"
37-
38-
using std::string;
39-
using std::vector;
40-
using std::cout;
4137
using std::cerr;
38+
using std::cout;
4239
using std::endl;
40+
using std::lower_bound;
41+
using std::string;
4342
using std::unordered_map;
4443
using std::unordered_set;
45-
using std::runtime_error;
46-
using std::lower_bound;
44+
using std::vector;
4745

4846
using bamxx::bam_rec;
4947

5048
static const char b2c[] = "TNGNNNCNNNNNNNNNNNNA";
5149

50+
struct quick_buf : public std::ostringstream,
51+
public std::basic_stringbuf<char> {
52+
// ADS: consider putting this class in a header somewhere
53+
quick_buf() { static_cast<std::basic_ios<char> &>(*this).rdbuf(this); }
54+
55+
void clear() { setp(pbase(), pbase()); }
56+
57+
char const *c_str() {
58+
*pptr() = '\0';
59+
return pbase();
60+
}
61+
};
62+
5263
template<class BidirIt, class OutputIt>
5364
// constexpr // since C++20
54-
OutputIt revcomp_copy(BidirIt first, BidirIt last, OutputIt d_first) {
55-
for (; first != last; ++d_first)
56-
*d_first = b2c[*(--last) - 'A'];
65+
OutputIt
66+
revcomp_copy(BidirIt first, BidirIt last, OutputIt d_first) {
67+
for (; first != last; ++d_first) *d_first = b2c[*(--last) - 'A'];
5768
return d_first;
5869
}
5970

6071
inline static bool
61-
is_cpg(const string &s, const size_t idx) {
72+
is_cpg(const string &s, const uint64_t idx) {
6273
return s[idx] == 'C' && s[idx + 1] == 'G';
6374
}
6475

6576
static void
66-
collect_cpgs(const string &s,
67-
vector<size_t> &cpgs) {
77+
collect_cpgs(const string &s, vector<uint64_t> &cpgs) {
6878
cpgs.clear();
69-
const size_t lim = s.length() - 1;
70-
for (size_t i = 0; i < lim; ++i) {
71-
if (is_cpg(s, i))
72-
cpgs.push_back(i);
73-
}
79+
const uint64_t lim = std::size(s) - 1;
80+
for (auto i = 0u; i < lim; ++i)
81+
if (is_cpg(s, i)) cpgs.push_back(i);
7482
}
7583

7684
static bool
77-
convert_meth_states_pos(const vector<size_t> &cpgs,
78-
const bamxx::bam_header &hdr,
79-
const bam_rec &aln,
80-
size_t &first_cpg_index, string &states) {
81-
85+
convert_meth_states_pos(const vector<uint64_t> &cpgs,
86+
const bamxx::bam_header &hdr, const bam_rec &aln,
87+
uint64_t &first_cpg_index, string &states) {
8288
states.clear();
8389

84-
const size_t seq_start = get_pos(aln);
85-
const size_t width = rlen_from_cigar(aln);
86-
const size_t seq_end = seq_start + width;
90+
const uint64_t seq_start = get_pos(aln);
91+
const uint64_t width = rlen_from_cigar(aln);
92+
const uint64_t seq_end = seq_start + width;
8793

8894
string seq_str;
8995
get_seq_str(aln, seq_str);
9096
apply_cigar(aln, seq_str, 'N');
9197

92-
if (seq_str.size() != width) {
93-
throw runtime_error("bad sam record format: " + to_string(hdr, aln));
94-
}
98+
if (std::size(seq_str) != width)
99+
throw dnmt_error("bad sam record format: " + to_string(hdr, aln));
95100

96101
// get the first cpg site equal to or large than seq_start
97102
auto cpg_itr = lower_bound(begin(cpgs), end(cpgs), seq_start);
98103
auto first_cpg_itr = end(cpgs);
99104

100-
if (cpg_itr == end(cpgs)) {
101-
return false;
102-
} else {
103-
for (; cpg_itr != end(cpgs) && *cpg_itr < seq_end; cpg_itr++) {
104-
const char x = seq_str[*cpg_itr - seq_start];
105-
states += (x == 'C') ? 'C' : ((x == 'T') ? 'T' : 'N');
106-
if (first_cpg_itr == end(cpgs))
107-
first_cpg_itr = cpg_itr;
108-
}
105+
if (cpg_itr == end(cpgs)) return false;
106+
107+
for (; cpg_itr != end(cpgs) && *cpg_itr < seq_end; cpg_itr++) {
108+
const char x = seq_str[*cpg_itr - seq_start];
109+
states += (x == 'T') ? 'T' : ((x == 'C') ? 'C' : 'N');
110+
if (first_cpg_itr == end(cpgs)) first_cpg_itr = cpg_itr;
109111
}
110112

111-
if (first_cpg_itr != end(cpgs)) {
113+
if (first_cpg_itr != end(cpgs))
112114
first_cpg_index = distance(begin(cpgs), first_cpg_itr);
113-
}
114115

115116
return states.find_first_of("CT") != string::npos;
116117
}
117118

118-
119119
static bool
120-
convert_meth_states_neg(const vector<size_t> &cpgs,
121-
const bamxx::bam_header &hdr,
122-
const bam_rec &aln,
123-
size_t &first_cpg_index, string &states) {
120+
convert_meth_states_neg(const vector<uint64_t> &cpgs,
121+
const bamxx::bam_header &hdr, const bam_rec &aln,
122+
uint64_t &first_cpg_index, string &states) {
124123
/* ADS: the "revcomp" on the read sequence is needed for the cigar
125124
to be applied, since the cigar is relative to the genome
126125
coordinates and not the read's sequence. But the read sequence
@@ -131,9 +130,9 @@ convert_meth_states_neg(const vector<size_t> &cpgs,
131130

132131
states.clear();
133132

134-
const size_t seq_start = get_pos(aln);
135-
const size_t width = rlen_from_cigar(aln);
136-
const size_t seq_end = seq_start + width;
133+
const uint64_t seq_start = get_pos(aln);
134+
const uint64_t width = rlen_from_cigar(aln);
135+
const uint64_t seq_end = seq_start + width;
137136

138137
string orig_seq;
139138
get_seq_str(aln, orig_seq);
@@ -143,25 +142,22 @@ convert_meth_states_neg(const vector<size_t> &cpgs,
143142
revcomp_copy(begin(orig_seq), end(orig_seq), begin(seq_str));
144143
apply_cigar(aln, seq_str, 'N');
145144

146-
if (seq_str.size() != width) {
147-
throw runtime_error("bad sam record format: " + to_string(hdr, aln));
148-
}
145+
if (seq_str.size() != width)
146+
throw dnmt_error("bad sam record format: " + to_string(hdr, aln));
149147

150148
// get the first cpg site equal to or large than seq_start - 1
151149
// the -1 is because we look for G in the read corresponding to a
152150
// CpG in chromosome, which are indexed in cpgs based on the position of C
153-
auto cpg_itr = lower_bound(begin(cpgs), end(cpgs),
154-
seq_start > 0 ? seq_start - 1 : 0);
151+
auto cpg_itr =
152+
lower_bound(begin(cpgs), end(cpgs), seq_start > 0 ? seq_start - 1 : 0);
155153
auto first_cpg_itr = end(cpgs);
156154

157-
if (cpg_itr == end(cpgs)) {
158-
return false;
159-
} else {
155+
if (cpg_itr == end(cpgs)) { return false; }
156+
else {
160157
for (; cpg_itr != end(cpgs) && *cpg_itr < seq_end - 1; cpg_itr++) {
161158
const char x = seq_str[*cpg_itr - seq_start + 1];
162159
states += (x == 'G') ? 'C' : ((x == 'A') ? 'T' : 'N');
163-
if (first_cpg_itr == end(cpgs))
164-
first_cpg_itr = cpg_itr;
160+
if (first_cpg_itr == end(cpgs)) first_cpg_itr = cpg_itr;
165161
}
166162
}
167163

@@ -173,26 +169,19 @@ convert_meth_states_neg(const vector<size_t> &cpgs,
173169
}
174170

175171
static void
176-
get_chrom(const string &chrom_name,
177-
const vector<string> &all_chroms,
178-
const unordered_map<string, size_t> &chrom_lookup,
179-
string &chrom) {
180-
172+
get_chrom(const string &chrom_name, const vector<string> &all_chroms,
173+
const unordered_map<string, uint64_t> &chrom_lookup, string &chrom) {
181174
auto the_chrom = chrom_lookup.find(chrom_name);
182175
if (the_chrom == end(chrom_lookup))
183-
throw runtime_error("could not find chrom: " + chrom_name);
176+
throw dnmt_error("could not find chrom: " + chrom_name);
184177

185178
chrom = all_chroms[the_chrom->second];
186-
if (chrom.empty())
187-
throw runtime_error("problem with chrom: " + chrom_name);
179+
if (chrom.empty()) throw dnmt_error("problem with chrom: " + chrom_name);
188180
}
189181

190-
191182
int
192183
main_methstates(int argc, const char **argv) {
193-
194184
try {
195-
196185
const string description =
197186
"Convert mapped reads in SAM format into a format that indicates binary \
198187
sequences of methylation states in each read, indexed by the identity \
@@ -203,6 +192,7 @@ main_methstates(int argc, const char **argv) {
203192
sequences are loaded at once.";
204193

205194
bool VERBOSE = false;
195+
bool compress_output = false;
206196

207197
string chrom_file;
208198
string outfile;
@@ -211,10 +201,11 @@ main_methstates(int argc, const char **argv) {
211201
/****************** COMMAND LINE OPTIONS ********************/
212202
OptionParser opt_parse(argv[0], description, "<sam-file>");
213203
opt_parse.add_opt("output", 'o', "output file name", false, outfile);
214-
opt_parse.add_opt("chrom", 'c', "fasta format reference genome file",
215-
true , chrom_file);
216-
opt_parse.add_opt("threads", 't', "threads to use for reading input",
217-
false, n_threads);
204+
opt_parse.add_opt("chrom", 'c', "fasta format reference genome file", true,
205+
chrom_file);
206+
opt_parse.add_opt("threads", 't', "threads to use for reading input", false,
207+
n_threads);
208+
opt_parse.add_opt("zip", 'z', "output gzip format", false, compress_output);
218209
opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE);
219210

220211
vector<string> leftover_args;
@@ -239,24 +230,22 @@ main_methstates(int argc, const char **argv) {
239230
const string mapped_reads_file = leftover_args.front();
240231
/****************** END COMMAND LINE OPTIONS *****************/
241232

242-
if (n_threads < 0)
243-
throw dnmt_error("thread count cannot be negative");
233+
if (n_threads < 0) throw dnmt_error("thread count cannot be negative");
244234

245235
/* first load in all the chromosome sequences and names, and make
246236
a map from chromosome name to the location of the chromosome
247237
itself */
248238
vector<string> all_chroms, chrom_names;
249239
read_fasta_file_short_names(chrom_file, chrom_names, all_chroms);
250-
for (auto &&i: all_chroms)
240+
for (auto &&i : all_chroms)
251241
transform(begin(i), end(i), begin(i),
252242
[](const char c) { return std::toupper(c); });
253243

254-
unordered_map<string, size_t> chrom_lookup;
255-
for (size_t i = 0; i < chrom_names.size(); ++i)
244+
unordered_map<string, uint64_t> chrom_lookup;
245+
for (uint64_t i = 0; i < chrom_names.size(); ++i)
256246
chrom_lookup[chrom_names[i]] = i;
257247

258-
if (VERBOSE)
259-
cerr << "n_chroms: " << all_chroms.size() << endl;
248+
if (VERBOSE) cerr << "n_chroms: " << all_chroms.size() << endl;
260249

261250
bamxx::bam_tpool tp(n_threads); // declared first; destroyed last
262251

@@ -266,53 +255,56 @@ main_methstates(int argc, const char **argv) {
266255
if (!hdr) throw dnmt_error("cannot read heade" + mapped_reads_file);
267256

268257
/* set the threads for the input file decompression */
269-
if (n_threads > 1)
270-
tp.set_io(in);
258+
if (n_threads > 1) tp.set_io(in);
271259

272-
std::ofstream of;
273-
if (!outfile.empty()) of.open(outfile.c_str());
274-
std::ostream out(outfile.empty() ? cout.rdbuf() : of.rdbuf());
260+
// open the output file
261+
const string output_mode = compress_output ? "w" : "wu";
262+
bamxx::bgzf_file out(outfile, output_mode);
263+
if (!out) throw dnmt_error("error opening output file: " + outfile);
275264

276-
// for the current chrom, this maps cpg index to cpg position in
277-
// the chrom
278-
//unordered_map<size_t, size_t> cpgs;
279-
vector<size_t> cpgs;
265+
vector<uint64_t> cpgs;
280266

281267
unordered_set<string> chroms_seen;
282268
string chrom_name;
283269
string chrom;
284270

271+
quick_buf buf;
272+
285273
// iterate over records/reads in the SAM file, sequentially
286274
// processing each before considering the next
287275
bam_rec aln;
288276
while (in.read(hdr, aln)) {
289-
290277
// get the correct chrom if it has changed
291278
if (string(sam_hdr_tid2name(hdr, aln)) != chrom_name) {
292279
chrom_name = sam_hdr_tid2name(hdr, aln);
293280

294281
// make sure all reads from same chrom are contiguous in the file
295282
if (chroms_seen.find(chrom_name) != end(chroms_seen))
296-
throw runtime_error("chroms out of order (check SAM file sorted)");
283+
throw dnmt_error("chroms out of order (check SAM file sorted)");
297284

298-
if (VERBOSE)
299-
cerr << "processing " << chrom_name << endl;
285+
if (VERBOSE) cerr << "processing " << chrom_name << endl;
300286

301287
get_chrom(chrom_name, all_chroms, chrom_lookup, chrom);
302288
collect_cpgs(chrom, cpgs);
303289
}
304290

305-
size_t first_cpg_index = std::numeric_limits<size_t>::max();
291+
uint64_t first_cpg_index = std::numeric_limits<uint64_t>::max();
306292
string seq;
307293

308-
const bool has_cpgs = bam_is_rev(aln) ?
309-
convert_meth_states_neg(cpgs, hdr, aln, first_cpg_index, seq) :
310-
convert_meth_states_pos(cpgs, hdr, aln, first_cpg_index, seq);
294+
const bool has_cpgs =
295+
bam_is_rev(aln)
296+
? convert_meth_states_neg(cpgs, hdr, aln, first_cpg_index, seq)
297+
: convert_meth_states_pos(cpgs, hdr, aln, first_cpg_index, seq);
311298

312-
if (has_cpgs)
313-
out << sam_hdr_tid2name(hdr, aln) << '\t'
314-
<< first_cpg_index << '\t'
299+
if (has_cpgs) {
300+
buf.clear();
301+
buf << sam_hdr_tid2name(hdr, aln) << '\t' << first_cpg_index << '\t'
315302
<< seq << '\n';
303+
if (!out.write(buf.c_str(), buf.tellp())) {
304+
cerr << "failure writing output" << endl;
305+
return EXIT_FAILURE;
306+
}
307+
}
316308
}
317309
}
318310
catch (const std::exception &e) {

0 commit comments

Comments
 (0)