1717 * General Public License for more details.
1818 */
1919
20- #include < string>
21- #include < vector>
22- #include < iostream>
2320#include < algorithm>
21+ #include < iostream>
2422#include < numeric>
2523#include < stdexcept>
24+ #include < string>
2625#include < unordered_set>
26+ #include < vector>
2727
2828#include " OptionParser.hpp"
29- #include " smithlab_utils.hpp"
30- #include " smithlab_os.hpp"
31- #include " htslib_wrapper.hpp"
32- #include " sam_record.hpp"
29+ #include " bam_record_utils.hpp"
3330#include " cigar_utils.hpp"
3431#include " dnmt_error.hpp"
32+ #include " htslib_wrapper.hpp"
33+ #include " sam_record.hpp"
34+ #include " smithlab_os.hpp"
35+ #include " smithlab_utils.hpp"
3536
36- #include " bam_record_utils.hpp"
37-
38- using std::string;
39- using std::vector;
40- using std::cout;
4137using std::cerr;
38+ using std::cout;
4239using std::endl;
40+ using std::lower_bound;
41+ using std::string;
4342using std::unordered_map;
4443using std::unordered_set;
45- using std::runtime_error;
46- using std::lower_bound;
44+ using std::vector;
4745
4846using bamxx::bam_rec;
4947
5048static const char b2c[] = " TNGNNNCNNNNNNNNNNNNA" ;
5149
50+ struct quick_buf : public std ::ostringstream,
51+ public std::basic_stringbuf<char > {
52+ // ADS: consider putting this class in a header somewhere
53+ quick_buf () { static_cast <std::basic_ios<char > &>(*this ).rdbuf (this ); }
54+
55+ void clear () { setp (pbase (), pbase ()); }
56+
57+ char const *c_str () {
58+ *pptr () = ' \0 ' ;
59+ return pbase ();
60+ }
61+ };
62+
5263template <class BidirIt , class OutputIt >
5364// constexpr // since C++20
54- OutputIt revcomp_copy (BidirIt first, BidirIt last, OutputIt d_first) {
55- for (; first != last; ++ d_first)
56- *d_first = b2c[*(--last) - ' A' ];
65+ OutputIt
66+ revcomp_copy (BidirIt first, BidirIt last, OutputIt d_first) {
67+ for (; first != last; ++d_first) *d_first = b2c[*(--last) - ' A' ];
5768 return d_first;
5869}
5970
6071inline static bool
61- is_cpg (const string &s, const size_t idx) {
72+ is_cpg (const string &s, const uint64_t idx) {
6273 return s[idx] == ' C' && s[idx + 1 ] == ' G' ;
6374}
6475
6576static void
66- collect_cpgs (const string &s,
67- vector<size_t > &cpgs) {
77+ collect_cpgs (const string &s, vector<uint64_t > &cpgs) {
6878 cpgs.clear ();
69- const size_t lim = s.length () - 1 ;
70- for (size_t i = 0 ; i < lim; ++i) {
71- if (is_cpg (s, i))
72- cpgs.push_back (i);
73- }
79+ const uint64_t lim = std::size (s) - 1 ;
80+ for (auto i = 0u ; i < lim; ++i)
81+ if (is_cpg (s, i)) cpgs.push_back (i);
7482}
7583
7684static bool
77- convert_meth_states_pos (const vector<size_t > &cpgs,
78- const bamxx::bam_header &hdr,
79- const bam_rec &aln,
80- size_t &first_cpg_index, string &states) {
81-
85+ convert_meth_states_pos (const vector<uint64_t > &cpgs,
86+ const bamxx::bam_header &hdr, const bam_rec &aln,
87+ uint64_t &first_cpg_index, string &states) {
8288 states.clear ();
8389
84- const size_t seq_start = get_pos (aln);
85- const size_t width = rlen_from_cigar (aln);
86- const size_t seq_end = seq_start + width;
90+ const uint64_t seq_start = get_pos (aln);
91+ const uint64_t width = rlen_from_cigar (aln);
92+ const uint64_t seq_end = seq_start + width;
8793
8894 string seq_str;
8995 get_seq_str (aln, seq_str);
9096 apply_cigar (aln, seq_str, ' N' );
9197
92- if (seq_str.size () != width) {
93- throw runtime_error (" bad sam record format: " + to_string (hdr, aln));
94- }
98+ if (std::size (seq_str) != width)
99+ throw dnmt_error (" bad sam record format: " + to_string (hdr, aln));
95100
96101 // get the first cpg site equal to or large than seq_start
97102 auto cpg_itr = lower_bound (begin (cpgs), end (cpgs), seq_start);
98103 auto first_cpg_itr = end (cpgs);
99104
100- if (cpg_itr == end (cpgs)) {
101- return false ;
102- } else {
103- for (; cpg_itr != end (cpgs) && *cpg_itr < seq_end; cpg_itr++) {
104- const char x = seq_str[*cpg_itr - seq_start];
105- states += (x == ' C' ) ? ' C' : ((x == ' T' ) ? ' T' : ' N' );
106- if (first_cpg_itr == end (cpgs))
107- first_cpg_itr = cpg_itr;
108- }
105+ if (cpg_itr == end (cpgs)) return false ;
106+
107+ for (; cpg_itr != end (cpgs) && *cpg_itr < seq_end; cpg_itr++) {
108+ const char x = seq_str[*cpg_itr - seq_start];
109+ states += (x == ' T' ) ? ' T' : ((x == ' C' ) ? ' C' : ' N' );
110+ if (first_cpg_itr == end (cpgs)) first_cpg_itr = cpg_itr;
109111 }
110112
111- if (first_cpg_itr != end (cpgs)) {
113+ if (first_cpg_itr != end (cpgs))
112114 first_cpg_index = distance (begin (cpgs), first_cpg_itr);
113- }
114115
115116 return states.find_first_of (" CT" ) != string::npos;
116117}
117118
118-
119119static bool
120- convert_meth_states_neg (const vector<size_t > &cpgs,
121- const bamxx::bam_header &hdr,
122- const bam_rec &aln,
123- size_t &first_cpg_index, string &states) {
120+ convert_meth_states_neg (const vector<uint64_t > &cpgs,
121+ const bamxx::bam_header &hdr, const bam_rec &aln,
122+ uint64_t &first_cpg_index, string &states) {
124123 /* ADS: the "revcomp" on the read sequence is needed for the cigar
125124 to be applied, since the cigar is relative to the genome
126125 coordinates and not the read's sequence. But the read sequence
@@ -131,9 +130,9 @@ convert_meth_states_neg(const vector<size_t> &cpgs,
131130
132131 states.clear ();
133132
134- const size_t seq_start = get_pos (aln);
135- const size_t width = rlen_from_cigar (aln);
136- const size_t seq_end = seq_start + width;
133+ const uint64_t seq_start = get_pos (aln);
134+ const uint64_t width = rlen_from_cigar (aln);
135+ const uint64_t seq_end = seq_start + width;
137136
138137 string orig_seq;
139138 get_seq_str (aln, orig_seq);
@@ -143,25 +142,22 @@ convert_meth_states_neg(const vector<size_t> &cpgs,
143142 revcomp_copy (begin (orig_seq), end (orig_seq), begin (seq_str));
144143 apply_cigar (aln, seq_str, ' N' );
145144
146- if (seq_str.size () != width) {
147- throw runtime_error (" bad sam record format: " + to_string (hdr, aln));
148- }
145+ if (seq_str.size () != width)
146+ throw dnmt_error (" bad sam record format: " + to_string (hdr, aln));
149147
150148 // get the first cpg site equal to or large than seq_start - 1
151149 // the -1 is because we look for G in the read corresponding to a
152150 // CpG in chromosome, which are indexed in cpgs based on the position of C
153- auto cpg_itr = lower_bound ( begin (cpgs), end (cpgs),
154- seq_start > 0 ? seq_start - 1 : 0 );
151+ auto cpg_itr =
152+ lower_bound ( begin (cpgs), end (cpgs), seq_start > 0 ? seq_start - 1 : 0 );
155153 auto first_cpg_itr = end (cpgs);
156154
157- if (cpg_itr == end (cpgs)) {
158- return false ;
159- } else {
155+ if (cpg_itr == end (cpgs)) { return false ; }
156+ else {
160157 for (; cpg_itr != end (cpgs) && *cpg_itr < seq_end - 1 ; cpg_itr++) {
161158 const char x = seq_str[*cpg_itr - seq_start + 1 ];
162159 states += (x == ' G' ) ? ' C' : ((x == ' A' ) ? ' T' : ' N' );
163- if (first_cpg_itr == end (cpgs))
164- first_cpg_itr = cpg_itr;
160+ if (first_cpg_itr == end (cpgs)) first_cpg_itr = cpg_itr;
165161 }
166162 }
167163
@@ -173,26 +169,19 @@ convert_meth_states_neg(const vector<size_t> &cpgs,
173169}
174170
175171static void
176- get_chrom (const string &chrom_name,
177- const vector<string> &all_chroms,
178- const unordered_map<string, size_t > &chrom_lookup,
179- string &chrom) {
180-
172+ get_chrom (const string &chrom_name, const vector<string> &all_chroms,
173+ const unordered_map<string, uint64_t > &chrom_lookup, string &chrom) {
181174 auto the_chrom = chrom_lookup.find (chrom_name);
182175 if (the_chrom == end (chrom_lookup))
183- throw runtime_error (" could not find chrom: " + chrom_name);
176+ throw dnmt_error (" could not find chrom: " + chrom_name);
184177
185178 chrom = all_chroms[the_chrom->second ];
186- if (chrom.empty ())
187- throw runtime_error (" problem with chrom: " + chrom_name);
179+ if (chrom.empty ()) throw dnmt_error (" problem with chrom: " + chrom_name);
188180}
189181
190-
191182int
192183main_methstates (int argc, const char **argv) {
193-
194184 try {
195-
196185 const string description =
197186 " Convert mapped reads in SAM format into a format that indicates binary \
198187 sequences of methylation states in each read, indexed by the identity \
@@ -203,6 +192,7 @@ main_methstates(int argc, const char **argv) {
203192 sequences are loaded at once." ;
204193
205194 bool VERBOSE = false ;
195+ bool compress_output = false ;
206196
207197 string chrom_file;
208198 string outfile;
@@ -211,10 +201,11 @@ main_methstates(int argc, const char **argv) {
211201 /* ***************** COMMAND LINE OPTIONS ********************/
212202 OptionParser opt_parse (argv[0 ], description, " <sam-file>" );
213203 opt_parse.add_opt (" output" , ' o' , " output file name" , false , outfile);
214- opt_parse.add_opt (" chrom" , ' c' , " fasta format reference genome file" ,
215- true , chrom_file);
216- opt_parse.add_opt (" threads" , ' t' , " threads to use for reading input" ,
217- false , n_threads);
204+ opt_parse.add_opt (" chrom" , ' c' , " fasta format reference genome file" , true ,
205+ chrom_file);
206+ opt_parse.add_opt (" threads" , ' t' , " threads to use for reading input" , false ,
207+ n_threads);
208+ opt_parse.add_opt (" zip" , ' z' , " output gzip format" , false , compress_output);
218209 opt_parse.add_opt (" verbose" , ' v' , " print more run info" , false , VERBOSE);
219210
220211 vector<string> leftover_args;
@@ -239,24 +230,22 @@ main_methstates(int argc, const char **argv) {
239230 const string mapped_reads_file = leftover_args.front ();
240231 /* ***************** END COMMAND LINE OPTIONS *****************/
241232
242- if (n_threads < 0 )
243- throw dnmt_error (" thread count cannot be negative" );
233+ if (n_threads < 0 ) throw dnmt_error (" thread count cannot be negative" );
244234
245235 /* first load in all the chromosome sequences and names, and make
246236 a map from chromosome name to the location of the chromosome
247237 itself */
248238 vector<string> all_chroms, chrom_names;
249239 read_fasta_file_short_names (chrom_file, chrom_names, all_chroms);
250- for (auto &&i: all_chroms)
240+ for (auto &&i : all_chroms)
251241 transform (begin (i), end (i), begin (i),
252242 [](const char c) { return std::toupper (c); });
253243
254- unordered_map<string, size_t > chrom_lookup;
255- for (size_t i = 0 ; i < chrom_names.size (); ++i)
244+ unordered_map<string, uint64_t > chrom_lookup;
245+ for (uint64_t i = 0 ; i < chrom_names.size (); ++i)
256246 chrom_lookup[chrom_names[i]] = i;
257247
258- if (VERBOSE)
259- cerr << " n_chroms: " << all_chroms.size () << endl;
248+ if (VERBOSE) cerr << " n_chroms: " << all_chroms.size () << endl;
260249
261250 bamxx::bam_tpool tp (n_threads); // declared first; destroyed last
262251
@@ -266,53 +255,56 @@ main_methstates(int argc, const char **argv) {
266255 if (!hdr) throw dnmt_error (" cannot read heade" + mapped_reads_file);
267256
268257 /* set the threads for the input file decompression */
269- if (n_threads > 1 )
270- tp.set_io (in);
258+ if (n_threads > 1 ) tp.set_io (in);
271259
272- std::ofstream of;
273- if (!outfile.empty ()) of.open (outfile.c_str ());
274- std::ostream out (outfile.empty () ? cout.rdbuf () : of.rdbuf ());
260+ // open the output file
261+ const string output_mode = compress_output ? " w" : " wu" ;
262+ bamxx::bgzf_file out (outfile, output_mode);
263+ if (!out) throw dnmt_error (" error opening output file: " + outfile);
275264
276- // for the current chrom, this maps cpg index to cpg position in
277- // the chrom
278- // unordered_map<size_t, size_t> cpgs;
279- vector<size_t > cpgs;
265+ vector<uint64_t > cpgs;
280266
281267 unordered_set<string> chroms_seen;
282268 string chrom_name;
283269 string chrom;
284270
271+ quick_buf buf;
272+
285273 // iterate over records/reads in the SAM file, sequentially
286274 // processing each before considering the next
287275 bam_rec aln;
288276 while (in.read (hdr, aln)) {
289-
290277 // get the correct chrom if it has changed
291278 if (string (sam_hdr_tid2name (hdr, aln)) != chrom_name) {
292279 chrom_name = sam_hdr_tid2name (hdr, aln);
293280
294281 // make sure all reads from same chrom are contiguous in the file
295282 if (chroms_seen.find (chrom_name) != end (chroms_seen))
296- throw runtime_error (" chroms out of order (check SAM file sorted)" );
283+ throw dnmt_error (" chroms out of order (check SAM file sorted)" );
297284
298- if (VERBOSE)
299- cerr << " processing " << chrom_name << endl;
285+ if (VERBOSE) cerr << " processing " << chrom_name << endl;
300286
301287 get_chrom (chrom_name, all_chroms, chrom_lookup, chrom);
302288 collect_cpgs (chrom, cpgs);
303289 }
304290
305- size_t first_cpg_index = std::numeric_limits<size_t >::max ();
291+ uint64_t first_cpg_index = std::numeric_limits<uint64_t >::max ();
306292 string seq;
307293
308- const bool has_cpgs = bam_is_rev (aln) ?
309- convert_meth_states_neg (cpgs, hdr, aln, first_cpg_index, seq) :
310- convert_meth_states_pos (cpgs, hdr, aln, first_cpg_index, seq);
294+ const bool has_cpgs =
295+ bam_is_rev (aln)
296+ ? convert_meth_states_neg (cpgs, hdr, aln, first_cpg_index, seq)
297+ : convert_meth_states_pos (cpgs, hdr, aln, first_cpg_index, seq);
311298
312- if (has_cpgs)
313- out << sam_hdr_tid2name (hdr, aln) << ' \t '
314- << first_cpg_index << ' \t '
299+ if (has_cpgs) {
300+ buf. clear ();
301+ buf << sam_hdr_tid2name (hdr, aln) << ' \t ' << first_cpg_index << ' \t '
315302 << seq << ' \n ' ;
303+ if (!out.write (buf.c_str (), buf.tellp ())) {
304+ cerr << " failure writing output" << endl;
305+ return EXIT_FAILURE;
306+ }
307+ }
316308 }
317309 }
318310 catch (const std::exception &e) {
0 commit comments