@@ -43,11 +43,97 @@ using std::ios_base;
4343using std::runtime_error;
4444using std::ifstream;
4545using std::unordered_map;
46+ using std::tuple;
47+ using std::tie;
4648
4749using bamxx::bgzf_file;
4850
4951namespace fs = std::filesystem;
5052
53+ struct quick_buf : public std ::ostringstream,
54+ public std::basic_stringbuf<char > {
55+ // ADS: By user ecatmur on SO; very fast. Seems to work...
56+ quick_buf () {
57+ // ...but this seems to depend on data layout
58+ static_cast <std::basic_ios<char >&>(*this ).rdbuf (this );
59+ }
60+ void clear () {
61+ // reset buffer pointers (member functions)
62+ setp (pbase (), pbase ());
63+ }
64+ char const * c_str () {
65+ /* between c_str and insertion make sure to clear() */
66+ *pptr () = ' \0 ' ;
67+ return pbase ();
68+ }
69+ };
70+
71+ struct selectsites_summary {
72+
73+ // command_line is the command used to produce this summary file and
74+ // the corresponding results
75+ string command_line{};
76+
77+ // n_target_regions is the number of target regions provided as
78+ // input to the command
79+ uint64_t n_target_regions{};
80+
81+ // target_region_size is the sum of the sizes of each target region
82+ uint64_t target_region_size{};
83+
84+ // n_target_regions_collapsed is the number of target regions after
85+ // having collapsed the input target regions merging those that
86+ // overlap
87+ uint64_t n_target_regions_collapsed{};
88+
89+ // target_region_collapsed_size is the sum of the sizes of target
90+ // regions after collapsing
91+ uint64_t target_region_collapsed_size{};
92+
93+ // n_sites_total is the total number of sites available in the input
94+ // counts file. This value is displayed as zero if the command
95+ // specified to process the sites on disk.
96+ uint64_t n_sites_total{};
97+
98+ // n_sites_selected is the total number of sites in the output file
99+ uint64_t n_sites_selected{};
100+
101+ template <typename T> static auto measure_target_regions (const T &t)
102+ -> tuple<uint64_t, uint64_t> {
103+ return {
104+ std::size (t),
105+ accumulate (cbegin (t), cend (t), 0ul ,
106+ [](const uint64_t a, const typename T::value_type &v) {
107+ return a + v.get_width ();
108+ }),
109+ };
110+ }
111+
112+ auto tostring () const -> string {
113+ std::ostringstream oss;
114+ oss << " command_line: " << command_line << ' \n '
115+ << " n_target_regions: " << n_target_regions << ' \n '
116+ << " target_region_size: " << target_region_size << ' \n '
117+ << " n_target_regions_collapsed: " << n_target_regions_collapsed << ' \n '
118+ << " target_region_collapsed_size: " << target_region_collapsed_size
119+ << ' \n '
120+ << " n_sites_total: " << n_sites_total << ' \n '
121+ << " n_sites_selected: " << n_sites_selected;
122+ return oss.str ();
123+ }
124+ };
125+
126+ static auto
127+ write_stats_output (const selectsites_summary &summary,
128+ const string &summary_file) -> void {
129+ if (!summary_file.empty ()) {
130+ std::ofstream out_summary (summary_file);
131+ if (!out_summary) throw runtime_error (" bad summary output file" );
132+ out_summary << summary.tostring () << endl;
133+ }
134+ }
135+
136+
51137static void
52138collapsebed (vector<GenomicRegion> ®ions) {
53139 size_t j = 0 ;
@@ -73,17 +159,21 @@ contains(const GenomicRegion &r, const MSite &s) {
73159 (r.get_start () <= s.pos && s.pos < r.get_end ()));
74160}
75161
76- template < class T > static void
162+ static auto
77163process_all_sites (const bool VERBOSE, const string &sites_file,
78164 const unordered_map<string, vector<GenomicRegion>> ®ions,
79- T &out) {
165+ bgzf_file &out) -> tuple<uint64_t, uint64_t> {
80166 bgzf_file in (sites_file, " r" );
81167 if (!in) throw runtime_error (" cannot open file: " + sites_file);
82168
169+ uint64_t n_sites_total = 0u ;
170+ uint64_t n_sites_selected = 0u ;
171+
83172 MSite the_site, prev_site;
84173 vector<GenomicRegion>::const_iterator i, i_lim;
85174 bool chrom_is_relevant = false ;
86175 while (read_site (in, the_site)) {
176+ ++n_sites_total;
87177 if (the_site.chrom != prev_site.chrom ) {
88178 if (VERBOSE) cerr << " processing " << the_site.chrom << endl;
89179 const auto r = regions.find (the_site.chrom );
@@ -96,41 +186,58 @@ process_all_sites(const bool VERBOSE, const string &sites_file,
96186 if (chrom_is_relevant) {
97187 while (i != i_lim && precedes (*i, the_site))
98188 ++i;
99- if (i != i_lim && contains (*i, the_site))
189+ if (i != i_lim && contains (*i, the_site)) {
190+ ++n_sites_selected;
100191 write_site (out, the_site);
192+ }
101193 }
102194 std::swap (prev_site, the_site);
103195 }
196+ return {n_sites_selected, n_sites_total};
104197}
105198
106- static void
199+ static auto
107200get_sites_in_region (ifstream &site_in, const GenomicRegion ®ion,
108- std::ostream &out) {
201+ bgzf_file &out) -> uint64_t {
202+ quick_buf buf;
203+
109204 const string chrom{region.get_chrom ()};
110205 const size_t start_pos = region.get_start ();
111206 const size_t end_pos = region.get_end ();
112207 find_offset_for_msite (chrom, start_pos, site_in);
113208
114209 MSite the_site;
210+ uint64_t n_sites_selected = 0u ;
115211 // ADS: should only happen once that "the_site.chrom < chrom" and
116212 // this is only needed because of end state of binary search on disk
117213 while (site_in >> the_site &&
118214 (the_site.chrom < chrom ||
119215 (the_site.chrom == chrom && the_site.pos < end_pos)))
120- if (start_pos <= the_site.pos ) out << the_site << endl;
216+ if (start_pos <= the_site.pos ) {
217+ // ADS: don't like this -- always needing to "clear()" this
218+ // struct is bad...
219+ buf.clear ();
220+ buf << the_site << ' \n ' ;
221+ if (!out.write (buf.c_str (), buf.tellp ()))
222+ throw runtime_error (" error writing output" );
223+ ++n_sites_selected;
224+ }
225+ return n_sites_selected;
121226}
122227
123- static void
228+
229+ static auto
124230process_with_sites_on_disk (const string &sites_file,
125231 vector<GenomicRegion> ®ions,
126- std::ostream &out) {
127-
232+ bgzf_file &out) -> uint64_t {
128233 ifstream in (sites_file);
129234 if (!in)
130235 throw runtime_error (" cannot open file: " + sites_file);
131236
132- for (size_t i = 0 ; i < regions.size () && in; ++i)
133- get_sites_in_region (in, regions[i], out);
237+ uint64_t n_sites_selected = 0ul ;
238+ for (auto i = 0u ; i < size (regions) && in; ++i)
239+ n_sites_selected += get_sites_in_region (in, regions[i], out);
240+ return n_sites_selected;
134241}
135242
136243
@@ -164,15 +271,28 @@ is_compressed_file(const string &filename) {
164271}
165272
166273
274+ static auto
275+ get_command_line (const int argc, const char **const argv) -> string {
276+ if (argc == 0 ) return string ();
277+ std::ostringstream cmd;
278+ cmd << ' "' ;
279+ copy (argv, argv + (argc - 1 ), std::ostream_iterator<const char *>(cmd, " " ));
280+ cmd << argv[argc-1 ] << ' "' ;
281+ return cmd.str ();
282+ }
283+
284+
167285int
168286main_selectsites (int argc, const char **argv) {
169287
170288 try {
171289
172290 bool VERBOSE = false ;
173- bool load_entire_file = false ;
291+ bool keep_file_on_disk = false ;
292+ bool compress_output = false ;
174293
175- string outfile;
294+ string outfile (" -" );
295+ string summary_file;
176296
177297 const string description =
178298 " Select sites inside a set of genomic intervals. "
@@ -184,9 +304,11 @@ main_selectsites(int argc, const char **argv) {
184304 " <intervals-bed> <sites-file>" , 2 );
185305 opt_parse.add_opt (" output" , ' o' , " output file (default: stdout)" ,
186306 false , outfile);
187- opt_parse.add_opt (" preload" , ' p' ,
188- " preload sites (use for large target intervals)" ,
189- false , load_entire_file);
307+ opt_parse.add_opt (" disk" , ' d' , " process sites on disk "
308+ " (fast if target intervals are few)" ,
309+ false , keep_file_on_disk);
310+ opt_parse.add_opt (" summary" , ' S' , " write summary to this file" , false , summary_file);
311+ opt_parse.add_opt (" zip" , ' z' , " output gzip format" , false , compress_output);
190312 opt_parse.add_opt (" verbose" , ' v' , " print more run info" , false , VERBOSE);
191313 opt_parse.set_show_defaults ();
192314 vector<string> leftover_args;
@@ -212,11 +334,14 @@ main_selectsites(int argc, const char **argv) {
212334 const string sites_file = leftover_args.back ();
213335 /* ***************** END COMMAND LINE OPTIONS *****************/
214336
337+ selectsites_summary summary;
338+ summary.command_line = get_command_line (argc, argv);
339+
215340 if (!fs::is_regular_file (sites_file))
216341 throw runtime_error (" bad input sites file: " + sites_file);
217342
218343 if (is_compressed_file (sites_file)) {
219- load_entire_file = true ;
344+ keep_file_on_disk = false ;
220345 if (VERBOSE)
221346 cerr << " input file is so must be loaded" << endl;
222347 }
@@ -225,34 +350,35 @@ main_selectsites(int argc, const char **argv) {
225350 ReadBEDFile (regions_file, regions);
226351 if (!check_sorted (regions))
227352 throw runtime_error (" regions not sorted in file: " + regions_file);
353+ std::tie (summary.n_target_regions , summary.target_region_size ) =
354+ selectsites_summary::measure_target_regions (regions);
228355
229356 const size_t n_orig_regions = regions.size ();
230357 collapsebed (regions);
231358 if (VERBOSE && n_orig_regions != regions.size ())
232359 cerr << " [number of regions merged due to overlap: "
233360 << n_orig_regions - regions.size () << " ]" << endl;
234361
362+ std::tie (summary.n_target_regions_collapsed ,
363+ summary.target_region_collapsed_size ) =
364+ selectsites_summary::measure_target_regions (regions);
365+
235366 unordered_map<string, vector<GenomicRegion>> regions_lookup;
236- if ((outfile.empty () || !has_gz_ext (outfile)) && load_entire_file)
237- regions_by_chrom (regions, regions_lookup);
367+ if (!keep_file_on_disk) regions_by_chrom (regions, regions_lookup);
238368
239- if (outfile.empty () || !has_gz_ext (outfile)) {
240- std::ofstream of;
241- if (!outfile.empty ()) of.open (outfile);
242- std::ostream out (outfile.empty () ? cout.rdbuf () : of.rdbuf ());
243- if (!outfile.empty () && !out)
244- throw runtime_error (" failed to open output file: " + outfile);
369+ // open the output file
370+ const string output_mode = compress_output ? " w" : " wu" ;
371+ bamxx::bgzf_file out (outfile, output_mode);
372+ if (!out) throw runtime_error (" error opening output file: " + outfile);
245373
246- if (load_entire_file)
247- process_all_sites (VERBOSE, sites_file, regions_lookup, out);
248- else
374+ if (keep_file_on_disk)
375+ summary.n_sites_selected =
249376 process_with_sites_on_disk (sites_file, regions, out);
250- }
251- else {
252- // not supporting search on disk for gz file
253- bgzf_file out (outfile, " w" );
254- process_all_sites (VERBOSE, sites_file, regions_lookup, out);
255- }
377+ else
378+ std::tie (summary.n_sites_selected , summary.n_sites_total ) =
379+ process_all_sites (VERBOSE, sites_file, regions_lookup, out);
380+
381+ write_stats_output (summary, summary_file);
256382 }
257383 catch (const runtime_error &e) {
258384 cerr << e.what () << endl;
0 commit comments