Skip to content

Commit 04261d0

Browse files
selectsites: adding a summary output file and making output code more uniform using bgzf output format even when not compressed
1 parent f4a445c commit 04261d0

File tree

1 file changed

+160
-34
lines changed

1 file changed

+160
-34
lines changed

src/utils/selectsites.cpp

Lines changed: 160 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -43,11 +43,97 @@ using std::ios_base;
4343
using std::runtime_error;
4444
using std::ifstream;
4545
using std::unordered_map;
46+
using std::tuple;
47+
using std::tie;
4648

4749
using bamxx::bgzf_file;
4850

4951
namespace fs = std::filesystem;
5052

53+
struct quick_buf : public std::ostringstream,
54+
public std::basic_stringbuf<char> {
55+
// ADS: By user ecatmur on SO; very fast. Seems to work...
56+
quick_buf() {
57+
// ...but this seems to depend on data layout
58+
static_cast<std::basic_ios<char>&>(*this).rdbuf(this);
59+
}
60+
void clear() {
61+
// reset buffer pointers (member functions)
62+
setp(pbase(), pbase());
63+
}
64+
char const* c_str() {
65+
/* between c_str and insertion make sure to clear() */
66+
*pptr() = '\0';
67+
return pbase();
68+
}
69+
};
70+
71+
struct selectsites_summary {
72+
73+
// command_line is the command used to produce this summary file and
74+
// the corresponding results
75+
string command_line{};
76+
77+
// n_target_regions is the number of target regions provided as
78+
// input to the command
79+
uint64_t n_target_regions{};
80+
81+
// target_region_size is the sum of the sizes of each target region
82+
uint64_t target_region_size{};
83+
84+
// n_target_regions_collapsed is the number of target regions after
85+
// having collapsed the input target regions merging those that
86+
// overlap
87+
uint64_t n_target_regions_collapsed{};
88+
89+
// target_region_collapsed_size is the sum of the sizes of target
90+
// regions after collapsing
91+
uint64_t target_region_collapsed_size{};
92+
93+
// n_sites_total is the total number of sites available in the input
94+
// counts file. This value is displayed as zero if the command
95+
// specified to process the sites on disk.
96+
uint64_t n_sites_total{};
97+
98+
// n_sites_selected is the total number of sites in the output file
99+
uint64_t n_sites_selected{};
100+
101+
template<typename T> static auto measure_target_regions(const T &t)
102+
-> tuple<uint64_t, uint64_t> {
103+
return {
104+
std::size(t),
105+
accumulate(cbegin(t), cend(t), 0ul,
106+
[](const uint64_t a, const typename T::value_type &v) {
107+
return a + v.get_width();
108+
}),
109+
};
110+
}
111+
112+
auto tostring() const -> string {
113+
std::ostringstream oss;
114+
oss << "command_line: " << command_line << '\n'
115+
<< "n_target_regions: " << n_target_regions << '\n'
116+
<< "target_region_size: " << target_region_size << '\n'
117+
<< "n_target_regions_collapsed: " << n_target_regions_collapsed << '\n'
118+
<< "target_region_collapsed_size: " << target_region_collapsed_size
119+
<< '\n'
120+
<< "n_sites_total: " << n_sites_total << '\n'
121+
<< "n_sites_selected: " << n_sites_selected;
122+
return oss.str();
123+
}
124+
};
125+
126+
static auto
127+
write_stats_output(const selectsites_summary &summary,
128+
const string &summary_file) -> void {
129+
if (!summary_file.empty()) {
130+
std::ofstream out_summary(summary_file);
131+
if (!out_summary) throw runtime_error("bad summary output file");
132+
out_summary << summary.tostring() << endl;
133+
}
134+
}
135+
136+
51137
static void
52138
collapsebed(vector<GenomicRegion> &regions) {
53139
size_t j = 0;
@@ -73,17 +159,21 @@ contains(const GenomicRegion &r, const MSite &s) {
73159
(r.get_start() <= s.pos && s.pos < r.get_end()));
74160
}
75161

76-
template<class T> static void
162+
static auto
77163
process_all_sites(const bool VERBOSE, const string &sites_file,
78164
const unordered_map<string, vector<GenomicRegion>> &regions,
79-
T &out) {
165+
bgzf_file &out) -> tuple<uint64_t, uint64_t> {
80166
bgzf_file in(sites_file, "r");
81167
if (!in) throw runtime_error("cannot open file: " + sites_file);
82168

169+
uint64_t n_sites_total = 0u;
170+
uint64_t n_sites_selected = 0u;
171+
83172
MSite the_site, prev_site;
84173
vector<GenomicRegion>::const_iterator i, i_lim;
85174
bool chrom_is_relevant = false;
86175
while (read_site(in, the_site)) {
176+
++n_sites_total;
87177
if (the_site.chrom != prev_site.chrom) {
88178
if (VERBOSE) cerr << "processing " << the_site.chrom << endl;
89179
const auto r = regions.find(the_site.chrom);
@@ -96,41 +186,58 @@ process_all_sites(const bool VERBOSE, const string &sites_file,
96186
if (chrom_is_relevant) {
97187
while (i != i_lim && precedes(*i, the_site))
98188
++i;
99-
if (i != i_lim && contains(*i, the_site))
189+
if (i != i_lim && contains(*i, the_site)) {
190+
++n_sites_selected;
100191
write_site(out, the_site);
192+
}
101193
}
102194
std::swap(prev_site, the_site);
103195
}
196+
return {n_sites_selected, n_sites_total};
104197
}
105198

106-
static void
199+
static auto
107200
get_sites_in_region(ifstream &site_in, const GenomicRegion &region,
108-
std::ostream &out) {
201+
bgzf_file &out) -> uint64_t {
202+
quick_buf buf;
203+
109204
const string chrom{region.get_chrom()};
110205
const size_t start_pos = region.get_start();
111206
const size_t end_pos = region.get_end();
112207
find_offset_for_msite(chrom, start_pos, site_in);
113208

114209
MSite the_site;
210+
uint64_t n_sites_selected = 0u;
115211
// ADS: should only happen once that "the_site.chrom < chrom" and
116212
// this is only needed because of end state of binary search on disk
117213
while (site_in >> the_site &&
118214
(the_site.chrom < chrom ||
119215
(the_site.chrom == chrom && the_site.pos < end_pos)))
120-
if (start_pos <= the_site.pos) out << the_site << endl;
216+
if (start_pos <= the_site.pos) {
217+
// ADS: don't like this -- always needing to "clear()" this
218+
// struct is bad...
219+
buf.clear();
220+
buf << the_site << '\n';
221+
if (!out.write(buf.c_str(), buf.tellp()))
222+
throw runtime_error("error writing output");
223+
++n_sites_selected;
224+
}
225+
return n_sites_selected;
121226
}
122227

123-
static void
228+
229+
static auto
124230
process_with_sites_on_disk(const string &sites_file,
125231
vector<GenomicRegion> &regions,
126-
std::ostream &out) {
127-
232+
bgzf_file &out) -> uint64_t {
128233
ifstream in(sites_file);
129234
if (!in)
130235
throw runtime_error("cannot open file: " + sites_file);
131236

132-
for (size_t i = 0; i < regions.size() && in; ++i)
133-
get_sites_in_region(in, regions[i], out);
237+
uint64_t n_sites_selected = 0ul;
238+
for (auto i = 0u; i < size(regions) && in; ++i)
239+
n_sites_selected += get_sites_in_region(in, regions[i], out);
240+
return n_sites_selected;
134241
}
135242

136243

@@ -164,15 +271,28 @@ is_compressed_file(const string &filename) {
164271
}
165272

166273

274+
static auto
275+
get_command_line(const int argc, const char **const argv) -> string {
276+
if (argc == 0) return string();
277+
std::ostringstream cmd;
278+
cmd << '"';
279+
copy(argv, argv + (argc - 1), std::ostream_iterator<const char *>(cmd, " "));
280+
cmd << argv[argc-1] << '"';
281+
return cmd.str();
282+
}
283+
284+
167285
int
168286
main_selectsites(int argc, const char **argv) {
169287

170288
try {
171289

172290
bool VERBOSE = false;
173-
bool load_entire_file = false;
291+
bool keep_file_on_disk = false;
292+
bool compress_output = false;
174293

175-
string outfile;
294+
string outfile("-");
295+
string summary_file;
176296

177297
const string description =
178298
"Select sites inside a set of genomic intervals. "
@@ -184,9 +304,11 @@ main_selectsites(int argc, const char **argv) {
184304
"<intervals-bed> <sites-file>", 2);
185305
opt_parse.add_opt("output", 'o', "output file (default: stdout)",
186306
false, outfile);
187-
opt_parse.add_opt("preload", 'p',
188-
"preload sites (use for large target intervals)",
189-
false, load_entire_file);
307+
opt_parse.add_opt("disk", 'd', "process sites on disk "
308+
"(fast if target intervals are few)",
309+
false, keep_file_on_disk);
310+
opt_parse.add_opt("summary", 'S', "write summary to this file", false, summary_file);
311+
opt_parse.add_opt("zip", 'z', "output gzip format", false, compress_output);
190312
opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE);
191313
opt_parse.set_show_defaults();
192314
vector<string> leftover_args;
@@ -212,11 +334,14 @@ main_selectsites(int argc, const char **argv) {
212334
const string sites_file = leftover_args.back();
213335
/****************** END COMMAND LINE OPTIONS *****************/
214336

337+
selectsites_summary summary;
338+
summary.command_line = get_command_line(argc, argv);
339+
215340
if (!fs::is_regular_file(sites_file))
216341
throw runtime_error("bad input sites file: " + sites_file);
217342

218343
if (is_compressed_file(sites_file)) {
219-
load_entire_file = true;
344+
keep_file_on_disk = false;
220345
if (VERBOSE)
221346
cerr << "input file is so must be loaded" << endl;
222347
}
@@ -225,34 +350,35 @@ main_selectsites(int argc, const char **argv) {
225350
ReadBEDFile(regions_file, regions);
226351
if (!check_sorted(regions))
227352
throw runtime_error("regions not sorted in file: " + regions_file);
353+
std::tie(summary.n_target_regions, summary.target_region_size) =
354+
selectsites_summary::measure_target_regions(regions);
228355

229356
const size_t n_orig_regions = regions.size();
230357
collapsebed(regions);
231358
if (VERBOSE && n_orig_regions != regions.size())
232359
cerr << "[number of regions merged due to overlap: "
233360
<< n_orig_regions - regions.size() << "]" << endl;
234361

362+
std::tie(summary.n_target_regions_collapsed,
363+
summary.target_region_collapsed_size) =
364+
selectsites_summary::measure_target_regions(regions);
365+
235366
unordered_map<string, vector<GenomicRegion>> regions_lookup;
236-
if ((outfile.empty() || !has_gz_ext(outfile)) && load_entire_file)
237-
regions_by_chrom(regions, regions_lookup);
367+
if (!keep_file_on_disk) regions_by_chrom(regions, regions_lookup);
238368

239-
if (outfile.empty() || !has_gz_ext(outfile)) {
240-
std::ofstream of;
241-
if (!outfile.empty()) of.open(outfile);
242-
std::ostream out(outfile.empty() ? cout.rdbuf() : of.rdbuf());
243-
if (!outfile.empty() && !out)
244-
throw runtime_error("failed to open output file: " + outfile);
369+
// open the output file
370+
const string output_mode = compress_output ? "w" : "wu";
371+
bamxx::bgzf_file out(outfile, output_mode);
372+
if (!out) throw runtime_error("error opening output file: " + outfile);
245373

246-
if (load_entire_file)
247-
process_all_sites(VERBOSE, sites_file, regions_lookup, out);
248-
else
374+
if (keep_file_on_disk)
375+
summary.n_sites_selected =
249376
process_with_sites_on_disk(sites_file, regions, out);
250-
}
251-
else {
252-
// not supporting search on disk for gz file
253-
bgzf_file out(outfile, "w");
254-
process_all_sites(VERBOSE, sites_file, regions_lookup, out);
255-
}
377+
else
378+
std::tie(summary.n_sites_selected, summary.n_sites_total) =
379+
process_all_sites(VERBOSE, sites_file, regions_lookup, out);
380+
381+
write_stats_output(summary, summary_file);
256382
}
257383
catch (const runtime_error &e) {
258384
cerr << e.what() << endl;

0 commit comments

Comments
 (0)