Skip to content

Commit 6e9eac5

Browse files
Merge pull request #146 from smithlabcode/selectsites-input-file-type
Selectsites input file type (bug fix)
2 parents 00821e0 + 19abe1e commit 6e9eac5

File tree

1 file changed

+23
-35
lines changed

1 file changed

+23
-35
lines changed

src/utils/selectsites.cpp

Lines changed: 23 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -25,7 +25,7 @@
2525
#include <algorithm>
2626
#include <numeric>
2727
#include <unordered_map>
28-
28+
#include <filesystem>
2929
#include <bamxx.hpp>
3030

3131
#include "OptionParser.hpp"
@@ -46,43 +46,37 @@ using std::unordered_map;
4646

4747
using bamxx::bgzf_file;
4848

49+
namespace fs = std::filesystem;
50+
4951
static void
5052
collapsebed(vector<GenomicRegion> &regions) {
5153
size_t j = 0;
5254
for (size_t i = 1; i < regions.size(); ++i) {
5355
if (regions[j].same_chrom(regions[i]) &&
5456
regions[i].get_start() <= regions[j].get_end()) {
55-
regions[j].set_end(std::max(regions[j].get_end(),
56-
regions[i].get_end()));
57-
}
58-
else {
59-
regions[++j] = regions[i];
57+
regions[j].set_end(std::max(regions[j].get_end(), regions[i].get_end()));
6058
}
59+
else { regions[++j] = regions[i]; }
6160
}
6261
regions.erase(begin(regions) + j + 1, end(regions));
6362
}
6463

65-
static bool
64+
static inline bool
6665
precedes(const GenomicRegion &r, const MSite &s) {
6766
return (r.get_chrom() < s.chrom ||
6867
(r.get_chrom() == s.chrom && r.get_end() <= s.pos));
6968
}
7069

71-
72-
static bool
70+
static inline bool
7371
contains(const GenomicRegion &r, const MSite &s) {
7472
return (r.get_chrom() == s.chrom &&
7573
(r.get_start() <= s.pos && s.pos < r.get_end()));
7674
}
7775

78-
79-
template <class T>
80-
static void
81-
process_all_sites(const bool VERBOSE,
82-
const string &sites_file,
76+
template<class T> static void
77+
process_all_sites(const bool VERBOSE, const string &sites_file,
8378
const unordered_map<string, vector<GenomicRegion>> &regions,
8479
T &out) {
85-
8680
bgzf_file in(sites_file, "r");
8781
if (!in) throw runtime_error("cannot open file: " + sites_file);
8882

@@ -91,32 +85,28 @@ process_all_sites(const bool VERBOSE,
9185
bool chrom_is_relevant = false;
9286
while (read_site(in, the_site)) {
9387
if (the_site.chrom != prev_site.chrom) {
94-
if (VERBOSE)
95-
cerr << "processing " << the_site.chrom << endl;
96-
auto r = regions.find(the_site.chrom);
97-
chrom_is_relevant = (r != end(regions));
88+
if (VERBOSE) cerr << "processing " << the_site.chrom << endl;
89+
const auto r = regions.find(the_site.chrom);
90+
chrom_is_relevant = (r != cend(regions));
9891
if (chrom_is_relevant) {
99-
i = begin(r->second);
100-
i_lim = end(r->second);
92+
i = cbegin(r->second);
93+
i_lim = cend(r->second);
10194
}
10295
}
10396
if (chrom_is_relevant) {
10497
while (i != i_lim && precedes(*i, the_site))
10598
++i;
106-
107-
if (contains(*i, the_site))
99+
if (i != i_lim && contains(*i, the_site))
108100
write_site(out, the_site);
109101
}
110102
std::swap(prev_site, the_site);
111103
}
112104
}
113105

114-
115106
static void
116107
get_sites_in_region(ifstream &site_in, const GenomicRegion &region,
117108
std::ostream &out) {
118-
119-
string chrom(region.get_chrom());
109+
const string chrom{region.get_chrom()};
120110
const size_t start_pos = region.get_start();
121111
const size_t end_pos = region.get_end();
122112
find_offset_for_msite(chrom, start_pos, site_in);
@@ -127,11 +117,9 @@ get_sites_in_region(ifstream &site_in, const GenomicRegion &region,
127117
while (site_in >> the_site &&
128118
(the_site.chrom < chrom ||
129119
(the_site.chrom == chrom && the_site.pos < end_pos)))
130-
if (start_pos <= the_site.pos)
131-
out << the_site << endl;
120+
if (start_pos <= the_site.pos) out << the_site << endl;
132121
}
133122

134-
135123
static void
136124
process_with_sites_on_disk(const string &sites_file,
137125
vector<GenomicRegion> &regions,
@@ -182,7 +170,7 @@ main_selectsites(int argc, const char **argv) {
182170
try {
183171

184172
bool VERBOSE = false;
185-
bool LOAD_ENTIRE_FILE = false;
173+
bool load_entire_file = false;
186174

187175
string outfile;
188176

@@ -198,7 +186,7 @@ main_selectsites(int argc, const char **argv) {
198186
false, outfile);
199187
opt_parse.add_opt("preload", 'p',
200188
"preload sites (use for large target intervals)",
201-
false, LOAD_ENTIRE_FILE);
189+
false, load_entire_file);
202190
opt_parse.add_opt("verbose", 'v', "print more run info", false, VERBOSE);
203191
opt_parse.set_show_defaults();
204192
vector<string> leftover_args;
@@ -224,11 +212,11 @@ main_selectsites(int argc, const char **argv) {
224212
const string sites_file = leftover_args.back();
225213
/****************** END COMMAND LINE OPTIONS *****************/
226214

227-
if (isdir(sites_file.c_str()) || !file_exists(sites_file))
215+
if (!fs::is_regular_file(sites_file))
228216
throw runtime_error("bad input sites file: " + sites_file);
229217

230218
if (is_compressed_file(sites_file)) {
231-
LOAD_ENTIRE_FILE = true;
219+
load_entire_file = true;
232220
if (VERBOSE)
233221
cerr << "input file is so must be loaded" << endl;
234222
}
@@ -245,7 +233,7 @@ main_selectsites(int argc, const char **argv) {
245233
<< n_orig_regions - regions.size() << "]" << endl;
246234

247235
unordered_map<string, vector<GenomicRegion>> regions_lookup;
248-
if ((outfile.empty() || !has_gz_ext(outfile)) && LOAD_ENTIRE_FILE)
236+
if ((outfile.empty() || !has_gz_ext(outfile)) && load_entire_file)
249237
regions_by_chrom(regions, regions_lookup);
250238

251239
if (outfile.empty() || !has_gz_ext(outfile)) {
@@ -255,7 +243,7 @@ main_selectsites(int argc, const char **argv) {
255243
if (!outfile.empty() && !out)
256244
throw runtime_error("failed to open output file: " + outfile);
257245

258-
if (LOAD_ENTIRE_FILE)
246+
if (load_entire_file)
259247
process_all_sites(VERBOSE, sites_file, regions_lookup, out);
260248
else
261249
process_with_sites_on_disk(sites_file, regions, out);

0 commit comments

Comments
 (0)