Skip to content

Commit b058e49

Browse files
Merge pull request #253 from smithlabcode/chrom-order-check-for-xcounts
Chrom order checks in xcounts
2 parents 85d3dce + cb04f30 commit b058e49

File tree

3 files changed

+132
-44
lines changed

3 files changed

+132
-44
lines changed

src/common/counts_header.cpp

Lines changed: 42 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -18,13 +18,13 @@
1818

1919
#include "counts_header.hpp"
2020

21-
#include <iostream>
22-
#include <fstream>
23-
#include <string>
24-
#include <vector>
2521
#include <cassert>
2622
#include <cstdint>
23+
#include <fstream>
24+
#include <iostream>
2725
#include <sstream>
26+
#include <string>
27+
#include <vector>
2828

2929
#include "bam_record_utils.hpp"
3030

@@ -34,48 +34,63 @@
3434
#include "bamxx.hpp"
3535
#include "dnmt_error.hpp"
3636

37-
using std::vector;
3837
using std::string;
3938
using std::to_string;
39+
using std::unordered_map;
40+
using std::vector;
4041

4142
using bamxx::bgzf_file;
4243

43-
void
44+
std::unordered_map<std::string, std::uint32_t>
4445
write_counts_header_from_chrom_sizes(const vector<string> &chrom_names,
4546
const vector<uint64_t> &chrom_sizes,
4647
bgzf_file &out) {
4748
const auto version = "#DNMTOOLS " + string(VERSION) + "\n";
4849
out.write(version.c_str());
50+
51+
std::unordered_map<std::string, std::uint32_t> chrom_order;
52+
std::uint32_t chrom_count = 0;
53+
4954
for (auto i = 0u; i < size(chrom_sizes); ++i) {
5055
const string tmp =
5156
"#" + chrom_names[i] + " " + to_string(chrom_sizes[i]) + "\n";
52-
out.write(tmp.c_str());
57+
out.write(tmp.data());
58+
chrom_order.emplace(chrom_names[i], chrom_count++);
5359
}
5460
out.write("#\n");
55-
}
5661

62+
return chrom_order;
63+
}
5764

58-
void
65+
std::unordered_map<std::string, std::uint32_t>
5966
write_counts_header_from_file(const string &header_file, bgzf_file &out) {
6067
std::ifstream in(header_file);
6168
if (!in.is_open()) {
62-
throw dnmt_error("failed to open header file: " + header_file);
69+
throw dnmt_error("failed to open header file: " + header_file);
6370
}
71+
72+
std::unordered_map<std::string, std::uint32_t> chrom_order;
73+
std::uint32_t chrom_count = 0;
74+
6475
string line;
65-
while(getline(in, line)) {
76+
while (getline(in, line)) {
6677
out.write(line + '\n');
78+
const auto name = line.substr(1, line.find(' ') - 1);
79+
chrom_order.emplace(name, chrom_count++);
6780
}
6881
in.close();
69-
}
7082

83+
return chrom_order;
84+
}
7185

7286
bamxx::bgzf_file &
7387
skip_counts_header(bamxx::bgzf_file &in) {
7488

7589
// use the kstring_t type to more directly use the BGZF file
7690
kstring_t line{0, 0, nullptr};
7791
const int ret = ks_resize(&line, 1024);
78-
if (ret) return in;
92+
if (ret)
93+
return in;
7994

8095
while (bamxx::getline(in, line) && line.s[0] == '#') {
8196
if (line.s[0] == '#' && line.l == 1)
@@ -86,7 +101,6 @@ skip_counts_header(bamxx::bgzf_file &in) {
86101
return in;
87102
}
88103

89-
90104
int
91105
get_chrom_sizes_for_counts_header(const uint32_t n_threads,
92106
const string &filename,
@@ -96,7 +110,8 @@ get_chrom_sizes_for_counts_header(const uint32_t n_threads,
96110
bamxx::bam_tpool tpool(n_threads);
97111

98112
bgzf_file in(filename, "r");
99-
if (!in) return -1;
113+
if (!in)
114+
return -1;
100115
if (n_threads > 1 && in.is_bgzf())
101116
tpool.set_io(in);
102117

@@ -106,18 +121,22 @@ get_chrom_sizes_for_counts_header(const uint32_t n_threads,
106121
// use the kstring_t type to more directly use the BGZF file
107122
kstring_t line{0, 0, nullptr};
108123
const int ret = ks_resize(&line, 1024);
109-
if (ret) return -1;
124+
if (ret)
125+
return -1;
110126

111127
uint64_t chrom_size = 0;
112128
while (getline(in, line)) {
113129
if (line.s[0] == '>') {
114-
if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size);
130+
if (!chrom_names.empty())
131+
chrom_sizes.push_back(chrom_size);
115132
chrom_names.emplace_back(line.s + 1);
116133
chrom_size = 0;
117134
}
118-
else chrom_size += line.l;
135+
else
136+
chrom_size += line.l;
119137
}
120-
if (!chrom_names.empty()) chrom_sizes.push_back(chrom_size);
138+
if (!chrom_names.empty())
139+
chrom_sizes.push_back(chrom_size);
121140

122141
ks_free(&line);
123142

@@ -126,7 +145,6 @@ get_chrom_sizes_for_counts_header(const uint32_t n_threads,
126145
return 0;
127146
}
128147

129-
130148
void
131149
write_counts_header_from_bam_header(const bamxx::bam_header &hdr,
132150
bgzf_file &out) {
@@ -142,7 +160,6 @@ write_counts_header_from_bam_header(const bamxx::bam_header &hdr,
142160
out.write("#\n");
143161
}
144162

145-
146163
bool
147164
write_counts_header_line(string line, bgzf_file &out) {
148165
line += '\n';
@@ -153,8 +170,10 @@ write_counts_header_line(string line, bgzf_file &out) {
153170
bool
154171
get_has_counts_header(const string &filename) {
155172
bgzf_file in(filename, "r");
156-
if (!in) return false;
173+
if (!in)
174+
return false;
157175
string line;
158-
if (!getline(in, line)) return false;
176+
if (!getline(in, line))
177+
return false;
159178
return line[0] == '#';
160179
}

src/common/counts_header.hpp

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -19,18 +19,19 @@
1919
#ifndef COUNTS_HEADER_HPP
2020
#define COUNTS_HEADER_HPP
2121

22+
#include <cstdint>
2223
#include <string>
24+
#include <unordered_map>
2325
#include <vector>
24-
#include <cstdint>
2526

2627
#include "bamxx.hpp"
2728

28-
void
29-
write_counts_header_from_chrom_sizes(const std::vector<std::string> &chrom_names,
30-
const std::vector<uint64_t> &chrom_sizes,
31-
bamxx::bgzf_file &out);
29+
std::unordered_map<std::string, std::uint32_t>
30+
write_counts_header_from_chrom_sizes(
31+
const std::vector<std::string> &chrom_names,
32+
const std::vector<uint64_t> &chrom_sizes, bamxx::bgzf_file &out);
3233

33-
void
34+
std::unordered_map<std::string, std::uint32_t>
3435
write_counts_header_from_file(const std::string &header_file,
3536
bamxx::bgzf_file &out);
3637

@@ -60,7 +61,7 @@ is_counts_header_version_line(const std::string &line) {
6061
return line.compare(0, 9, version_line) == 0;
6162
}
6263

63-
template<typename T>
64+
template <typename T>
6465
inline bool
6566
is_counts_header_line(T &line) {
6667
return line[0] == '#';

src/utils/xcounts.cpp

Lines changed: 82 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include <stdexcept>
2424
#include <string>
2525
#include <system_error>
26+
#include <type_traits> // std::underlying_type_t
2627
#include <vector>
2728

2829
// from smithlab_cpp
@@ -45,6 +46,49 @@ using std::vector;
4546

4647
using bamxx::bgzf_file;
4748

49+
enum class xcounts_err {
50+
// clang-format off
51+
ok = 0,
52+
open_failure = 1,
53+
header_failure = 2,
54+
chromosome_not_found = 3,
55+
inconsistent_chromosome_order = 4,
56+
incorrect_chromosome_size = 5,
57+
failed_to_write_file = 6,
58+
failed_to_parse_site = 7,
59+
// clang-format on
60+
};
61+
62+
struct xcounts_err_cat : std::error_category {
63+
auto name() const noexcept -> const char * override {
64+
return "xcounts error";
65+
}
66+
auto message(const int condition) const -> std::string override {
67+
using std::string_literals::operator""s;
68+
switch (condition) {
69+
case 0: return "ok"s;
70+
case 1: return "failed to open methylome file"s;
71+
case 2: return "failed to parse xcounts header"s;
72+
case 3: return "failed to find chromosome in xcounts header"s;
73+
case 4: return "inconsistent chromosome order"s;
74+
case 5: return "incorrect chromosome size"s;
75+
case 6: return "failed to write to file"s;
76+
case 7: return "failed to parse site"s;
77+
}
78+
std::abort(); // unreacheable
79+
}
80+
};
81+
82+
template <>
83+
struct std::is_error_code_enum<xcounts_err> : public std::true_type {};
84+
85+
std::error_code
86+
make_error_code(xcounts_err e) {
87+
static auto category = xcounts_err_cat{};
88+
return std::error_code(static_cast<std::underlying_type_t<xcounts_err>>(e),
89+
category);
90+
}
91+
4892
template <typename T>
4993
static inline uint32_t
5094
fill_output_buffer(const uint32_t offset, const MSite &s, T &buf) {
@@ -137,10 +181,12 @@ main_xcounts(int argc, const char **argv) {
137181
tpool.set_io(out);
138182
}
139183

184+
std::unordered_map<string, uint32_t> chrom_order;
140185
if (!header_file.empty())
141-
write_counts_header_from_file(header_file, out);
186+
chrom_order = write_counts_header_from_file(header_file, out);
142187
else if (!genome_file.empty())
143-
write_counts_header_from_chrom_sizes(chrom_names, chrom_sizes, out);
188+
chrom_order =
189+
write_counts_header_from_chrom_sizes(chrom_names, chrom_sizes, out);
144190

145191
// use the kstring_t type to more directly use the BGZF file
146192
kstring_t line{0, 0, nullptr};
@@ -152,11 +198,14 @@ main_xcounts(int argc, const char **argv) {
152198

153199
uint32_t offset = 0;
154200
string prev_chrom;
155-
bool status_ok = true;
156201
bool found_header = (!genome_file.empty() || !header_file.empty());
157202

203+
std::error_code ec{};
204+
205+
uint32_t chrom_counter = 0;
206+
158207
MSite site;
159-
while (status_ok && bamxx::getline(in, line)) {
208+
while (ec == std::errc{} && bamxx::getline(in, line)) {
160209
if (is_counts_header_line(line.s)) {
161210
if (!genome_file.empty() || !header_file.empty())
162211
continue;
@@ -166,34 +215,53 @@ main_xcounts(int argc, const char **argv) {
166215
continue;
167216
}
168217

169-
status_ok = site.initialize(line.s, line.s + line.l);
170-
if (!status_ok || !found_header)
218+
if (!site.initialize(line.s, line.s + line.l)) {
219+
ec = xcounts_err::failed_to_parse_site;
220+
break;
221+
}
222+
if (!found_header) {
223+
ec = xcounts_err::header_failure;
171224
break;
225+
}
172226

173227
if (site.chrom != prev_chrom) {
174228
if (verbose)
175229
cerr << "processing: " << site.chrom << endl;
230+
231+
if (!chrom_order.empty()) {
232+
const auto expected_chrom_counter = chrom_order.find(site.chrom);
233+
if (expected_chrom_counter == std::cend(chrom_order)) {
234+
ec = xcounts_err::chromosome_not_found;
235+
break;
236+
}
237+
if (expected_chrom_counter->second != chrom_counter) {
238+
ec = xcounts_err::inconsistent_chromosome_order;
239+
break;
240+
}
241+
}
242+
243+
chrom_counter++;
244+
176245
prev_chrom = site.chrom;
177246
offset = 0;
178247

179248
site.chrom += '\n';
180249
const int64_t sz = size(site.chrom);
181-
status_ok = bgzf_write(out.f, site.chrom.data(), sz) == sz;
250+
if (bgzf_write(out.f, site.chrom.data(), sz) != sz)
251+
ec = xcounts_err::failed_to_write_file;
182252
}
183253
if (site.n_reads > 0) {
184254
const int64_t sz = fill_output_buffer(offset, site, buf);
185-
status_ok = bgzf_write(out.f, buf.data(), sz) == sz;
255+
if (bgzf_write(out.f, buf.data(), sz) != sz)
256+
ec = xcounts_err::failed_to_write_file;
186257
offset = site.pos;
187258
}
188259
}
189260
ks_free(&line);
190261

191-
if (!status_ok) {
192-
cerr << "failed converting " << filename << " to " << outfile << endl;
193-
return EXIT_FAILURE;
194-
}
195-
if (!found_header) {
196-
cerr << "no header provided or found" << endl;
262+
if (ec) {
263+
cerr << "failed converting " << filename << " to " << outfile << endl
264+
<< ec << endl;
197265
return EXIT_FAILURE;
198266
}
199267
}

0 commit comments

Comments
 (0)