Skip to content

Commit d66bc56

Browse files
Merge pull request #94 from smithlabcode/format-bsmap
Format bsmap
2 parents 0be65ee + 425e8ea commit d66bc56

File tree

3 files changed

+43
-28
lines changed

3 files changed

+43
-28
lines changed

src/analysis/bsrate.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -332,12 +332,10 @@ main_bsrate(int argc, const char **argv) {
332332
const int32_t the_tid = get_tid(aln);
333333

334334
// make sure all reads from same chrom are contiguous in the file
335-
if (chroms_seen.find(the_tid) != end(chroms_seen)) {
336-
cerr << the_tid << '\t' << current_tid << endl;
335+
if (chroms_seen.find(the_tid) != end(chroms_seen))
337336
throw runtime_error("chroms out of order in mapped reads file");
338-
}
337+
339338
current_tid = the_tid;
340-
if (VERBOSE) cerr << "processing " << the_tid << endl;
341339

342340
chroms_seen.insert(the_tid);
343341

@@ -346,6 +344,9 @@ main_bsrate(int argc, const char **argv) {
346344
throw runtime_error("could not find chrom: " + the_tid);
347345

348346
chrom_idx = chrom_itr->second;
347+
348+
if (VERBOSE) cerr << "processing " << names[chrom_idx] << endl;
349+
349350
use_this_chrom = seq_to_use.empty() || chrom_idx == chrom_idx_to_use;
350351
}
351352

src/common/bam_record_utils.cpp

Lines changed: 26 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -527,15 +527,12 @@ merge_by_byte(const bam1_t *a, const bam1_t *b, bam1_t *c) {
527527

528528
static inline void
529529
flip_conversion(bam1_t *aln) {
530-
if (aln->core.flag & BAM_FREVERSE)
531-
aln->core.flag = aln->core.flag & (~BAM_FREVERSE);
532-
else
533-
aln->core.flag = aln->core.flag | BAM_FREVERSE;
530+
aln->core.flag ^= BAM_FREVERSE; // ADS: flip the "reverse" bit
534531

535532
revcomp_seq_by_byte(aln);
536533

537534
// ADS: don't like *(cv + 1) below, but no HTSlib function for it?
538-
uint8_t *cv = bam_aux_get(aln, "CV");
535+
auto cv = bam_aux_get(aln, "CV");
539536
if (!cv) throw dnmt_error("bam_aux_get failed for CV");
540537
*(cv + 1) = 'T';
541538
}
@@ -812,31 +809,40 @@ standardize_format(const string &input_format, bam1_t *aln) {
812809
if (input_format == "abismal" || input_format == "walt") return;
813810

814811
if (input_format == "bsmap") {
815-
// A/T rich; get the ZS tag value
816-
auto zs_tag = bam_aux_get(aln, "ZS");
812+
// A/T rich: get the ZS tag value
813+
const auto zs_tag = bam_aux_get(aln, "ZS");
817814
if (!zs_tag) throw dnmt_error("bam_aux_get for ZS (invalid bsmap)");
818815
// ADS: test for errors on the line below
819-
const uint8_t cv = string(bam_aux2Z(zs_tag))[1] == '-' ? 'A' : 'T';
816+
const auto zs_tag_value = string(bam_aux2Z(zs_tag));
817+
if (zs_tag_value.empty()) throw dnmt_error("empty ZS tag in bsmap format");
818+
if (zs_tag_value[0] != '-' && zs_tag_value[0] != '+')
819+
throw dnmt_error("invalid ZS tag in bsmap format");
820+
const uint8_t cv = zs_tag_value[1] == '-' ? 'A' : 'T';
820821
// get the "mismatches" tag
821-
auto nm_tag = bam_aux_get(aln, "NM");
822-
if (!nm_tag) throw dnmt_error("bam_aux_get for NM (invalid bsmap)");
822+
const auto nm_tag = bam_aux_get(aln, "NM");
823+
if (!nm_tag) throw dnmt_error("invalid NM tag in bsmap format");
823824
const int64_t nm = bam_aux2i(nm_tag);
824825

825-
aln->l_data = bam_get_aux(aln) - aln->data; // del aux (no data resize)
826+
// ADS: this should delete the aux data by truncating the used
827+
// range within the bam1_t while avoiding resizing memory
828+
aln->l_data = bam_get_aux(aln) - aln->data;
826829

827830
/* add the tags we want */
828831
// "udpate" for "int" because it determines the right size; even
829-
// though we just deleted all tags, it will add it back here.
832+
// though we just deleted all tags, it will add it back here
830833
err_code = bam_aux_update_int(aln, "NM", nm);
831-
if (err_code < 0) throw dnmt_error(err_code, "bam_aux_update_int");
834+
if (err_code < 0)
835+
throw dnmt_error(err_code, "error setting NM in bsmap format");
836+
832837
// "append" for "char" because there is no corresponding update
833838
err_code = bam_aux_append(aln, "CV", 'A', 1, &cv);
834-
if (err_code < 0) throw dnmt_error(err_code, "bam_aux_append");
839+
if (err_code < 0)
840+
throw dnmt_error(err_code, "error setting conversion in bsmap format");
835841

836-
if (bam_is_rev(aln))
837-
revcomp_seq_by_byte(aln); // reverse complement if needed
842+
// reverse complement if needed
843+
if (bam_is_rev(aln)) revcomp_seq_by_byte(aln);
838844
}
839-
if (input_format == "bismark") {
845+
else if (input_format == "bismark") {
840846
// ADS: Previously we modified the read names at the first
841847
// underscore. Even if the names are still that way, it should no
842848
// longer be needed since we compare names up to a learned suffix.
@@ -864,6 +870,9 @@ standardize_format(const string &input_format, bam1_t *aln) {
864870
if (bam_is_rev(aln))
865871
revcomp_seq_by_byte(aln); // reverse complement if needed
866872
}
873+
// ADS: the condition below should be checked much earlier, ideally
874+
// before the output file is created
875+
else throw runtime_error("incorrect format specified: " + input_format);
867876

868877
// Be sure this doesn't depend on mapper! Removes the "qual" part of
869878
// the data in a bam1_t struct but does not change its uncompressed

src/utils/format-reads.cpp

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,9 @@ using std::vector;
5757

5858
using bamxx::bam_rec;
5959

60-
static size_t
60+
static int32_t
6161
merge_mates(const size_t range, bam_rec &one, bam_rec &two, bam_rec &merged) {
62-
if (!are_mates(one, two)) return -std::numeric_limits<int>::max();
62+
if (!are_mates(one, two)) return -std::numeric_limits<int32_t>::max();
6363

6464
// arithmetic easier using base 0 so subtracting 1 from pos
6565
const int one_s = get_pos(one);
@@ -309,7 +309,7 @@ swap(bam_rec &a, bam_rec &b) {
309309
static void
310310
format(const string &cmd, const size_t n_threads, const string &inputfile,
311311
const string &outfile, const bool bam_format, const string &input_format,
312-
const size_t suff_len, const size_t max_frag_len) {
312+
const size_t suff_len, const int32_t max_frag_len) {
313313
static const dnmt_error bam_write_err{"error writing bam"};
314314

315315
bamxx::bam_tpool tp(n_threads);
@@ -345,8 +345,7 @@ format(const string &cmd, const size_t n_threads, const string &inputfile,
345345
if (same_name(prev_aln, aln, suff_len)) {
346346
// below: essentially check for dovetail
347347
if (!bam_is_rev(aln)) swap(prev_aln, aln);
348-
const size_t frag_len =
349-
merge_mates(max_frag_len, prev_aln, aln, merged);
348+
const auto frag_len = merge_mates(max_frag_len, prev_aln, aln, merged);
350349
if (frag_len > 0 && frag_len < max_frag_len) {
351350
if (is_a_rich(merged)) flip_conversion(merged);
352351
if (!out.write(hdr, merged)) throw bam_write_err;
@@ -385,7 +384,7 @@ main_format(int argc, const char **argv) {
385384

386385
string input_format;
387386
string outfile;
388-
int max_frag_len = std::numeric_limits<int>::max();
387+
int32_t max_frag_len = 10000;
389388
size_t suff_len = 0;
390389
bool single_end = false;
391390
bool VERBOSE = false;
@@ -409,7 +408,7 @@ main_format(int argc, const char **argv) {
409408
opt_parse.add_opt("single-end", '\0',
410409
"assume single-end [do not use with -suff]", false,
411410
single_end);
412-
opt_parse.add_opt("max-frag", 'L', "maximum allowed insert size", false,
411+
opt_parse.add_opt("max-frag", 'L', "max allowed insert size", false,
413412
max_frag_len);
414413
opt_parse.add_opt("check", 'c',
415414
"check this many reads to validate read name suffix",
@@ -443,6 +442,12 @@ main_format(int argc, const char **argv) {
443442
<< opt_parse.about_message() << endl;
444443
return EXIT_FAILURE;
445444
}
445+
if (max_frag_len <= 0) {
446+
cerr << "specified maximum fragment size: " << max_frag_len << endl
447+
<< opt_parse.help_message() << endl
448+
<< opt_parse.about_message() << endl;
449+
return EXIT_FAILURE;
450+
}
446451
const string infile(leftover_args.front());
447452
if (leftover_args.size() == 2 && !use_stdout)
448453
outfile = leftover_args.back();

0 commit comments

Comments
 (0)