Skip to content

Commit c9cfce4

Browse files
committed
sort input reads
1 parent 9e79f3d commit c9cfce4

File tree

8 files changed

+160
-89
lines changed

8 files changed

+160
-89
lines changed

Makefile

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,7 @@ endif
1414
EXTRA_FLAGS = -Wall -Wno-misleading-indentation -Wno-unused-function #-Wno-unused-variable -Wno-alloc-size-larger-than
1515

1616
# Define the version number
17-
VERSION=0.0.6
18-
# LONGCALLD_VERSION =0.0.6
17+
VERSION=0.0.7
1918
# Get the Git commit hash
2019
GIT_COMMIT := $(shell git rev-parse --short HEAD 2> /dev/null)
2120
ifneq ($(GIT_COMMIT),)

README.md

Lines changed: 23 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
<!-- # LongcallD: local-haplotagging-based small and structural variant calling -->
22

3-
<!-- [![Latest Release](https://img.shields.io/github/release/yangao07/longcallD.svg?label=Release)](https://github.com/yangao07/longcallD/releases/latest) -->
4-
[![Latest Release](https://img.shields.io/github/v/tag/yangao07/longcalld?label=Release)](https://github.com/yangao07/longcallD/releases/latest)
3+
[![Latest Release](https://img.shields.io/github/release/yangao07/longcallD.svg?label=Release)](https://github.com/yangao07/longcallD/releases/latest)
4+
<!-- [![Latest Release](https://img.shields.io/github/v/tag/yangao07/longcalld?label=Release)](https://github.com/yangao07/longcallD/releases/latest) -->
55
[![Github All Releases](https://img.shields.io/github/downloads/yangao07/longcallD/total.svg?label=Download)](https://github.com/yangao07/longcallD/releases)
66
[![Bioconda Version](https://img.shields.io/conda/vn/bioconda/longcallD.svg?style=flag&label=Bioconda)](https://anaconda.org/bioconda/longcalld)
77
[![Bioconda Install](https://img.shields.io/conda/dn/bioconda/longcallD.svg?style=flag&label=Conda-install)](https://anaconda.org/bioconda/longcalld)
@@ -11,23 +11,24 @@
1111
<!-- [![Published in Bioinformatics](https://img.shields.io/badge/Published%20in-Bioinformatics-blue.svg)](https://dx.doi.org/10.1093/bioinformatics/btaa963) -->
1212
<!-- [![GitHub Issues](https://img.shields.io/github/issues/yangao07/longcallD.svg?label=Issues)](https://github.com/yangao07/longcallD/issues) -->
1313

14-
## Updates (release v0.0.6)
14+
## Updates (release v0.0.7)
1515

16-
* Fix corrupted VCF output in v0.0.5
17-
* Fix missing MEI header in VCF output
18-
* Improved run time and memory usage (especially when mosaic variant calling enabled)
19-
* Add `--input-is-list` and `-X` to support multiple input BAM/CRAM files of the same sample for variant calling
16+
* sort reads internally before processing to fix potential inconsistency when multiple input BAM/CRAM files are provided
17+
<!-- * Fix corrupted VCF output in v0.0.5 -->
18+
<!-- * Fix missing MEI header in VCF output -->
19+
<!-- * Improved run time and memory usage (especially when mosaic variant calling enabled) -->
20+
<!-- * Add `--input-is-list` and `-X` to support multiple input BAM/CRAM files of the same sample for variant calling -->
2021

2122

2223
## Getting Started
2324
```sh
2425
# Download pre-built executables and test data (recommended)
2526
# Linux-x64
26-
wget https://github.com/yangao07/longcallD/releases/download/v0.0.6/longcallD-v0.0.6_x64-linux.tar.gz
27-
tar -zxvf longcallD-v0.0.6_x64-linux.tar.gz && cd longcallD-v0.0.6_x64-linux
27+
wget https://github.com/yangao07/longcallD/releases/download/v0.0.7/longcallD-v0.0.7_x64-linux.tar.gz
28+
tar -zxvf longcallD-v0.0.7_x64-linux.tar.gz && cd longcallD-v0.0.7_x64-linux
2829
# MacOS-arm64
29-
wget https://github.com/yangao07/longcallD/releases/download/v0.0.6/longcallD-v0.0.6_arm64-macos.tar.gz
30-
tar -zxvf longcallD-v0.0.6_arm64-macos.tar.gz && cd longcallD-v0.0.6_arm64-macos
30+
wget https://github.com/yangao07/longcallD/releases/download/v0.0.7/longcallD-v0.0.7_arm64-macos.tar.gz
31+
tar -zxvf longcallD-v0.0.7_arm64-macos.tar.gz && cd longcallD-v0.0.7_arm64-macos
3132

3233
# PacBio HiFi reads
3334
./longcallD call ./test_data/chr11_2M.fa ./test_data/HG002_chr11_hifi_test.bam --hifi > HG002_hifi_test.vcf
@@ -39,7 +40,7 @@ man ./longcallD.1
3940
``` -->
4041

4142
## Table of Contents
42-
- [Updates (release v0.0.6)](#updates-release-v006)
43+
- [Updates (release v0.0.7)](#updates-release-v007)
4344
- [Getting Started](#getting-started)
4445
- [Table of Contents](#table-of-contents)
4546
- [Introduction](#introduction)
@@ -74,13 +75,13 @@ Providing the annotation sequence of common mobile elements, i.e., Alu/L1/SVA, u
7475
### Pre-built executables (recommended)
7576
**Linux-x64**
7677
```
77-
wget https://github.com/yangao07/longcallD/releases/download/v0.0.6/longcallD-v0.0.6_x64-linux.tar.gz
78-
tar -zxvf longcallD-v0.0.6_x64-linux.tar.gz
78+
wget https://github.com/yangao07/longcallD/releases/download/v0.0.7/longcallD-v0.0.7_x64-linux.tar.gz
79+
tar -zxvf longcallD-v0.0.7_x64-linux.tar.gz
7980
```
8081
**MacOS-arm64**
8182
```
82-
wget https://github.com/yangao07/longcallD/releases/download/v0.0.6/longcallD-v0.0.6_arm64-macos.tar.gz
83-
tar -zxvf longcallD-v0.0.6_arm64-macos.tar.gz
83+
wget https://github.com/yangao07/longcallD/releases/download/v0.0.7/longcallD-v0.0.7_arm64-macos.tar.gz
84+
tar -zxvf longcallD-v0.0.7_arm64-macos.tar.gz
8485
```
8586

8687
**Linux-arm64/macOS-x64**
@@ -96,9 +97,9 @@ conda install -c bioconda longcalld
9697
### Build from source
9798
To compile longcallD from source, ensure you have **GCC/clang(9.0+)** and **zlib/libbz2/liblzma/libcurl** (for htslib) installed.
9899
```
99-
wget https://github.com/yangao07/longcallD/releases/download/v0.0.6/longcallD-v0.0.6.tar.gz
100-
tar -zxvf longcallD-v0.0.6.tar.gz
101-
cd longcallD-v0.0.6; make
100+
wget https://github.com/yangao07/longcallD/releases/download/v0.0.7/longcallD-v0.0.7.tar.gz
101+
tar -zxvf longcallD-v0.0.7.tar.gz
102+
cd longcallD-v0.0.7; make
102103
```
103104

104105
## Usage
@@ -142,6 +143,9 @@ longcallD call -t16 ref.fa hifi.bam --autosome > hifi_autosome.vcf
142143
```
143144

144145
### Output phased (& refined) long-read BAM/CRAM
146+
LongcallD performs read phasing during variant calling and can outputs phased long reads in BAM/CRAM.
147+
148+
With `--refine-aln`, it can further output refined read alignment based on multiple sequence alignment within each haplotype, which is especially useful for low-complexity regions like homopolymers and tandem repeats.
145149
```
146150
longcallD call -t16 ref.fa hifi.bam --hifi -b hifi_phased.bam > hifi.vcf # output phased HiFi reads (BAM tag: HP & PS)
147151
longcallD call -t16 ref.fa ont.bam --ont --refine-aln -b ont_phased_refined.bam > ont.vcf # output phased & refined ONT reads (BAM tag: HP & PS)

src/align.c

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -708,7 +708,7 @@ int collect_partial_aln_beg_end(const call_var_opt_t *opt, int sampling_reads,
708708
// int edit_dis = edlib_edit_distance(target, tlen, query, qlen);
709709
// if (edit_dis > MIN_OF_TWO(tlen, qlen) * 0.05) {
710710
int x_gaps = edlib_xgaps(target, tlen, query, qlen);
711-
if (x_gaps > MIN_OF_TWO(tlen, qlen) * 0.05) {
711+
if (x_gaps > MIN_OF_TWO(tlen, qlen) * 0.10) {
712712
// if (LONGCALLD_VERBOSE >= 3) fprintf(stderr, "Skipped in POA due to high edit distance (%d vs %d): %d\n", tlen, qlen, edit_dis);
713713
if (LONGCALLD_VERBOSE >= 3) fprintf(stderr, "Skipped in POA due to high # of X-gaps (%d vs %d): %d\n", tlen, qlen, x_gaps);
714714
return 0;
@@ -755,8 +755,8 @@ int abpoa_partial_aln_msa_cons(const call_var_opt_t *opt, abpoa_t *ab, int sampl
755755
ab = abpoa_init(); needs_free_ab = 1;
756756
}
757757
abpoa_para_t *abpt = abpoa_init_para();
758-
if (opt->out_somatic) abpt->wf = 0.01; // more accurate for somatic variant calling
759-
else abpt->wf = 0.001; // limit memory usage for long sequences
758+
// if (opt->out_somatic) abpt->wf = 0.01; // more accurate for somatic variant calling
759+
// else abpt->wf = 0.001; // limit memory usage for long sequences
760760
abpt->cons_algrm = ABPOA_MF;
761761
abpt->sub_aln = 1;
762762
abpt->inc_path_score = 1;
@@ -861,8 +861,8 @@ int abpoa_partial_aln_msa_cons(const call_var_opt_t *opt, abpoa_t *ab, int sampl
861861
int *clu_n_seqs, int **clu_read_ids, int *msa_seq_len, uint8_t ***msa_seq) {
862862
abpoa_t *ab = abpoa_init();
863863
abpoa_para_t *abpt = abpoa_init_para();
864-
if (opt->out_somatic) abpt->wf = 0.01; // XXX for more accurate somatic variant calling
865-
else abpt->wf = 0.001; // limit memory usage for long sequences
864+
// if (opt->out_somatic) abpt->wf = 0.01; // XXX for more accurate somatic variant calling
865+
// else abpt->wf = 0.001; // limit memory usage for long sequences
866866
abpt->inc_path_score = 1;
867867
abpt->out_msa = 0; abpt->out_cons = 1;
868868
if (msa_seq != NULL && msa_seq_len != NULL) abpt->out_msa = 1; else abpt->out_msa = 0;

src/assign_hap.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -320,7 +320,8 @@ int check_agree_haps(int read_i, int hap, cand_var_t *vars, int var1, int var2,
320320
}
321321

322322
void update_read_phase_set(bam_chunk_t *chunk, int *var_is_valid, read_var_profile_t *p, cand_var_t *cand_vars) {
323-
for (int read_i = 0; read_i < chunk->n_reads; ++read_i) {
323+
for (int i = 0; i < chunk->n_reads; ++i) {
324+
int read_i = chunk->ordered_read_ids[i];
324325
if (chunk->is_skipped[read_i]) continue;
325326
if (p[read_i].start_var_idx == -1) continue;
326327
hts_pos_t phase_set = -1;
@@ -433,7 +434,8 @@ int iter_update_var_hap_to_cons_alle(const call_var_opt_t *opt, bam_chunk_t *chu
433434
// var-wise loop, update phase set XXX
434435
// update hap_to_alle_profile + hap_to_cons_alle
435436
var_init_hap_to_alle_profile(cand_vars, var_idx, n_cand_vars);
436-
for (int read_i = 0; read_i < chunk->n_reads; ++read_i) {
437+
for (int i = 0; i < chunk->n_reads; ++i) {
438+
int read_i = chunk->ordered_read_ids[i];
437439
if (chunk->is_skipped[read_i]) continue;
438440
int hap;
439441
hap = init_assign_read_hap_based_on_cons_alle(chunk, read_i, cand_vars, p, var_i_to_cate, target_var_cate);

src/bam_utils.c

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1322,7 +1322,7 @@ int collect_digar_from_ref_seq(bam_chunk_t *chunk, int read_i, const struct call
13221322

13231323
int bam_chunk_init0(bam_chunk_t *chunk, int n_reads, int n_bam) {
13241324
// input
1325-
chunk->n_reads = 0; chunk->m_reads = n_reads;
1325+
chunk->n_reads = 0; chunk->m_reads = n_reads; chunk->ordered_read_ids = (int*)malloc(n_reads * sizeof(int));
13261326
chunk->n_bam = n_bam;
13271327
chunk->n_up_ovlp_reads = (int*)calloc(n_bam, sizeof(int));
13281328
chunk->up_ovlp_read_i = (int**)malloc(n_bam * sizeof(int*));
@@ -1364,6 +1364,7 @@ int bam_chunk_init0(bam_chunk_t *chunk, int n_reads, int n_bam) {
13641364
int bam_chunk_realloc(bam_chunk_t *chunk) {
13651365
int m_reads = chunk->m_reads * 2;
13661366
chunk->reads = (bam1_t**)realloc(chunk->reads, m_reads * sizeof(bam1_t*));
1367+
chunk->ordered_read_ids = (int*)realloc(chunk->ordered_read_ids, m_reads * sizeof(int));
13671368
for (int i = 0; i < chunk->n_bam; ++i) {
13681369
chunk->up_ovlp_read_i[i] = (int*)realloc(chunk->up_ovlp_read_i[i], m_reads * sizeof(int));
13691370
chunk->down_ovlp_read_i[i] = (int*)realloc(chunk->down_ovlp_read_i[i], m_reads * sizeof(int));
@@ -1408,6 +1409,7 @@ void bam_chunk_free(bam_chunk_t *chunk) {
14081409
cr_destroy(chunk->digars[i].noisy_regs);
14091410
}
14101411
}
1412+
free(chunk->ordered_read_ids);
14111413
if (LONGCALLD_VERBOSE >= 2) {
14121414
for (int i = 0; i < chunk->m_reads; i++) {
14131415
bam_destroy1(chunk->reads[i]);
@@ -1470,6 +1472,7 @@ void bam_chunk_post_free(bam_chunk_t *chunk) {
14701472
}
14711473
free(chunk->reads);
14721474
}
1475+
free(chunk->ordered_read_ids);
14731476
}
14741477

14751478
// free variables that will not be used in stitch & make variants
@@ -1560,6 +1563,48 @@ static int is_ovlp_with_next_region(const struct call_var_pl_t *pl, bam_chunk_t
15601563
}
15611564
}
15621565

1566+
typedef struct {
1567+
int read_i;
1568+
hts_pos_t pos, end;
1569+
int NM;
1570+
char *qname;
1571+
} bam_read_sort_t;
1572+
1573+
int comp_bam_read_sort(const void *a, const void *b) {
1574+
const bam_read_sort_t *ra = (const bam_read_sort_t *)a;
1575+
const bam_read_sort_t *rb = (const bam_read_sort_t *)b;
1576+
if (ra->pos != rb->pos) return (ra->pos < rb->pos) ? -1 : 1;
1577+
if (ra->end != rb->end) return (ra->end < rb->end) ? 1 : -1;
1578+
if (ra->NM != rb->NM) return (ra->NM < rb->NM) ? -1 : 1;
1579+
return strcmp(ra->qname, rb->qname);
1580+
}
1581+
1582+
int bam_get_NM(bam1_t *read) {
1583+
uint8_t *nm_ptr = bam_aux_get(read, "NM");
1584+
if (nm_ptr != NULL) {
1585+
return bam_aux2i(nm_ptr);
1586+
} else {
1587+
return 0;
1588+
}
1589+
}
1590+
1591+
void sort_chunk_reads(bam_chunk_t *chunk) {
1592+
bam_read_sort_t *read_sorts = (bam_read_sort_t*)malloc(chunk->n_reads * sizeof(bam_read_sort_t));
1593+
for (int i = 0; i < chunk->n_reads; ++i) {
1594+
read_sorts[i].read_i = i;
1595+
read_sorts[i].pos = chunk->reads[i]->core.pos;
1596+
read_sorts[i].end = bam_endpos(chunk->reads[i]);
1597+
read_sorts[i].NM = bam_get_NM(chunk->reads[i]);
1598+
read_sorts[i].qname = bam_get_qname(chunk->reads[i]);
1599+
}
1600+
// sort by pos, end, NM, qname
1601+
qsort(read_sorts, chunk->n_reads, sizeof(bam_read_sort_t), comp_bam_read_sort);
1602+
for (int i = 0; i < chunk->n_reads; ++i) {
1603+
chunk->ordered_read_ids[i] = read_sorts[i].read_i;
1604+
}
1605+
free(read_sorts);
1606+
}
1607+
15631608
// load ref_seq/read in reg_chunks[reg_chunk_i]->tid/beg/end[reg_i] to chunks
15641609
int collect_ref_seq_bam_main(const struct call_var_pl_t *pl, struct call_var_io_aux_t *io_aux, int reg_chunk_i, int reg_i, bam_chunk_t *chunk) {
15651610
assert(reg_chunk_i < pl->n_reg_chunks); assert(reg_i < pl->reg_chunks[reg_chunk_i].n_regions);
@@ -1608,6 +1653,7 @@ int collect_ref_seq_bam_main(const struct call_var_pl_t *pl, struct call_var_io_
16081653
if (LONGCALLD_VERBOSE >= 2) {
16091654
fprintf(stderr, "CHUNK: tname: %s, tid: %d, beg: %" PRId64 ", end: %" PRId64 ", n_reads: %d\n", chunk->tname, chunk->tid, chunk->reg_beg, chunk->reg_end, chunk->n_reads);
16101655
}
1656+
sort_chunk_reads(chunk);
16111657
return chunk->n_reads;
16121658
}
16131659

@@ -1928,7 +1974,10 @@ int write_read_to_bam(bam_chunk_t *chunk, const struct call_var_opt_t *opt, cons
19281974
int read_i = 0, start_to_output = 0;
19291975
while (sam_itr_next(in_bam, iter, read) >= 0) {
19301976
if (read->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FSUPPLEMENTARY) || read->core.qual < min_mapq) {
1931-
if (start_to_output == 1) write_unprocessed_read_to_bam(chunk, header, opt->out_aln_fp, read);
1977+
if (start_to_output == 1) {
1978+
// write_unprocessed_read_to_bam(chunk, header, opt->out_aln_fp, read);
1979+
n_out_reads++;
1980+
}
19321981
continue;
19331982
}
19341983
// check if read is overlapping with previous region

src/bam_utils.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ typedef struct bam_chunk_t {
6161
// ref_beg <= reg_beg < reg_end <= ref_end, ref_seq may include additional flanking regions (50kb)
6262
// should always be 1 region XXX
6363
cgranges_t *low_comp_cr; // tandem_rep_cr;
64-
int n_reads, m_reads;
64+
int n_reads, m_reads; int *ordered_read_ids; // size: m_reads, for multiple input bams, merge sort reads by pos, end, NM, name
6565
int n_bam; // for each input bam, record the number of region-overlapping reads
6666
int *n_up_ovlp_reads, *n_down_ovlp_reads; // number of reads overlapping with up/downstream bam chunk
6767
int **up_ovlp_read_i, **down_ovlp_read_i;

0 commit comments

Comments
 (0)