Skip to content

Commit 1d4aacd

Browse files
committed
v0.0.8
1 parent a0a26aa commit 1d4aacd

File tree

11 files changed

+207
-105
lines changed

11 files changed

+207
-105
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ endif
1414
EXTRA_FLAGS = -Wall -Wno-misleading-indentation -Wno-unused-function #-Wno-unused-variable -Wno-alloc-size-larger-than
1515

1616
# Define the version number
17-
VERSION=0.0.7
17+
VERSION=0.0.8
1818
# Get the Git commit hash
1919
GIT_COMMIT := $(shell git rev-parse --short HEAD 2> /dev/null)
2020
ifneq ($(GIT_COMMIT),)

README.md

Lines changed: 27 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,13 @@
1010
<!-- [![Published in Bioinformatics](https://img.shields.io/badge/Published%20in-Bioinformatics-blue.svg)](https://dx.doi.org/10.1093/bioinformatics/btaa963) -->
1111
<!-- [![GitHub Issues](https://img.shields.io/github/issues/yangao07/longcallD.svg?label=Issues)](https://github.com/yangao07/longcallD/issues) -->
1212

13-
## Updates (release v0.0.7)
13+
## Updates (release v0.0.8)
1414

15-
* sort reads internally before processing to fix potential inconsistency when multiple input BAM/CRAM files are provided
16-
<!-- * Fix corrupted VCF output in v0.0.5 -->
17-
<!-- * Fix missing MEI header in VCF output -->
15+
* Fix variant representation issues for overlapping variants
16+
* Revoke ultraLowMem mode of WFA to avoid potential variant representation issues
17+
* Fix output BAM errors
18+
* Output BAM includes all reads (even those not used for variant calling)
19+
* Fix a bug with --refine-aln
1820
<!-- * Improved run time and memory usage (especially when mosaic variant calling enabled) -->
1921
<!-- * Add `--input-is-list` and `-X` to support multiple input BAM/CRAM files of the same sample for variant calling -->
2022

@@ -23,11 +25,11 @@
2325
```sh
2426
# Download pre-built executables and test data (recommended)
2527
# Linux-x64
26-
wget https://github.com/yangao07/longcallD/releases/download/v0.0.7/longcallD-v0.0.7_x64-linux.tar.gz
27-
tar -zxvf longcallD-v0.0.7_x64-linux.tar.gz && cd longcallD-v0.0.7_x64-linux
28+
wget https://github.com/yangao07/longcallD/releases/download/v0.0.8/longcallD-v0.0.8_x64-linux.tar.gz
29+
tar -zxvf longcallD-v0.0.8_x64-linux.tar.gz && cd longcallD-v0.0.8_x64-linux
2830
# MacOS-arm64
29-
wget https://github.com/yangao07/longcallD/releases/download/v0.0.7/longcallD-v0.0.7_arm64-macos.tar.gz
30-
tar -zxvf longcallD-v0.0.7_arm64-macos.tar.gz && cd longcallD-v0.0.7_arm64-macos
31+
wget https://github.com/yangao07/longcallD/releases/download/v0.0.8/longcallD-v0.0.8_arm64-macos.tar.gz
32+
tar -zxvf longcallD-v0.0.8_arm64-macos.tar.gz && cd longcallD-v0.0.8_arm64-macos
3133

3234
# PacBio HiFi reads
3335
./longcallD call ./test_data/chr11_2M.fa ./test_data/HG002_chr11_hifi_test.bam --hifi > HG002_hifi_test.vcf
@@ -39,7 +41,7 @@ man ./longcallD.1
3941
``` -->
4042

4143
## Table of Contents
42-
- [Updates (release v0.0.7)](#updates-release-v007)
44+
- [Updates (release v0.0.8)](#updates-release-v008)
4345
- [Getting Started](#getting-started)
4446
- [Table of Contents](#table-of-contents)
4547
- [Introduction](#introduction)
@@ -74,13 +76,13 @@ Providing the annotation sequence of common mobile elements, i.e., Alu/L1/SVA, u
7476
### Pre-built executables (recommended)
7577
**Linux-x64**
7678
```
77-
wget https://github.com/yangao07/longcallD/releases/download/v0.0.7/longcallD-v0.0.7_x64-linux.tar.gz
78-
tar -zxvf longcallD-v0.0.7_x64-linux.tar.gz
79+
wget https://github.com/yangao07/longcallD/releases/download/v0.0.8/longcallD-v0.0.8_x64-linux.tar.gz
80+
tar -zxvf longcallD-v0.0.8_x64-linux.tar.gz
7981
```
8082
**MacOS-arm64**
8183
```
82-
wget https://github.com/yangao07/longcallD/releases/download/v0.0.7/longcallD-v0.0.7_arm64-macos.tar.gz
83-
tar -zxvf longcallD-v0.0.7_arm64-macos.tar.gz
84+
wget https://github.com/yangao07/longcallD/releases/download/v0.0.8/longcallD-v0.0.8_arm64-macos.tar.gz
85+
tar -zxvf longcallD-v0.0.8_arm64-macos.tar.gz
8486
```
8587

8688
**Linux-arm64/macOS-x64**
@@ -96,9 +98,9 @@ conda install -c bioconda longcalld
9698
### Build from source
9799
To compile longcallD from source, ensure you have **GCC/clang(9.0+)** and **zlib/libbz2/liblzma/libcurl** (for htslib) installed.
98100
```
99-
wget https://github.com/yangao07/longcallD/releases/download/v0.0.7/longcallD-v0.0.7.tar.gz
100-
tar -zxvf longcallD-v0.0.7.tar.gz
101-
cd longcallD-v0.0.7; make
101+
wget https://github.com/yangao07/longcallD/releases/download/v0.0.8/longcallD-v0.0.8.tar.gz
102+
tar -zxvf longcallD-v0.0.8.tar.gz
103+
cd longcallD-v0.0.8; make
102104
```
103105

104106
## Usage
@@ -142,9 +144,17 @@ longcallD call -t16 ref.fa hifi.bam --autosome > hifi_autosome.vcf
142144
```
143145

144146
### Output phased (& refined) long-read BAM/CRAM
145-
LongcallD performs read phasing during variant calling and can output phased long reads in BAM/CRAM.
147+
LongcallD performs read phasing during variant calling and can output phased long reads in BAM/CRAM.
148+
Each phased read will have two additional tags: `HP` (haplotype) and `PS` (phase set).
146149

147150
With `--refine-aln`, it can further output refined read alignment based on multiple sequence alignment within each haplotype, which is especially useful for low-complexity regions like homopolymers and tandem repeats.
151+
152+
For example:
153+
154+
![refine](https://github.com/yangao07/longcallD/blob/main/refined_example.png)
155+
156+
157+
Note that in refined alignment file, only phased reads, i.e., with HP/PS tags, are refined, other reads remain unchanged.
148158
```
149159
longcallD call -t16 ref.fa hifi.bam --hifi -b hifi_phased.bam > hifi.vcf # output phased HiFi reads (BAM tag: HP & PS)
150160
longcallD call -t16 ref.fa ont.bam --ont --refine-aln -b ont_phased_refined.bam > ont.vcf # output phased & refined ONT reads (BAM tag: HP & PS)

refined_example.png

582 KB
Loading

src/align.c

Lines changed: 31 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -375,8 +375,9 @@ int wfa_end2end_aln(uint8_t *pattern, int plen, uint8_t *text, int tlen,
375375
int gap_aln, int b, int q, int e, int q2, int e2, int heuristic, int affine_gap, // heuristic: 0: no, 1: default, 2: zdrop
376376
uint32_t **cigar_buf, int *cigar_length, uint8_t **pattern_alg, uint8_t **text_alg, int *alg_length) {
377377
// double realtime0 = realtime();
378+
// fprintf(stderr, "WFA-end2end %d vs %d\n", plen, tlen);
378379
wavefront_aligner_attr_t attributes = wavefront_aligner_attr_default;
379-
if (heuristic != LONGCALLD_WFA_ZDROP) attributes.memory_mode = wavefront_memory_ultralow;
380+
// if (heuristic != LONGCALLD_WFA_ZDROP) attributes.memory_mode = wavefront_memory_ultralow;
380381
if (affine_gap == LONGCALLD_WFA_AFFINE_2P) {
381382
attributes.distance_metric = gap_affine_2p;
382383
attributes.affine2p_penalties.match = 0; // -a;
@@ -589,7 +590,7 @@ int wfa_collect_aln_str(const call_var_opt_t *opt, uint8_t *target, int tlen, ui
589590
}
590591
// partial alignment
591592
wfa_end2end_aln(target+_t_start, _tlen, query+_q_start, _qlen,
592-
gap_aln, b, q, e, q2, e2, LONGCALLD_WFA_ZDROP, LONGCALLD_WFA_AFFINE_1P,
593+
gap_aln, b, q, e, q2, e2, LONGCALLD_WFA_ZDROP, LONGCALLD_WFA_AFFINE_2P,
593594
NULL, NULL, &aln_str->target_aln, &aln_str->query_aln, &aln_str->aln_len);
594595
// do not trim for end-gaps
595596
wfa_trim_aln_str(full_cover, aln_str);
@@ -687,7 +688,18 @@ int cal_wfa_partial_aln_beg_end(int ext_direction, const call_var_opt_t *opt, ui
687688
gap_aln = (gap_aln == LONGCALLD_GAP_RIGHT_ALN) ? LONGCALLD_GAP_LEFT_ALN : LONGCALLD_GAP_RIGHT_ALN;
688689
}
689690
// fprintf(stderr, "WFA partial aln: %d vs %d (Raw: %d vs %d)\n", tlen, qlen, _tlen, _qlen);
690-
wfa_end2end_aln(target, tlen, query, qlen, gap_aln, b, q, e, q2, e2, LONGCALLD_WFA_ZDROP, LONGCALLD_WFA_AFFINE_1P, &cigar_buf, &cigar_len, NULL, NULL, NULL);
691+
692+
int min_len = MIN_OF_TWO(tlen, qlen);
693+
if (ext_direction == LONGCALLD_EXT_ALN_LEFT_TO_RIGHT) {
694+
int x_gaps = edlib_xgaps(target, min_len, query, min_len);
695+
if (x_gaps > min_len * 0.10) { return 0;
696+
}
697+
} else {
698+
int x_gaps = edlib_xgaps(target + tlen - min_len, min_len, query + qlen - min_len, min_len);
699+
if (x_gaps > min_len * 0.10) { return 0;
700+
}
701+
}
702+
wfa_end2end_aln(target, tlen, query, qlen, gap_aln, b, q, e, q2, e2, LONGCALLD_WFA_NO_HEURISTIC, LONGCALLD_WFA_AFFINE_2P, &cigar_buf, &cigar_len, NULL, NULL, NULL);
691703
if (cigar_len == 0) ret = 0;
692704
else collect_aln_beg_end(cigar_buf, cigar_len, ext_direction, _tlen, target_beg, target_end, _qlen, query_beg, query_end);
693705
if (cigar_buf != NULL) free(cigar_buf);
@@ -770,28 +782,29 @@ int abpoa_partial_aln_msa_cons(const call_var_opt_t *opt, abpoa_t *ab, int sampl
770782
if (LONGCALLD_VERBOSE >= 2) abpt->out_msa = 1;
771783
abpoa_post_set_para(abpt);
772784
ab->abs->n_seq = n_reads;
773-
int *exc_begs = (int*)malloc(n_reads * sizeof(int)), *exc_ends = (int*)malloc(n_reads * sizeof(int));
774-
int *seq_beg_cuts = (int*)malloc(n_reads * sizeof(int)), *seq_end_cuts = (int*)malloc(n_reads * sizeof(int));
775-
for (int i = 0; i < n_reads; ++i) {
776-
exc_begs[i] = 0; exc_ends[i] = 1, seq_beg_cuts[i] = 0, seq_end_cuts[i] = 0;
777-
}
778785
for (int i = 0; i < n_reads; ++i) {
779786
if (LONGCALLD_VERBOSE >= 3) {
780787
fprintf(stderr, ">%s %d %d\n", names[i], read_lens[i], read_full_cover[i]);
781788
for (int j = 0; j < read_lens[i]; ++j) {
782789
fprintf(stderr, "%c", "ACGTN"[read_seqs[i][j]]);
783790
} fprintf(stderr, "\n");
784791
}
785-
if (exc_begs[i] < 0 || exc_ends[i] < 0) continue;
786792
abpoa_res_t res; res.graph_cigar = 0, res.n_cigar = 0;
787-
if (LONGCALLD_VERBOSE >= 3) fprintf(stderr, "ExcBeg: %d, ExcEnd: %d, SeqBegCut: %d, SeqEndCut: %d, FullCover: %d\n", exc_begs[i], exc_ends[i], seq_beg_cuts[i], seq_end_cuts[i], read_full_cover[i]);
788-
abpoa_align_sequence_to_subgraph(ab, abpt, exc_begs[i], exc_ends[i], read_seqs[i]+seq_beg_cuts[i], read_lens[i]-seq_beg_cuts[i]-seq_end_cuts[i], &res);
789-
abpoa_add_subgraph_alignment(ab, abpt, exc_begs[i], exc_ends[i], read_seqs[i]+seq_beg_cuts[i], NULL, read_lens[i]-seq_beg_cuts[i]-seq_end_cuts[i], NULL, res, i, n_reads, 0);
790-
// collect exc_beg/exc_end for all reads with i>1
791-
if (i == 0) collect_exc_beg_end(opt, ab, abpt, sampling_reads, n_reads, names, read_seqs, read_lens, read_full_cover, exc_begs, exc_ends, seq_beg_cuts, seq_end_cuts);
793+
int exc_beg = 0, exc_end = 1, seq_beg_cut = 0, seq_end_cut = 0;
794+
if (i != 0) {
795+
int ref_beg, ref_end, read_beg, read_end, beg_id, end_id;
796+
if (collect_partial_aln_beg_end(opt, sampling_reads, read_seqs[0], read_lens[0], read_full_cover[0], read_seqs[i], read_lens[i], read_full_cover[i], &ref_beg, &ref_end, &read_beg, &read_end) == 0) continue;
797+
beg_id = ref_beg+1, end_id = ref_end+1;
798+
seq_beg_cut = read_beg - 1, seq_end_cut = read_lens[i] - read_end;
799+
abpoa_subgraph_nodes(ab, abpt, beg_id, end_id, &exc_beg, &exc_end);
800+
// fprintf(stderr, "beg_id: %d, end_id: %d, read_beg: %d, read_end: %d, exc_beg: %d, exc_end: %d\n", beg_id, end_id, read_beg, read_end, exc_beg, exc_end);
801+
}
802+
if (LONGCALLD_VERBOSE >= 3) fprintf(stderr, "ExcBeg: %d, ExcEnd: %d, SeqBegCut: %d, SeqEndCut: %d, FullCover: %d\n", exc_beg, exc_end, seq_beg_cut, seq_end_cut, read_full_cover[i]);
803+
abpoa_align_sequence_to_subgraph(ab, abpt, exc_beg, exc_end, read_seqs[i]+seq_beg_cut, read_lens[i]-seq_beg_cut-seq_end_cut, &res);
804+
// abpoa_add_subgraph_alignment(ab, abpt, exc_begs[i], exc_ends[i], read_seqs[i]+seq_beg_cuts[i], NULL, read_lens[i]-seq_beg_cuts[i]-seq_end_cuts[i], NULL, res, i, n_reads, 1);
805+
abpoa_add_subgraph_alignment(ab, abpt, exc_beg, exc_end, read_seqs[i]+seq_beg_cut, NULL, read_lens[i]-seq_beg_cut-seq_end_cut, NULL, res, i, n_reads, 0);
792806
if (res.n_cigar) free(res.graph_cigar);
793807
}
794-
free(exc_begs); free(exc_ends); free(seq_beg_cuts); free(seq_end_cuts);
795808
if (LONGCALLD_VERBOSE >= 2) abpoa_output(ab, abpt, stderr);
796809
else abpoa_output(ab, abpt, NULL);
797810
abpoa_cons_t *abc = ab->abc;
@@ -863,6 +876,7 @@ int abpoa_partial_aln_msa_cons(const call_var_opt_t *opt, abpoa_t *ab, int sampl
863876
abpoa_para_t *abpt = abpoa_init_para();
864877
// if (opt->out_somatic) abpt->wf = 0.01; // XXX for more accurate somatic variant calling
865878
// else abpt->wf = 0.001; // limit memory usage for long sequences
879+
abpt->wb = -1;
866880
abpt->inc_path_score = 1;
867881
abpt->out_msa = 0; abpt->out_cons = 1;
868882
if (msa_seq != NULL && msa_seq_len != NULL) abpt->out_msa = 1; else abpt->out_msa = 0;
@@ -1176,7 +1190,7 @@ int wfa_collect_noisy_aln_str_no_ps_hap(const call_var_opt_t *opt, int n_reads,
11761190
int read_id = read_ids[read_i];
11771191
clu_read_ids[i][j] = read_id;
11781192
// cons vs read
1179-
// wfa_collect_aln_str(opt, cons_seqs[i], cons_lens[i], seqs[read_i], lens[read_i], fully_covers[read_i], heuristic, affine_2p, LONGCALLD_CONS_READ_ALN_STR(clu_aln_str, n_full_reads));
1193+
// wfa_collect_aln_str(opt, cons_seqs[i], cons_lens[i], seqs[read_i], lens[read_i], fully_covers[read_i], LONGCALLD_WFA_NO_HEURISTIC, LONGCALLD_WFA_AFFINE_2P, LONGCALLD_CONS_READ_ALN_STR(clu_aln_str, n_full_reads));
11801194
make_cons_read_aln_str(opt, msa_seqs[i][clu_n_seqs[i]], msa_seqs[i][j], msa_seq_lens[i], fully_covers[i], LONGCALLD_CONS_READ_ALN_STR(clu_aln_str, n_full_reads));
11811195
if (collect_ref_read_aln_str)
11821196
make_ref_read_aln_str(opt, LONGCALLD_REF_CONS_ALN_STR(clu_aln_str), LONGCALLD_CONS_READ_ALN_STR(clu_aln_str, n_full_reads), LONGCALLD_REF_READ_ALN_STR(clu_aln_str, n_full_reads));
@@ -1335,7 +1349,7 @@ int wfa_collect_noisy_aln_str_with_ps_hap(const call_var_opt_t *opt, int samplin
13351349
if (use_non_full == 0 && LONGCALLD_NOISY_IS_BOTH_COVER(fully_covers[i]) == 0) continue;
13361350
// cons vs read XXX make?
13371351
// heuristic = 1; affine_2p = 0; //
1338-
// wfa_collect_aln_str(opt, cons_seqs[hap-1], cons_lens[hap-1], seqs[i], lens[i], fully_covers[i], heuristic, affine_2p, LONGCALLD_CONS_READ_ALN_STR(clu_aln_str, n_ps_hap_reads));
1352+
// wfa_collect_aln_str(opt, cons_seqs[hap-1], cons_lens[hap-1], seqs[i], lens[i], fully_covers[i], LONGCALLD_WFA_NO_HEURISTIC, LONGCALLD_WFA_AFFINE_2P, LONGCALLD_CONS_READ_ALN_STR(clu_aln_str, n_ps_hap_reads));
13391353
make_cons_read_aln_str(opt, msa_seqs[hap-1][clu_n_seqs[hap-1]], msa_seqs[hap-1][n_ps_hap_reads], msa_seq_lens[hap-1], fully_covers[i], LONGCALLD_CONS_READ_ALN_STR(clu_aln_str, n_ps_hap_reads));
13401354
// ref vs read
13411355
if (collect_ref_read_aln_str)

0 commit comments

Comments
 (0)