Skip to content

Commit a3cc3b5

Browse files
committed
add SVREADS in INFO
1 parent 9a33903 commit a3cc3b5

File tree

11 files changed

+99
-37
lines changed

11 files changed

+99
-37
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ dkms.conf
6262
# vscode
6363
.lintr
6464
.vscode
65+
.claude
6566

6667
# gprof
6768
gmon.out

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ endif
1414
EXTRA_FLAGS = -Wall -Wno-misleading-indentation -Wno-unused-function #-Wno-unused-variable -Wno-alloc-size-larger-than
1515

1616
# Define the version number
17-
VERSION=0.0.8
17+
VERSION=0.0.9
1818
# Get the Git commit hash
1919
GIT_COMMIT := $(shell git rev-parse --short HEAD 2> /dev/null)
2020
ifneq ($(GIT_COMMIT),)

README.md

Lines changed: 15 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -10,25 +10,22 @@
1010
<!-- [![Published in Bioinformatics](https://img.shields.io/badge/Published%20in-Bioinformatics-blue.svg)](https://dx.doi.org/10.1093/bioinformatics/btaa963) -->
1111
<!-- [![GitHub Issues](https://img.shields.io/github/issues/yangao07/longcallD.svg?label=Issues)](https://github.com/yangao07/longcallD/issues) -->
1212

13-
## Updates (release v0.0.8)
13+
## Updates (release v0.0.9)
1414

15-
* Fix variant representation issues for overlapping variants
16-
* Revoke ultraLowMem mode of WFA to avoid potential variant representation issues
17-
* Fix output BAM errors
18-
* Output BAM includes all reads (even those not used for variant calling)
19-
* Fix a bug with --refine-aln
15+
* Fix variant calling close to start/end boundaries of chromosomes
16+
* Add `--out-sv-rnames` and `--out-som-sv-rnames` to output SV-supporing read names (tag: `SVREADS`) in INFO field of VCF
2017
<!-- * Add --STR -->
2118

2219

2320
## Getting Started
2421
```sh
2522
# Download pre-built executables and test data (recommended)
2623
# Linux-x64
27-
wget https://github.com/yangao07/longcallD/releases/download/v0.0.8/longcallD-v0.0.8_x64-linux.tar.gz
28-
tar -zxvf longcallD-v0.0.8_x64-linux.tar.gz && cd longcallD-v0.0.8_x64-linux
24+
wget https://github.com/yangao07/longcallD/releases/download/v0.0.9/longcallD-v0.0.9_x64-linux.tar.gz
25+
tar -zxvf longcallD-v0.0.9_x64-linux.tar.gz && cd longcallD-v0.0.9_x64-linux
2926
# MacOS-arm64
30-
wget https://github.com/yangao07/longcallD/releases/download/v0.0.8/longcallD-v0.0.8_arm64-macos.tar.gz
31-
tar -zxvf longcallD-v0.0.8_arm64-macos.tar.gz && cd longcallD-v0.0.8_arm64-macos
27+
wget https://github.com/yangao07/longcallD/releases/download/v0.0.9/longcallD-v0.0.9_arm64-macos.tar.gz
28+
tar -zxvf longcallD-v0.0.9_arm64-macos.tar.gz && cd longcallD-v0.0.9_arm64-macos
3229

3330
# PacBio HiFi reads
3431
./longcallD call ./test_data/chr11_2M.fa ./test_data/HG002_chr11_hifi_test.bam --hifi > HG002_hifi_test.vcf
@@ -40,7 +37,7 @@ man ./longcallD.1
4037
``` -->
4138

4239
## Table of Contents
43-
- [Updates (release v0.0.8)](#updates-release-v008)
40+
- [Updates (release v0.0.9)](#updates-release-v009)
4441
- [Getting Started](#getting-started)
4542
- [Table of Contents](#table-of-contents)
4643
- [Introduction](#introduction)
@@ -87,13 +84,13 @@ Providing the annotation sequence of common mobile elements, i.e., Alu/L1/SVA, u
8784
### Pre-built executables (recommended)
8885
**Linux-x64**
8986
```
90-
wget https://github.com/yangao07/longcallD/releases/download/v0.0.8/longcallD-v0.0.8_x64-linux.tar.gz
91-
tar -zxvf longcallD-v0.0.8_x64-linux.tar.gz
87+
wget https://github.com/yangao07/longcallD/releases/download/v0.0.9/longcallD-v0.0.9_x64-linux.tar.gz
88+
tar -zxvf longcallD-v0.0.9_x64-linux.tar.gz
9289
```
9390
**MacOS-arm64**
9491
```
95-
wget https://github.com/yangao07/longcallD/releases/download/v0.0.8/longcallD-v0.0.8_arm64-macos.tar.gz
96-
tar -zxvf longcallD-v0.0.8_arm64-macos.tar.gz
92+
wget https://github.com/yangao07/longcallD/releases/download/v0.0.9/longcallD-v0.0.9_arm64-macos.tar.gz
93+
tar -zxvf longcallD-v0.0.9_arm64-macos.tar.gz
9794
```
9895

9996
**Linux-arm64/macOS-x64**
@@ -109,9 +106,9 @@ conda install -c bioconda longcalld
109106
### Build from source
110107
To compile longcallD from source, ensure you have **GCC/clang(9.0+)** and **zlib/libbz2/liblzma/libcurl** (for htslib) installed.
111108
```
112-
wget https://github.com/yangao07/longcallD/releases/download/v0.0.8/longcallD-v0.0.8.tar.gz
113-
tar -zxvf longcallD-v0.0.8.tar.gz
114-
cd longcallD-v0.0.8; make
109+
wget https://github.com/yangao07/longcallD/releases/download/v0.0.9/longcallD-v0.0.9.tar.gz
110+
tar -zxvf longcallD-v0.0.9.tar.gz
111+
cd longcallD-v0.0.9; make
115112
```
116113

117114
## Usage

src/bam_utils.c

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,7 @@ void update_read_var_profile_with_allele(int var_i, int allele_i, int alt_qi, re
230230

231231

232232
int get_digar_ave_qual(digar1_t *digar, const uint8_t *qual) {
233+
if (digar->is_low_qual) return 0;
233234
if (digar->qi < 0) return 0;
234235
int q_start, q_end;
235236
if (digar->type == BAM_CDEL) {
@@ -1327,9 +1328,13 @@ int collect_digar_from_ref_seq(bam_chunk_t *chunk, int read_i, const struct call
13271328
return skip == 1 ? -1 : 0;
13281329
}
13291330

1330-
int bam_chunk_init0(bam_chunk_t *chunk, int n_reads, int n_bam) {
1331+
int bam_chunk_init0(bam_chunk_t *chunk, const struct call_var_opt_t *opt, int n_reads, int n_bam) {
13311332
// input
13321333
chunk->n_reads = 0; chunk->m_reads = n_reads; chunk->ordered_read_ids = (int*)malloc(n_reads * sizeof(int));
1334+
if (opt->output_sv_rnames || opt->output_somatic_sv_rnames) {
1335+
chunk->read_names = (char**)malloc(n_reads * sizeof(char*));
1336+
for (int i = 0; i < n_reads; i++) chunk->read_names[i] = NULL;
1337+
}
13331338
chunk->n_bam = n_bam;
13341339
chunk->n_up_ovlp_reads = (int*)calloc(n_bam, sizeof(int)); chunk->n_up_ovlp_skip_reads = (int*)calloc(n_bam, sizeof(int));
13351340
chunk->up_ovlp_read_i = (int**)malloc(n_bam * sizeof(int*));
@@ -1368,9 +1373,10 @@ int bam_chunk_init0(bam_chunk_t *chunk, int n_reads, int n_bam) {
13681373
return 0;
13691374
}
13701375

1371-
int bam_chunk_realloc(bam_chunk_t *chunk) {
1376+
int bam_chunk_realloc(bam_chunk_t *chunk, const struct call_var_opt_t *opt) {
13721377
int m_reads = chunk->m_reads * 2;
13731378
chunk->reads = (bam1_t**)realloc(chunk->reads, m_reads * sizeof(bam1_t*));
1379+
if (opt->output_sv_rnames || opt->output_somatic_sv_rnames) chunk->read_names = (char**)realloc(chunk->read_names, m_reads * sizeof(char*));
13741380
chunk->ordered_read_ids = (int*)realloc(chunk->ordered_read_ids, m_reads * sizeof(int));
13751381
for (int i = 0; i < chunk->n_bam; ++i) {
13761382
chunk->up_ovlp_read_i[i] = (int*)realloc(chunk->up_ovlp_read_i[i], m_reads * sizeof(int));
@@ -1480,6 +1486,12 @@ void bam_chunk_post_free(bam_chunk_t *chunk, const struct call_var_opt_t *opt) {
14801486
free(chunk->reads);
14811487
}
14821488
free(chunk->ordered_read_ids);
1489+
// save read_var_profile for output_sv_rnames or output_somatic_sv_rnames, which will be used in make_variants
1490+
if (chunk->read_var_profile != NULL) free_read_var_profile(chunk->read_var_profile, chunk->n_reads);
1491+
if (opt->output_sv_rnames || opt->output_somatic_sv_rnames) {
1492+
for (int i = 0; i < chunk->n_reads; i++) free(chunk->read_names[i]);
1493+
free(chunk->read_names);
1494+
}
14831495
}
14841496

14851497
// free variables that will not be used in stitch & make variants
@@ -1493,7 +1505,6 @@ void bam_chunk_mid_free(bam_chunk_t *chunk, const struct call_var_opt_t *opt) {
14931505
} free(chunk->noisy_reg_to_reads);
14941506
}
14951507
if (chunk->noisy_reg_to_n_reads != NULL) free(chunk->noisy_reg_to_n_reads);
1496-
if (chunk->read_var_profile != NULL) free_read_var_profile(chunk->read_var_profile, chunk->n_reads);
14971508
if (chunk->read_var_cr != NULL) cr_destroy(chunk->read_var_cr);
14981509
}
14991510

@@ -1622,7 +1633,7 @@ int collect_ref_seq_bam_main(const struct call_var_pl_t *pl, struct call_var_io_
16221633
call_var_opt_t *opt = pl->opt;
16231634

16241635
int min_mapq = opt->min_mq;
1625-
bam_chunk_init0(chunk, 4096, io_aux->n_bam);
1636+
bam_chunk_init0(chunk, opt, 4096, io_aux->n_bam);
16261637
chunk->reg_chunk_i = reg_chunk_i; chunk->reg_i = reg_i;
16271638
chunk->tid = tid; chunk->tname = io_aux->headers[0]->target_name[tid]; chunk->reg_beg = reg_beg; chunk->reg_end = reg_end;
16281639
hts_pos_t min_read_beg = reg_beg, max_read_end = reg_end;
@@ -1655,8 +1666,11 @@ int collect_ref_seq_bam_main(const struct call_var_pl_t *pl, struct call_var_io_
16551666
if (chunk->reads[chunk->n_reads]->core.pos+1 < min_read_beg) min_read_beg = chunk->reads[chunk->n_reads]->core.pos+1;
16561667
if (bam_endpos(chunk->reads[chunk->n_reads]) > max_read_end) max_read_end = bam_endpos(chunk->reads[chunk->n_reads]);
16571668
// check if read is overlapping with next region
1669+
if (opt->output_sv_rnames || opt->output_somatic_sv_rnames) {
1670+
chunk->read_names[chunk->n_reads] = strdup(bam_get_qname(chunk->reads[chunk->n_reads]));
1671+
}
16581672
chunk->n_reads++;
1659-
if (chunk->n_reads == chunk->m_reads) bam_chunk_realloc(chunk);
1673+
if (chunk->n_reads == chunk->m_reads) bam_chunk_realloc(chunk, opt);
16601674
}
16611675
bam_itr_destroy(iter);
16621676
}

src/bam_utils.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ typedef struct bam_chunk_t {
6262
// should always be 1 region XXX
6363
cgranges_t *low_comp_cr; // tandem_rep_cr;
6464
int n_reads, m_reads; int *ordered_read_ids; // size: m_reads, for multiple input bams, merge sort reads by pos, end, NM, name
65+
char **read_names; // size: m_reads, for output
6566
int n_bam; // for each input bam, record the number of region-overlapping reads
6667
int *n_up_ovlp_reads, *n_down_ovlp_reads, *n_up_ovlp_skip_reads, *n_down_ovlp_skip_reads; // number of reads overlapping with up/downstream bam chunk
6768
int **up_ovlp_read_i, **down_ovlp_read_i;

src/call_var_main.c

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ const struct option call_var_opt [] = {
3737
{ "autosome", 0, NULL, 0},
3838
{ "mosaic", 0, NULL, 0},
3939
{ "refine-aln", 0, NULL, 0},
40+
{ "out-sv-rnames", 0, NULL, 0},
41+
{ "out-som-sv-rnames", 0, NULL, 0},
4042
// { "alpha", 1, NULL, 0},
4143
// { "beta", 1, NULL, 0},
4244
{ "max-somvar", 1, NULL, 0},
@@ -206,7 +208,7 @@ call_var_opt_t *call_var_init_para(void) {
206208
opt->min_sv_len = LONGCALLD_MIN_SV_LEN;
207209
opt->min_tsd_len = LONGCALLD_MIN_TSD_LEN; opt->max_tsd_len = LONGCALLD_MAX_TSD_LEN;
208210
opt->min_polya_len = LONGCALLD_MIN_POLYA_LEN; opt->min_polya_ratio = LONGCALLD_MIN_POLYA_RATIO;
209-
opt->output_sv_reads = 0;
211+
opt->output_sv_rnames = 0; opt->output_somatic_sv_rnames = 0;
210212

211213
initialize_lgamma_cache(opt);
212214
opt->te_seq_fn = NULL; opt->n_te_seqs = 0; opt->te_kmer_len = 15; opt->te_for_h = NULL; opt->te_rev_h = NULL;
@@ -297,6 +299,7 @@ void var_free(var_t *v) {
297299
if (v->vars[i].tsd_len > 0) {
298300
free(v->vars[i].tsd_seq);
299301
}
302+
if (v->vars[i].is_sv && v->vars[i].AD[1] > 0) free(v->vars[i].alt_read_i);
300303
}
301304
free(v->vars);
302305
}
@@ -793,7 +796,7 @@ static void *call_var_worker_pipeline(void *shared, int step, void *in) { // kt_
793796
int n_out_vars = 0, n_out_reads = 0;
794797
for (int i = 0; i < s->n_chunks; ++i) {
795798
bam_chunk_t *c = s->chunks + i;
796-
n_out_vars += write_var_to_vcf(s->vars+i, pl->opt, c->tname);
799+
n_out_vars += write_var_to_vcf(s->vars+i, pl->opt, c);
797800
if (pl->opt->out_aln_fp != NULL) n_out_reads += write_read_to_bam(c, pl->opt, pl->io_aux);
798801
var_free(s->vars + i); // free output
799802
}
@@ -858,6 +861,8 @@ static void call_var_usage(void) {//main usage
858861
fprintf(stderr, " note: multiple input BAM/CRAM files will be merged in SAM/BAM/CRAM output\n");
859862
fprintf(stderr, " --refine-aln refine alignment in SAM/BAM/CRAM output\n");
860863
fprintf(stderr, " note: output SAM/BAM/CRAM may be unsorted when --refine-aln is set\n");
864+
fprintf(stderr, " --out-sv-rnames output names of all reads supporting alternative allele in INFO field of VCF output for all SVs [False]\n");
865+
fprintf(stderr, " --out-som-sv-rnames output names of all reads supporting alternative allele in INFO field of VCF output for mosaic/somatic SVs [False]\n");
861866
// fprintf(stderr, "\n");
862867
fprintf(stderr, " Variant calling:\n");
863868
fprintf(stderr, " -c --min-cov INT min. total read coverage for candidate variant [%d]\n", LONGCALLD_MIN_CAND_DP);
@@ -941,6 +946,8 @@ int call_var_main(int argc, char *argv[]) {
941946
opt->somatic_win_max_vars = strtol(optarg, &s, 10);
942947
if (*s == ',') opt->somatic_win = strtol(s+1, &s, 10);
943948
} else if (strcmp(call_var_opt[op_idx].name, "refine-aln") == 0) opt->refine_bam = 1;
949+
else if (strcmp(call_var_opt[op_idx].name, "out-sv-rnames") == 0) opt->output_sv_rnames = 1;
950+
else if (strcmp(call_var_opt[op_idx].name, "out-som-sv-rnames") == 0) opt->output_somatic_sv_rnames = 1;
944951
break;
945952
case 's': opt->out_somatic = 1; break;
946953
case 'm': opt->out_methylation = 1; break;

src/call_var_main.h

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,14 +109,15 @@ typedef struct {
109109
uint8_t type; // BAM_CINS/BAM_CDEL/BAM_CDIFF
110110
hts_pos_t pos, PS; // phase set
111111
uint8_t *ref_bases; int ref_len;
112-
uint8_t is_somatic;
112+
uint8_t is_somatic, is_sv;
113113
uint8_t *tsd_seq; int tsd_len, polya_len; hts_pos_t tsd_pos1, tsd_pos2; // target site duplication, 2 TSDs for DEL
114114
int te_seq_i, te_is_rev; // char *rep_name, *rep_family, *rep_class; use te_seq_i to retrieve TE sequence info
115115

116116
int n_alt_allele; // currently, only one alt allele
117117
uint8_t **alt_bases; int *alt_len;
118118
int DP, AD[2]; uint8_t GT[2]; // DP/AD/GT
119119
int QUAL, GQ, PL[6]; // phred-scaled, QUAL/FILTER/GQ/PL
120+
int *alt_read_i;
120121
} var1_t;
121122

122123
typedef struct var_t {
@@ -163,7 +164,7 @@ typedef struct call_var_opt_t {
163164
// Alu/L1/SVA sequences
164165
char *te_seq_fn; char **te_seq_names; int n_te_seqs;
165166
int te_kmer_len; kmer32_hash_t **te_for_h, **te_rev_h;
166-
int output_sv_reads; // output supporting read IDs for each SV
167+
int output_sv_rnames, output_somatic_sv_rnames; // output supporting read IDs for each SV
167168
// general
168169
// int max_ploidy;
169170
int pl_threads, n_threads;

src/collect_var.c

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1350,7 +1350,7 @@ int make_variants(const call_var_opt_t *opt, bam_chunk_t *chunk, var_t **_var) {
13501350
if (n_cand_vars <= 0) return 0;
13511351
char *ref_seq = chunk->ref_seq; hts_pos_t ref_beg = chunk->ref_beg;
13521352
hts_pos_t active_reg_beg = chunk->reg_beg, active_reg_end = chunk->reg_end;
1353-
cand_var_t *cand_vars = chunk->cand_vars;
1353+
cand_var_t *cand_vars = chunk->cand_vars; read_var_profile_t *p = chunk->read_var_profile;
13541354
(*_var) = (var_t*)malloc(sizeof(var_t));
13551355
var_t *var = *_var;
13561356
var->n = 0; var->m = n_cand_vars;
@@ -1411,6 +1411,7 @@ int make_variants(const call_var_opt_t *opt, bam_chunk_t *chunk, var_t **_var) {
14111411
var->vars[i].alt_bases = (uint8_t**)malloc(2 * sizeof(uint8_t*));
14121412
}
14131413
var->vars[i].n_alt_allele = 0;
1414+
var->vars[i].is_sv = 0;
14141415
for (int hap=1; hap <= 2; ++hap) {
14151416
int hap_alle = hap == 1 ? hap1_alle : hap2_alle;
14161417
if (hap_alle != 0) { // alt allele
@@ -1433,12 +1434,36 @@ int make_variants(const call_var_opt_t *opt, bam_chunk_t *chunk, var_t **_var) {
14331434
}
14341435
}
14351436
var->vars[i].alt_len[var->vars[i].n_alt_allele] = alt_len;
1437+
if (abs(alt_len - var->vars[i].ref_len) >= opt->min_sv_len) var->vars[i].is_sv = 1;
14361438
var->vars[i].GT[hap-1] = ++var->vars[i].n_alt_allele;
14371439
if (is_hom) hom_alt_is_set = 1;
14381440
} else var->vars[i].GT[hap-1] = 0;
14391441
}
14401442
var->vars[i].DP = cand_vars[cand_i].total_cov;
14411443
for (int j = 0; j < cand_vars[cand_i].n_uniq_alles; ++j) var->vars[i].AD[j] = cand_vars[cand_i].alle_covs[j];
1444+
if (var->vars[i].is_sv && var->vars[i].AD[1] > 0) { // collect alt_read_i from read_var_profile
1445+
var->vars[i].alt_read_i = (int*)malloc(var->vars[i].AD[1] * sizeof(int));
1446+
int alt_read_i_idx = 0;
1447+
for (int k = 0; k < chunk->n_reads; ++k) {
1448+
int read_i = chunk->ordered_read_ids[k];
1449+
if (chunk->is_skipped[read_i]) continue;
1450+
read_var_profile_t *p1 = p + read_i;
1451+
if (p1->start_var_idx < 0 || p1->end_var_idx < 0) continue;
1452+
if (cand_i < p1->start_var_idx || cand_i > p1->end_var_idx) continue;
1453+
int allele = p1->alleles[cand_i - p1->start_var_idx];
1454+
if (allele == 1) {
1455+
// fprintf(stderr, "Collect alt_read_i for var: %s:%" PRIi64 " read: %s allele: %d\n", chunk->tname, var->vars[i].pos, chunk->read_names[read_i], allele);
1456+
if (alt_read_i_idx >= var->vars[i].AD[1]) {
1457+
_err_error_exit("Error: alt_read_i_idx: %d exceeds AD[1]: %d for var: %s:%" PRIi64 " %d-%c-%d\n", alt_read_i_idx, var->vars[i].AD[1], chunk->tname, var->vars[i].pos, var->vars[i].ref_len, BAM_CIGAR_STR[var->vars[i].type], var->vars[i].alt_len[0]);
1458+
}
1459+
var->vars[i].alt_read_i[alt_read_i_idx++] = read_i;
1460+
}
1461+
}
1462+
if (alt_read_i_idx != var->vars[i].AD[1]) {
1463+
_err_error("Error: alt_read_i_idx: %d does not match AD[1]: %d for var: %s:%" PRIi64 " %d-%c-%d\n", alt_read_i_idx, var->vars[i].AD[1], chunk->tname, var->vars[i].pos, var->vars[i].ref_len, BAM_CIGAR_STR[var->vars[i].type], var->vars[i].alt_len[0]);
1464+
var->vars[i].AD[1] = alt_read_i_idx;
1465+
}
1466+
} else var->vars[i].alt_read_i = NULL;
14421467
var->vars[i].QUAL = cal_var_QUAL1(var->vars[i].AD[0], var->vars[i].AD[1], opt->log_p, opt->log_1p, opt->max_qual);
14431468
var->vars[i].GQ = cal_sample_GQ(var->vars[i].AD[0], var->vars[i].AD[1], opt->log_p, opt->log_1p, opt->log_2, opt->max_gq);
14441469
i++;

src/main.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
const char PROG[20] = "longcallD";
1010
const char DESCRIP[100] = "local-haplotagging-based small and structural variant calling";
1111
#ifndef LONGCALLD_VERSION
12-
const char LONGCALLD_VERSION[20] = "0.0.8";
12+
const char LONGCALLD_VERSION[20] = "0.0.9";
1313
#endif
1414
const char CONTACT[50] = "yangao@ds.dfci.harvard.edu";
1515
int LONGCALLD_VERBOSE = 0;

0 commit comments

Comments
 (0)