Skip to content

Commit e7f638b

Browse files
committed
Release 1.16
2 parents c28190b + bad8b54 commit e7f638b

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

86 files changed

+8269
-7358
lines changed

.cirrus.yml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ gcc_task:
7676
ubuntu_task:
7777
name: ubuntu-clang
7878
container:
79-
image: ubuntu:devel
79+
image: ubuntu:latest
8080
cpu: 2
8181
memory: 1G
8282

@@ -103,7 +103,7 @@ ubuntu_task:
103103
apt-get install -y --no-install-suggests --no-install-recommends \
104104
ca-certificates clang git autoconf automake \
105105
make zlib1g-dev libbz2-dev liblzma-dev libcurl4-gnutls-dev \
106-
libssl-dev libdeflate-dev libncurses5-dev
106+
libssl-dev libdeflate-dev
107107
108108
<< : *COMPILE
109109
<< : *TEST
@@ -113,7 +113,7 @@ ubuntu_task:
113113
rockylinux_task:
114114
name: rockylinux-gcc
115115
container:
116-
image: rockylinux:latest
116+
image: rockylinux:9
117117
cpu: 2
118118
memory: 1G
119119

@@ -126,8 +126,8 @@ rockylinux_task:
126126
# NB: we could consider building a docker image with these
127127
# preinstalled and specifying that instead, to speed up testing.
128128
install_script: |
129-
yum install -y autoconf automake make gcc perl-Data-Dumper zlib-devel \
130-
bzip2 bzip2-devel xz-devel curl-devel openssl-devel ncurses-devel \
129+
yum install -y autoconf automake make gcc perl-Data-Dumper perl-FindBin \
130+
zlib-devel bzip2 bzip2-devel xz-devel curl-devel openssl-devel \
131131
git diffutils
132132
133133
<< : *COMPILE

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ endif
104104

105105
include config.mk
106106

107-
PACKAGE_VERSION = 1.15.1
107+
PACKAGE_VERSION = 1.16
108108

109109
# If building from a Git repository, replace $(PACKAGE_VERSION) with the Git
110110
# description of the working tree: either a release tag with the same value

NEWS

Lines changed: 47 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,50 @@
1+
## Release 1.16 (18th August 2022)
2+
3+
4+
5+
* New plugin `bcftools +variant-distance` to annotate records with distance to the
6+
nearest variant (#1690)
7+
8+
9+
Changes affecting the whole of bcftools, or multiple commands:
10+
11+
* The -i/-e filtering expressions
12+
13+
- Added support for querying of multiple filters, for example `-i 'FILTER="A;B"'`
14+
can be used to select sites with two filters "A" and "B" set. See the documentation
15+
for more examples.
16+
17+
- Added modulo arithmetic operator
18+
19+
Changes affecting specific commands:
20+
21+
* bcftools annotate
22+
23+
- A bug introduced in 1.14 caused that records with INFO/END annotation would
24+
incorrectly trigger `-c ~INFO/END` mode of comparison even when not explicitly
25+
requested, which would result in not transferring the annotation from a tab-delimited
26+
file (#1733)
27+
28+
* bcftools merge
29+
30+
- New `-m snp-ins-del` switch to merge SNVs, insertions and deletions separately (#1704)
31+
32+
* bcftools mpileup
33+
34+
- New NMBZ annotation for Mann-Whitney U-z test on number of mismatches within
35+
supporting reads
36+
37+
- Suppress the output of MQSBZ and FS annotations in absence of alternate allele
38+
39+
* bcftools +scatter
40+
41+
- Fix erroneous addition of duplicate PG lines
42+
43+
* bcftools +setGT
44+
45+
- Custom genotypes (e.g. `-n c:1/1`) now correctly override ploidy
46+
47+
148
## Release 1.15.1 (7th April 2022)
249

350

@@ -44,7 +91,6 @@
4491

4592
## Release 1.15 (21st February 2022)
4693

47-
4894
* New `bcftools head` subcommand for conveniently displaying the headers
4995
of a VCF or BCF file. Without any options, this is equivalent to
5096
`bcftools view --header-only --no-version` but more succinct and memorable.

bam2bcf.c

Lines changed: 120 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
/* bam2bcf.c -- variant calling.
22
33
Copyright (C) 2010-2012 Broad Institute.
4-
Copyright (C) 2012-2021 Genome Research Ltd.
4+
Copyright (C) 2012-2022 Genome Research Ltd.
55
66
Author: Heng Li <[email protected]>
77
@@ -89,6 +89,39 @@ void bcf_call_destroy(bcf_callaux_t *bca)
8989
free(bca->bases); free(bca->inscns); free(bca);
9090
}
9191

92+
static int get_aux_nm(bam1_t *rec, int32_t qpos, int is_ref)
93+
{
94+
uint8_t *nm_tag = bam_aux_get(rec, "NM");
95+
if ( !nm_tag ) return -1;
96+
int64_t nm = bam_aux2i(nm_tag);
97+
98+
// Count indels as single events, not as the number of inserted/deleted
99+
// bases (which is what NM does). Add soft clips as mismatches.
100+
int i;
101+
for (i=0; i < rec->core.n_cigar; i++)
102+
{
103+
int val = bam_get_cigar(rec)[i] & BAM_CIGAR_MASK;
104+
if ( val==BAM_CSOFT_CLIP )
105+
{
106+
nm += bam_get_cigar(rec)[i] >> BAM_CIGAR_SHIFT;
107+
}
108+
else if ( val==BAM_CINS || val==BAM_CDEL )
109+
{
110+
val = bam_get_cigar(rec)[i] >> BAM_CIGAR_SHIFT;
111+
if ( val > 1 ) nm -= val - 1;
112+
}
113+
}
114+
115+
// Take into account MNPs, 2% of de novo SNVs appear within 20bp of another de novo SNV
116+
// http://www.genome.org/cgi/doi/10.1101/gr.239756.118
117+
nm -= is_ref ? 1 : 2;
118+
119+
if ( nm < 0 ) nm = 0;
120+
if ( nm >= B2B_N_NM ) nm = B2B_N_NM - 1;
121+
122+
return nm;
123+
}
124+
92125
// position in the sequence with respect to the aligned part of the read
93126
static int get_position(const bam_pileup1_t *p, int *len,
94127
int *sc_len, int *sc_dist) {
@@ -158,6 +191,17 @@ void bcf_callaux_clean(bcf_callaux_t *bca, bcf_call_t *call)
158191
if ( call->ADF ) memset(call->ADF,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
159192
if ( call->ADR ) memset(call->ADR,0,sizeof(int32_t)*(call->n+1)*B2B_MAX_ALLELES);
160193
if ( call->SCR ) memset(call->SCR,0,sizeof(*call->SCR)*(call->n+1));
194+
if ( call->SCR ) memset(call->SCR,0,sizeof(*call->SCR)*(call->n+1));
195+
if ( bca->fmt_flag&B2B_FMT_NMBZ )
196+
{
197+
memset(call->ref_nm,0,sizeof(*call->ref_nm)*(call->n+1)*B2B_N_NM);
198+
memset(call->alt_nm,0,sizeof(*call->alt_nm)*(call->n+1)*B2B_N_NM);
199+
}
200+
else
201+
{
202+
memset(call->ref_nm,0,sizeof(*call->ref_nm)*B2B_N_NM);
203+
memset(call->alt_nm,0,sizeof(*call->alt_nm)*B2B_N_NM);
204+
}
161205
memset(call->QS,0,sizeof(*call->QS)*call->n*B2B_MAX_ALLELES);
162206
memset(bca->ref_scl, 0, 100*sizeof(int));
163207
memset(bca->alt_scl, 0, 100*sizeof(int));
@@ -309,28 +353,38 @@ int bcf_call_glfgen(int _n, const bam_pileup1_t *pl, int ref_base, bcf_callaux_t
309353
if (sc_len > 99) sc_len = 99;
310354
}
311355
}
312-
313356
int imq = mapQ * nqual_over_60;
314357
int ibq = baseQ * nqual_over_60;
358+
int inm = get_aux_nm(p->b,p->qpos,is_diff?0:1);
315359

316360
if ( bam_is_rev(p->b) )
317361
bca->rev_mqs[imq]++;
318362
else
319363
bca->fwd_mqs[imq]++;
320364

321-
if ( bam_seqi(bam_get_seq(p->b),p->qpos) == ref_base )
365+
if ( !is_diff )
322366
{
323367
bca->ref_pos[epos]++;
324368
bca->ref_bq[ibq]++;
325369
bca->ref_mq[imq]++;
326370
bca->ref_scl[sc_len]++;
371+
if ( inm>=0 )
372+
{
373+
bca->ref_nm[inm]++;
374+
if ( r->ref_nm ) r->ref_nm[inm]++;
375+
}
327376
}
328377
else
329378
{
330379
bca->alt_pos[epos]++;
331380
bca->alt_bq[ibq]++;
332381
bca->alt_mq[imq]++;
333382
bca->alt_scl[sc_len]++;
383+
if ( inm>=0 )
384+
{
385+
bca->alt_nm[inm]++;
386+
if ( r->alt_nm ) r->alt_nm[inm]++;
387+
}
334388
}
335389
}
336390

@@ -798,6 +852,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
798852
call->n_alleles = j;
799853
if (call->n_alleles == 1) return -1; // no reliable supporting read. stop doing anything
800854
}
855+
int has_alt = (call->n_alleles==2 && call->unseen!=-1) ? 0 : 1;
801856
/*
802857
* Set the phread likelihood array (call->PL) This array is 15 entries long
803858
* for each sample because that is size of an upper or lower triangle of a
@@ -914,6 +969,9 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
914969
for (j = 0; j < 16; ++j) call->anno[j] += calls[i].anno[j];
915970
}
916971

972+
// No need to calculate MWU tests when there is no ALT allele, this should speed up things slightly
973+
if ( !has_alt ) return 0;
974+
917975
calc_SegBias(calls, call);
918976

919977
// calc_chisq_bias("XPOS", call->bcf_hdr->id[BCF_DT_CTG][call->tid].key, call->pos, bca->ref_pos, bca->alt_pos, bca->npos);
@@ -922,7 +980,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
922980

923981
if (bca->fmt_flag & B2B_INFO_ZSCORE) {
924982
// U z-normalised as +/- number of standard deviations from mean.
925-
if (call->ori_ref < 0) {
983+
if (call->ori_ref < 0) { // indel
926984
if (bca->fmt_flag & B2B_INFO_RPB)
927985
call->mwu_pos = calc_mwu_biasZ(bca->iref_pos, bca->ialt_pos,
928986
bca->npos, 0, 1);
@@ -945,6 +1003,15 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
9451003
call->mwu_sc = calc_mwu_biasZ(bca->ref_scl, bca->alt_scl,
9461004
100, 0,1);
9471005
}
1006+
call->mwu_nm[0] = calc_mwu_biasZ(bca->ref_nm, bca->alt_nm, B2B_N_NM,0,1);
1007+
if ( bca->fmt_flag & B2B_FMT_NMBZ )
1008+
{
1009+
for (i=0; i<n; i++)
1010+
{
1011+
float val = calc_mwu_biasZ(calls[i].ref_nm, calls[i].alt_nm, B2B_N_NM,0,1);
1012+
call->mwu_nm[i+1] = val!=HUGE_VAL ? val : 0;
1013+
}
1014+
}
9481015
} else {
9491016
// Old method; U as probability between 0 and 1
9501017
if ( bca->fmt_flag & B2B_INFO_RPB )
@@ -976,7 +1043,7 @@ int bcf_call_combine(int n, const bcf_callret1_t *calls, bcf_callaux_t *bca, int
9761043
int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag, const bcf_callaux_t *bca, const char *ref)
9771044
{
9781045
extern double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
979-
int i, j, nals = 1;
1046+
int i, j, nals = 1, has_alt = 0;
9801047

9811048
bcf_hdr_t *hdr = bc->bcf_hdr;
9821049
rec->rid = bc->tid;
@@ -1006,6 +1073,7 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag,
10061073
for (j = 0; j < bca->indelreg; ++j) kputc(ref[bc->pos+1+j], &bc->tmp);
10071074
}
10081075
nals++;
1076+
has_alt = 1;
10091077
}
10101078
}
10111079
else // SNP
@@ -1016,7 +1084,11 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag,
10161084
if (bc->a[i] < 0) break;
10171085
kputc(',', &bc->tmp);
10181086
if ( bc->unseen==i ) kputs("<*>", &bc->tmp);
1019-
else kputc("ACGT"[bc->a[i]], &bc->tmp);
1087+
else
1088+
{
1089+
kputc("ACGT"[bc->a[i]], &bc->tmp);
1090+
has_alt = 1;
1091+
}
10201092
nals++;
10211093
}
10221094
}
@@ -1052,40 +1124,46 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag,
10521124
bcf_update_info_float(hdr, rec, "I16", tmpf, 16);
10531125
bcf_update_info_float(hdr, rec, "QS", bc->qsum, nals);
10541126

1055-
if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1);
1056-
if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1);
1057-
1058-
if (bca->fmt_flag & B2B_INFO_ZSCORE) {
1059-
if ( bc->mwu_pos != HUGE_VAL )
1060-
bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1);
1061-
if ( bc->mwu_mq != HUGE_VAL )
1062-
bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1);
1063-
if ( bc->mwu_mqs != HUGE_VAL )
1064-
bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1);
1065-
if ( bc->mwu_bq != HUGE_VAL )
1066-
bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1);
1067-
if ( bc->mwu_sc != HUGE_VAL )
1068-
bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1);
1069-
} else {
1070-
if ( bc->mwu_pos != HUGE_VAL )
1071-
bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
1072-
if ( bc->mwu_mq != HUGE_VAL )
1073-
bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
1074-
if ( bc->mwu_mqs != HUGE_VAL )
1075-
bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
1076-
if ( bc->mwu_bq != HUGE_VAL )
1077-
bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
1078-
}
1127+
if ( has_alt )
1128+
{
1129+
if ( bc->vdb != HUGE_VAL ) bcf_update_info_float(hdr, rec, "VDB", &bc->vdb, 1);
1130+
if ( bc->seg_bias != HUGE_VAL ) bcf_update_info_float(hdr, rec, "SGB", &bc->seg_bias, 1);
1131+
1132+
if (bca->fmt_flag & B2B_INFO_ZSCORE) {
1133+
if ( bc->mwu_pos != HUGE_VAL )
1134+
bcf_update_info_float(hdr, rec, "RPBZ", &bc->mwu_pos, 1);
1135+
if ( bc->mwu_mq != HUGE_VAL )
1136+
bcf_update_info_float(hdr, rec, "MQBZ", &bc->mwu_mq, 1);
1137+
if ( bc->mwu_mqs != HUGE_VAL )
1138+
bcf_update_info_float(hdr, rec, "MQSBZ", &bc->mwu_mqs, 1);
1139+
if ( bc->mwu_bq != HUGE_VAL )
1140+
bcf_update_info_float(hdr, rec, "BQBZ", &bc->mwu_bq, 1);
1141+
if ( bc->mwu_nm[0] != HUGE_VAL )
1142+
bcf_update_info_float(hdr, rec, "NMBZ", bc->mwu_nm, 1);
1143+
if ( bc->mwu_sc != HUGE_VAL )
1144+
bcf_update_info_float(hdr, rec, "SCBZ", &bc->mwu_sc, 1);
1145+
} else {
1146+
if ( bc->mwu_pos != HUGE_VAL )
1147+
bcf_update_info_float(hdr, rec, "RPB", &bc->mwu_pos, 1);
1148+
if ( bc->mwu_mq != HUGE_VAL )
1149+
bcf_update_info_float(hdr, rec, "MQB", &bc->mwu_mq, 1);
1150+
if ( bc->mwu_mqs != HUGE_VAL )
1151+
bcf_update_info_float(hdr, rec, "MQSB", &bc->mwu_mqs, 1);
1152+
if ( bc->mwu_bq != HUGE_VAL )
1153+
bcf_update_info_float(hdr, rec, "BQB", &bc->mwu_bq, 1);
1154+
}
10791155

1080-
if ( bc->strand_bias != HUGE_VAL )
1081-
bcf_update_info_float(hdr, rec, "FS", &bc->strand_bias, 1);
1156+
if ( bc->strand_bias != HUGE_VAL )
1157+
bcf_update_info_float(hdr, rec, "FS", &bc->strand_bias, 1);
10821158

10831159
#if CDF_MWU_TESTS
1084-
if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1);
1085-
if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1);
1086-
if ( bc->mwu_mqs_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB2", &bc->mwu_mqs_cdf, 1);
1087-
if ( bc->mwu_bq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB2", &bc->mwu_bq_cdf, 1);
1160+
if ( bc->mwu_pos_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "RPB2", &bc->mwu_pos_cdf, 1);
1161+
if ( bc->mwu_mq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQB2", &bc->mwu_mq_cdf, 1);
1162+
if ( bc->mwu_mqs_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "MQSB2", &bc->mwu_mqs_cdf, 1);
1163+
if ( bc->mwu_bq_cdf != HUGE_VAL ) bcf_update_info_float(hdr, rec, "BQB2", &bc->mwu_bq_cdf, 1);
10881164
#endif
1165+
}
1166+
10891167
tmpf[0] = bc->ori_depth ? (float)bc->mq0/bc->ori_depth : 0;
10901168
bcf_update_info_float(hdr, rec, "MQ0F", tmpf, 1);
10911169

@@ -1144,5 +1222,11 @@ int bcf_call2bcf(bcf_call_t *bc, bcf1_t *rec, bcf_callret1_t *bcr, int fmt_flag,
11441222
if ( fmt_flag&B2B_FMT_QS )
11451223
bcf_update_format_int32(hdr, rec, "QS", bc->QS, rec->n_sample*rec->n_allele);
11461224

1225+
if ( has_alt )
1226+
{
1227+
if ( fmt_flag&B2B_FMT_NMBZ )
1228+
bcf_update_format_float(hdr, rec, "NMBZ", bc->mwu_nm+1, rec->n_sample);
1229+
}
1230+
11471231
return 0;
11481232
}

0 commit comments

Comments
 (0)