Skip to content

Commit 3b24f36

Browse files
committed
New -*,--keep-unseen-allele option. Resolves #2015
1 parent 6374c6a commit 3b24f36

File tree

8 files changed

+52
-14
lines changed

8 files changed

+52
-14
lines changed

NEWS

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,14 @@ Changes affecting specific commands:
2121
- Fix `bcftools annotate --mark-sites`, VCF sites overlapping regions in a BED file
2222
were not annotated (#1989)
2323

24-
* bcftools call and mpileup
24+
* bcftools call
25+
26+
- Output MIN_DP rather than MinDP in gVCF mode
27+
28+
- New `-*, --keep-unseen-allele` option to output the unobserved allele <*>,
29+
intended for gVCF.
30+
31+
* bcftools mpileup
2532

2633
- Output MIN_DP rather than MinDP in gVCF mode
2734

call.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ THE SOFTWARE. */
3333
#define CALL_VARONLY (1<<1)
3434
#define CALL_CONSTR_TRIO (1<<2)
3535
#define CALL_CONSTR_ALLELES (1<<3)
36-
//
36+
#define CALL_KEEP_UNSEEN (1<<4)
3737
#define CALL_FMT_PV4 (1<<5)
3838
#define CALL_FMT_GQ (1<<6)
3939
#define CALL_FMT_GP (1<<7)
@@ -125,8 +125,7 @@ call_t;
125125
void error(const char *format, ...);
126126

127127
/*
128-
* call() - return -1 value on critical error; -2 to skip the site; or the number of non-reference
129-
* alleles on success.
128+
* call() - return -1 value on critical error; -2 to skip the site; or the number of alleles on success
130129
*/
131130
int mcall(call_t *call, bcf1_t *rec); // multiallic and rare-variant calling model
132131
int ccall(call_t *call, bcf1_t *rec); // the default consensus calling model

doc/bcftools.1

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,12 @@
22
.\" Title: bcftools
33
.\" Author: [see the "AUTHOR(S)" section]
44
.\" Generator: Asciidoctor 2.0.16
5-
.\" Date: 2023-08-23
5+
.\" Date: 2023-10-23
66
.\" Manual: \ \&
77
.\" Source: \ \&
88
.\" Language: English
99
.\"
10-
.TH "BCFTOOLS" "1" "2023-08-23" "\ \&" "\ \&"
10+
.TH "BCFTOOLS" "1" "2023-10-23" "\ \&" "\ \&"
1111
.ie \n(.g .ds Aq \(aq
1212
.el .ds Aq '
1313
.ss \n[.ss] 0
@@ -51,7 +51,7 @@ standard input (stdin) and outputs to the standard output (stdout). Several
5151
commands can thus be combined with Unix pipes.
5252
.SS "VERSION"
5353
.sp
54-
This manual page was last updated \fB2023\-08\-23 15:48 CEST\fP and refers to bcftools git version \fB1.18\-8\-g23dcc5a+\fP.
54+
This manual page was last updated \fB2023\-10\-23 09:31 CEST\fP and refers to bcftools git version \fB1.18\-19\-g6374c6a+\fP.
5555
.SS "BCF1"
5656
.sp
5757
The obsolete BCF1 format output by versions of samtools <= 0.1.19 is \fBnot\fP
@@ -1054,6 +1054,11 @@ output all alternate alleles present in the alignments even if they do not
10541054
appear in any of the genotypes
10551055
.RE
10561056
.sp
1057+
\fB\-\fP*\fB, \-\-keep\-unseen\-allele\fP
1058+
.RS 4
1059+
keep the unobserved allele <*> or <NON_REF>, useful mainly for gVCF output
1060+
.RE
1061+
.sp
10571062
\fB\-f, \-\-format\-fields\fP \fIlist\fP
10581063
.RS 4
10591064
comma\-separated list of FORMAT fields to output for each sample. Currently
@@ -2796,7 +2801,7 @@ see \fBCommon Options\fP
27962801
.sp
27972802
\fB\-w, \-\-write\fP \fILIST\fP
27982803
.RS 4
2799-
list of input files to output given as 1\-based indices. With \fB\-p\fP and no
2804+
comma\-separated list of input files to output given as 1\-based indices. With \fB\-p\fP and no
28002805
\fB\-w\fP, all files are written.
28012806
.RE
28022807
.sp
@@ -4428,6 +4433,13 @@ continue even when some samples requested via \fB\-s/\-S\fP do not exist
44284433
learn by example, see below
44294434
.RE
44304435
.sp
4436+
\fB\-F, \-\-print\-filtered\fP \fISTR\fP
4437+
.RS 4
4438+
by default, samples failing \fB\-i/\-e\fP filtering expressions are suppressed from output
4439+
when FORMAT fields are queried (for example \fI%CHROM %POS [ %GT]\fP). With \fB\-F\fP, such
4440+
fields will be still printed but instead of their actual value, \fISTR\fP will be used.
4441+
.RE
4442+
.sp
44314443
\fB\-H, \-\-print\-header\fP
44324444
.RS 4
44334445
print header

doc/bcftools.html

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,7 @@ <h2 id="_description">DESCRIPTION</h2>
5050
<div class="sect2">
5151
<h3 id="_version">VERSION</h3>
5252
<div class="paragraph">
53-
<p>This manual page was last updated <strong>2023-08-23 15:48 CEST</strong> and refers to bcftools git version <strong>1.18-8-g23dcc5a+</strong>.</p>
53+
<p>This manual page was last updated <strong>2023-10-23 09:31 CEST</strong> and refers to bcftools git version <strong>1.18-19-g6374c6a+</strong>.</p>
5454
</div>
5555
</div>
5656
<div class="sect2">
@@ -830,6 +830,10 @@ <h4 id="_inputoutput_options">Input/output options:</h4>
830830
<p>output all alternate alleles present in the alignments even if they do not
831831
appear in any of the genotypes</p>
832832
</dd>
833+
<dt class="hdlist1"><strong>-</strong>*<strong>, --keep-unseen-allele</strong></dt>
834+
<dd>
835+
<p>keep the unobserved allele &lt;*&gt; or &lt;NON_REF&gt;, useful mainly for gVCF output</p>
836+
</dd>
833837
<dt class="hdlist1"><strong>-f, --format-fields</strong> <em>list</em></dt>
834838
<dd>
835839
<p>comma-separated list of FORMAT fields to output for each sample. Currently
@@ -2487,7 +2491,7 @@ <h3 id="isec">bcftools isec [<em>OPTIONS</em>] <em>A.vcf.gz</em> <em>B.vcf.gz</
24872491
</dd>
24882492
<dt class="hdlist1"><strong>-w, --write</strong> <em>LIST</em></dt>
24892493
<dd>
2490-
<p>list of input files to output given as 1-based indices. With <strong>-p</strong> and no
2494+
<p>comma-separated list of input files to output given as 1-based indices. With <strong>-p</strong> and no
24912495
<strong>-w</strong>, all files are written.</p>
24922496
</dd>
24932497
<dt class="hdlist1"><strong>--write-index</strong></dt>
@@ -3849,6 +3853,12 @@ <h3 id="query">bcftools query [<em>OPTIONS</em>] <em>file.vcf.gz</em> [<em>file.
38493853
<dd>
38503854
<p>learn by example, see below</p>
38513855
</dd>
3856+
<dt class="hdlist1"><strong>-F, --print-filtered</strong> <em>STR</em></dt>
3857+
<dd>
3858+
<p>by default, samples failing <strong>-i/-e</strong> filtering expressions are suppressed from output
3859+
when FORMAT fields are queried (for example <em>%CHROM %POS [ %GT]</em>). With <strong>-F</strong>, such
3860+
fields will be still printed but instead of their actual value, <em>STR</em> will be used.</p>
3861+
</dd>
38523862
<dt class="hdlist1"><strong>-H, --print-header</strong></dt>
38533863
<dd>
38543864
<p>print header</p>
@@ -5317,7 +5327,7 @@ <h2 id="_copying">COPYING</h2>
53175327
</div>
53185328
<div id="footer">
53195329
<div id="footer-text">
5320-
Last updated 2023-08-23 15:48:11 +0200
5330+
Last updated 2023-10-23 09:31:31 +0200
53215331
</div>
53225332
</div>
53235333
</body>

doc/bcftools.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -624,6 +624,9 @@ demand. The original calling model can be invoked with the *-c* option.
624624
output all alternate alleles present in the alignments even if they do not
625625
appear in any of the genotypes
626626

627+
*-***, --keep-unseen-allele*::
628+
keep the unobserved allele <*> or <NON_REF>, useful mainly for gVCF output
629+
627630
*-f, --format-fields* 'list'::
628631
comma-separated list of FORMAT fields to output for each sample. Currently
629632
GQ and GP fields are supported. For convenience, the fields can be given

gvcf.c

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,6 @@ bcf1_t *gvcf_write(gvcf_t *gvcf, htsFile *fh, bcf_hdr_t *hdr, bcf1_t *rec, int i
9898
// encountered, or other conditions not met (block broken by a non-ref or DP too low).
9999
int needs_flush = can_collapse ? 0 : 1;
100100

101-
102101
// Can the record be included in a gVCF block? That is, is this a ref-only site?
103102
if ( rec && can_collapse )
104103
{

mcall.c

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1558,6 +1558,11 @@ int mcall(call_t *call, bcf1_t *rec)
15581558
call->nals_new = 0;
15591559
for (i=0; i<nals_ori; i++)
15601560
{
1561+
if ( (call->flag&CALL_KEEP_UNSEEN) && i==unseen && call->nals_new==1 )
1562+
{
1563+
call->nals_new++;
1564+
call->als_new |= 1<<i;
1565+
}
15611566
if ( i>0 && i==unseen ) continue;
15621567
if ( call->flag & CALL_KEEPALT ) call->als_new |= 1<<i;
15631568
if ( call->als_new & (1<<i) ) call->nals_new++;
@@ -1669,6 +1674,6 @@ int mcall(call_t *call, bcf1_t *rec)
16691674

16701675
bcf_update_info_int32(call->hdr, rec, "I16", NULL, 0); // remove I16 tag
16711676

1672-
return call->nals_new;
1677+
return is_variant ? call->nals_new : 1;
16731678
}
16741679

vcfcall.c

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -910,6 +910,7 @@ static void usage(args_t *args)
910910
fprintf(stderr, "\n");
911911
fprintf(stderr, "Input/output options:\n");
912912
fprintf(stderr, " -A, --keep-alts Keep all possible alternate alleles at variant sites\n");
913+
fprintf(stderr, " -*, --keep-unseen-allele Keep the unobserved allele <*> or <NON_REF>\n");
913914
fprintf(stderr, " -a, --annotate LIST Optional tags to output (lowercase allowed); '?' to list available tags\n");
914915
fprintf(stderr, " -F, --prior-freqs AN,AC Use prior allele frequencies, determined from these pre-filled tags\n");
915916
fprintf(stderr, " -G, --group-samples FILE|- Group samples by population (file with \"sample\\tgroup\") or \"-\" for single-sample calling.\n");
@@ -987,6 +988,7 @@ int main_vcfcall(int argc, char *argv[])
987988
{"targets-file",required_argument,NULL,'T'},
988989
{"threads",required_argument,NULL,9},
989990
{"keep-alts",no_argument,NULL,'A'},
991+
{"keep-unseen-allele",no_argument,NULL,'*'},
990992
{"insert-missed",no_argument,NULL,'i'},
991993
{"skip-Ns",no_argument,NULL,'N'}, // now the new default
992994
{"keep-masked-refs",no_argument,NULL,'M'},
@@ -1008,7 +1010,7 @@ int main_vcfcall(int argc, char *argv[])
10081010
};
10091011

10101012
char *tmp = NULL;
1011-
while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:ANMV:vcmp:C:n:P:f:a:ig:XYF:G:", loptions, NULL)) >= 0)
1013+
while ((c = getopt_long(argc, argv, "h?o:O:r:R:s:S:t:T:A*NMV:vcmp:C:n:P:f:a:ig:XYF:G:", loptions, NULL)) >= 0)
10121014
{
10131015
switch (c)
10141016
{
@@ -1026,6 +1028,7 @@ int main_vcfcall(int argc, char *argv[])
10261028
case 'M': args.flag &= ~CF_ACGT_ONLY; break; // keep sites where REF is N
10271029
case 'N': args.flag |= CF_ACGT_ONLY; break; // omit sites where first base in REF is N (the new default)
10281030
case 'A': args.aux.flag |= CALL_KEEPALT; break;
1031+
case '*': args.aux.flag |= CALL_KEEP_UNSEEN; break;
10291032
case 'c': args.flag |= CF_CCALL; break; // the original EM based calling method
10301033
case 'i': args.flag |= CF_INS_MISSED; break;
10311034
case 'v': args.aux.flag |= CALL_VARONLY; break;

0 commit comments

Comments
 (0)