Skip to content

Commit 6c66ae3

Browse files
pd3jkbonfield
authored andcommitted
New -m snp-ins-del switch to merge SNVs, insertions and deletions separately
Resolves #1704 and depends on samtools/htslib#1467 to be merged
1 parent 633970b commit 6c66ae3

File tree

11 files changed

+107
-27
lines changed

11 files changed

+107
-27
lines changed

NEWS

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,10 @@ Changes affecting specific commands:
2424
requested, which would result in not transferring the annotation from a tab-delimited
2525
file (#1733)
2626

27+
* bcftools merge
28+
29+
- New `-m snp-ins-del` switch to merge SNVs, insertions and deletions separately (#1704)
30+
2731
* bcftools mpileup
2832

2933
- New NMBZ annotation for Mann-Whitney U-z test on number of mismatches within

doc/bcftools.txt

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1902,15 +1902,16 @@ For "vertical" merge take a look at *<<concat,bcftools concat>>* or *<<norm,bcft
19021902
maximum number of alternate alleles that can be included in the PL tag. The default value
19031903
is 0 which disables the feature and outputs values for all alternate alleles.
19041904

1905-
*-m, --merge* 'snps'|'indels'|'both'|'all'|'none'|'id'::
1905+
*-m, --merge* 'snps'|'indels'|'both'|'snp-ins-del'|'all'|'none'|'id'::
19061906
The option controls what types of multiallelic records can be created:
19071907
----
1908-
-m none .. no new multiallelics, output multiple records instead
1909-
-m snps .. allow multiallelic SNP records
1910-
-m indels .. allow multiallelic indel records
1911-
-m both .. both SNP and indel records can be multiallelic
1912-
-m all .. SNP records can be merged with indel records
1913-
-m id .. merge by ID
1908+
-m none .. no new multiallelics, output multiple records instead
1909+
-m snps .. allow multiallelic SNP records
1910+
-m indels .. allow multiallelic indel records
1911+
-m both .. both SNP and indel records can be multiallelic
1912+
-m all .. SNP records can be merged with indel records
1913+
-m snp-ins-del .. allow multiallelic SNVs, insertions, deletions, but don't mix them
1914+
-m id .. merge by ID
19141915
----
19151916

19161917
*--no-index*::

plugins/smpl-stats.c

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,19 @@
11
/* The MIT License
22
3-
Copyright (c) 2018-2021 Genome Research Ltd.
3+
Copyright (c) 2018-2022 Genome Research Ltd.
44
55
Author: Petr Danecek <[email protected]>
6-
6+
77
Permission is hereby granted, free of charge, to any person obtaining a copy
88
of this software and associated documentation files (the "Software"), to deal
99
in the Software without restriction, including without limitation the rights
1010
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
1111
copies of the Software, and to permit persons to whom the Software is
1212
furnished to do so, subject to the following conditions:
13-
13+
1414
The above copyright notice and this permission notice shall be included in
1515
all copies or substantial portions of the Software.
16-
16+
1717
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
1818
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
1919
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
@@ -92,7 +92,7 @@ const char *about(void)
9292

9393
static const char *usage_text(void)
9494
{
95-
return
95+
return
9696
"\n"
9797
"About: Calculates basic per-sample stats. Use curly brackets to scan a range of values simultaneously\n"
9898
"Usage: bcftools +smpl-stats [Plugin Options]\n"
@@ -147,7 +147,7 @@ static void parse_filters(args_t *args)
147147
}
148148
if ( !expanded ) break;
149149
}
150-
150+
151151
fprintf(stderr,"Collecting data for %d filtering expressions\n", args->nflt_str);
152152
}
153153

@@ -184,9 +184,9 @@ static void init_data(args_t *args)
184184
// replace tab's with spaces so that the output stays parsable
185185
char *tmp = args->filters[i].expr;
186186
while ( *tmp )
187-
{
188-
if ( *tmp=='\t' ) *tmp = ' ';
189-
tmp++;
187+
{
188+
if ( *tmp=='\t' ) *tmp = ' ';
189+
tmp++;
190190
}
191191
}
192192
}
@@ -339,7 +339,7 @@ static void process_record(args_t *args, bcf1_t *rec, flt_stats_t *flt)
339339
int ngt = bcf_get_genotypes(args->hdr, rec, &args->gt_arr, &args->mgt_arr);
340340
if ( ngt<0 ) return;
341341
int ngt1 = ngt / rec->n_sample;
342-
342+
343343

344344
// For ts/tv: numeric code of the reference allele, -1 for insertions
345345
int ref = !rec->d.allele[0][1] ? bcf_acgt2int(*rec->d.allele[0]) : -1;
@@ -381,7 +381,7 @@ static void process_record(args_t *args, bcf1_t *rec, flt_stats_t *flt)
381381
has_nonref = 1;
382382
}
383383
if ( !has_nonref ) continue; // only ref or * in this genotype
384-
384+
385385
stats->nnon_ref++;
386386

387387
// Calculate ts/tv, count SNPs, indels. It does the right thing and handles also HetAA genotypes
@@ -395,8 +395,7 @@ static void process_record(args_t *args, bcf1_t *rec, flt_stats_t *flt)
395395

396396
if ( args->ac[als[j]]==1 ) { stats->nsingleton++; site_singleton = 1; }
397397

398-
int var_type = bcf_get_variant_type(rec, als[j]);
399-
if ( var_type==VCF_SNP || var_type==VCF_MNP )
398+
if ( bcf_has_variant_type(rec, als[j], VCF_SNP|VCF_MNP, subset) )
400399
{
401400
int k = 0;
402401
while ( rec->d.allele[0][k] && rec->d.allele[als[j]][k] )
@@ -411,7 +410,7 @@ static void process_record(args_t *args, bcf1_t *rec, flt_stats_t *flt)
411410
k++;
412411
}
413412
}
414-
else if ( var_type==VCF_INDEL ) has_indel = 1;
413+
else if ( bcf_has_variant_type(rec, als[j], VCF_INDEL, exact) ) has_indel = 1;
415414
}
416415
if ( has_ts ) { stats->nts++; site_has_ts = 1; }
417416
if ( has_tv ) { stats->ntv++; site_has_tv = 1; }
@@ -446,7 +445,7 @@ int run(int argc, char **argv)
446445
int c, i;
447446
while ((c = getopt_long(argc, argv, "o:s:i:e:r:R:t:T:",loptions,NULL)) >= 0)
448447
{
449-
switch (c)
448+
switch (c)
450449
{
451450
case 'e':
452451
if ( args->filter_str ) error("Error: only one -i or -e expression can be given, and they cannot be combined\n");

test/merge.10.1.out

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
##fileformat=VCFv4.3
2+
##FILTER=<ID=PASS,Description="All filters passed">
3+
##contig=<ID=1,assembly=b37,length=249250621>
4+
##reference=file:ref.fa
5+
#CHROM POS ID REF ALT QUAL FILTER INFO
6+
1 3000000 . C CCG . . .
7+
1 3000000 . C CAA . . .
8+
1 3000150 . CC C . . .
9+
1 3000150 . C CTT . . .
10+
1 3000152 . C A . . .
11+
1 3000152 . C CA . . .
12+
1 3000154 . C A . . .
13+
1 3000154 . C T . . .

test/merge.10.2.out

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
##fileformat=VCFv4.3
2+
##FILTER=<ID=PASS,Description="All filters passed">
3+
##contig=<ID=1,assembly=b37,length=249250621>
4+
##reference=file:ref.fa
5+
#CHROM POS ID REF ALT QUAL FILTER INFO
6+
1 3000000 . C CCG,CAA . . .
7+
1 3000150 . CC C,CTTC . . .
8+
1 3000152 . C A . . .
9+
1 3000152 . C CA . . .
10+
1 3000154 . C A,T . . .

test/merge.10.3.out

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
##fileformat=VCFv4.3
2+
##FILTER=<ID=PASS,Description="All filters passed">
3+
##contig=<ID=1,assembly=b37,length=249250621>
4+
##reference=file:ref.fa
5+
#CHROM POS ID REF ALT QUAL FILTER INFO
6+
1 3000000 . C CCG,CAA . . .
7+
1 3000150 . CC C,CTTC . . .
8+
1 3000152 . C A . . .
9+
1 3000152 . C CA . . .
10+
1 3000154 . C A . . .
11+
1 3000154 . C T . . .

test/merge.10.a.vcf

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
##fileformat=VCFv4.3
2+
##contig=<ID=1,assembly=b37,length=249250621>
3+
##reference=file:ref.fa
4+
#CHROM POS ID REF ALT QUAL FILTER INFO
5+
1 3000000 . C CCG . . .
6+
1 3000150 . CC C . . .
7+
1 3000152 . C A . . .
8+
1 3000154 . C A . . .

test/merge.10.b.vcf

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
##fileformat=VCFv4.3
2+
##contig=<ID=1,assembly=b37,length=249250621>
3+
##reference=file:ref.fa
4+
#CHROM POS ID REF ALT QUAL FILTER INFO
5+
1 3000000 . C CAA . . .
6+
1 3000150 . C CTT . . .
7+
1 3000152 . C CA . . .
8+
1 3000154 . C T . . .

test/test.pl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,9 @@
9797
run_test(\&test_vcf_merge,$opts,in=>['merge.8.a','merge.8.b'],out=>'merge.8.out',args=>'-i AN:sum,AC:sum');
9898
run_test(\&test_vcf_merge,$opts,in=>['merge.9.a','merge.9.b'],out=>'merge.9.1.out',args=>'');
9999
run_test(\&test_vcf_merge,$opts,in=>['merge.9.a','merge.9.b'],out=>'merge.9.2.out',args=>'-i AN:sum,AC:sum');
100+
run_test(\&test_vcf_merge,$opts,in=>['merge.10.a','merge.10.b'],out=>'merge.10.1.out',args=>'-m none');
101+
run_test(\&test_vcf_merge,$opts,in=>['merge.10.a','merge.10.b'],out=>'merge.10.2.out',args=>'-m both');
102+
run_test(\&test_vcf_merge,$opts,in=>['merge.10.a','merge.10.b'],out=>'merge.10.3.out',args=>'-m snp-ins-del');
100103
# run_test(\&test_vcf_merge_big,$opts,in=>'merge_big.1',out=>'merge_big.1.1',nsmpl=>79000,nfiles=>79,nalts=>486,args=>''); # commented out for speed
101104
run_test(\&test_vcf_query,$opts,in=>'query.string',out=>'query.string.1.out',args=>q[-f '%CHROM\\t%POS\\t%CLNREVSTAT\\n' -i'CLNREVSTAT="criteria_provided,_conflicting_interpretations"']);
102105
run_test(\&test_vcf_query,$opts,in=>'query.string',out=>'query.string.1.out',args=>q[-f '%CHROM\\t%POS\\t%CLNREVSTAT\\n' -i'CLNREVSTAT="criteria_provided" || CLNREVSTAT="_conflicting_interpretations"']);

vcfmerge.c

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,8 @@ THE SOFTWARE. */
4343

4444
#define DBG 0
4545

46+
#define COLLAPSE_SNP_INS_DEL (1<<10)
47+
4648
#include <htslib/khash.h>
4749
KHASH_MAP_INIT_STR(strdict, int)
4850
typedef khash_t(strdict) strdict_t;
@@ -2517,7 +2519,16 @@ static inline int is_gvcf_block(bcf1_t *line)
25172519
}
25182520
return 0;
25192521
}
2520-
static const int snp_mask = (VCF_SNP<<2)|(VCF_MNP<<2), indel_mask = VCF_INDEL<<2, ref_mask = 2;
2522+
2523+
// Lines can come with any combination of variant types. We use a subset of types defined in vcf.h
2524+
// but shift by two bits to account for VCF_REF defined as 0 (design flaw in vcf.h, my fault) and
2525+
// to accommodate for VCF_GVCF_REF defined below
2526+
static const int
2527+
snp_mask = (VCF_SNP<<2)|(VCF_MNP<<2),
2528+
indel_mask = VCF_INDEL<<2,
2529+
ins_mask = VCF_INS<<2,
2530+
del_mask = VCF_DEL<<2,
2531+
ref_mask = 2;
25212532

25222533
/*
25232534
Check incoming lines for new gVCF blocks, set pointer to the current source
@@ -2743,6 +2754,11 @@ int can_merge(args_t *args)
27432754
else
27442755
{
27452756
int var_type = bcf_get_variant_types(line);
2757+
if ( args->collapse==COLLAPSE_SNP_INS_DEL )
2758+
{
2759+
// need to distinguish between ins and del so strip the VCF_INDEL flag
2760+
var_type &= ~VCF_INDEL;
2761+
}
27462762
maux->var_types |= var_type ? var_type<<2 : 2;
27472763

27482764
// for the `-m none -g` mode
@@ -2812,7 +2828,7 @@ int can_merge(args_t *args)
28122828
// - SNPs+SNPs+MNPs+REF if -m both,snps
28132829
// - indels+indels+REF if -m both,indels, REF only if SNPs are not present
28142830
// - SNPs come first
2815-
if ( line_type & indel_mask )
2831+
if ( line_type & (indel_mask|ins_mask|del_mask) )
28162832
{
28172833
if ( !(line_type&snp_mask) && maux->var_types&snp_mask ) continue; // SNPs come first
28182834
if ( args->do_gvcf && maux->var_types&ref_mask ) continue; // never merge indels with gVCF blocks
@@ -2898,16 +2914,22 @@ void stage_line(args_t *args)
28982914
int line_type = bcf_get_variant_types(buf->lines[j]);
28992915
if ( maux->var_types&snp_mask && line_type&VCF_SNP && (args->collapse&COLLAPSE_SNPS) ) break;
29002916
if ( maux->var_types&indel_mask && line_type&VCF_INDEL && (args->collapse&COLLAPSE_INDELS) ) break;
2917+
if ( maux->var_types&ins_mask && line_type&VCF_INS && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
2918+
if ( maux->var_types&del_mask && line_type&VCF_DEL && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
29012919
if ( line_type==VCF_REF )
29022920
{
29032921
if ( maux->var_types&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break;
29042922
if ( maux->var_types&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break;
2923+
if ( maux->var_types&ins_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
2924+
if ( maux->var_types&del_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
29052925
if ( maux->var_types&ref_mask ) break;
29062926
}
29072927
else if ( maux->var_types&ref_mask )
29082928
{
29092929
if ( line_type&snp_mask && (args->collapse&COLLAPSE_SNPS) ) break;
29102930
if ( line_type&indel_mask && (args->collapse&COLLAPSE_INDELS) ) break;
2931+
if ( line_type&ins_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
2932+
if ( line_type&del_mask && (args->collapse&COLLAPSE_SNP_INS_DEL) ) break;
29112933
}
29122934
}
29132935
}
@@ -3125,7 +3147,7 @@ static void usage(void)
31253147
fprintf(stderr, " -i, --info-rules TAG:METHOD,.. Rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n");
31263148
fprintf(stderr, " -l, --file-list FILE Read file names from the file\n");
31273149
fprintf(stderr, " -L, --local-alleles INT EXPERIMENTAL: if more than <int> ALT alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n");
3128-
fprintf(stderr, " -m, --merge STRING Allow multiallelic records for <snps|indels|both|all|none|id>, see man page for details [both]\n");
3150+
fprintf(stderr, " -m, --merge STRING Allow multiallelic records for <snps|indels|both|snp-ins-del|all|none|id>, see man page for details [both]\n");
31293151
fprintf(stderr, " --no-index Merge unindexed files, the same chromosomal order is required and -r/-R are not allowed\n");
31303152
fprintf(stderr, " --no-version Do not append version and command line to the header\n");
31313153
fprintf(stderr, " -o, --output FILE Write output to a file [standard output]\n");
@@ -3229,6 +3251,7 @@ int main_vcfmerge(int argc, char *argv[])
32293251
else if ( !strcmp(optarg,"any") ) args->collapse |= COLLAPSE_ANY;
32303252
else if ( !strcmp(optarg,"all") ) args->collapse |= COLLAPSE_ANY;
32313253
else if ( !strcmp(optarg,"none") ) args->collapse = COLLAPSE_NONE;
3254+
else if ( !strcmp(optarg,"snp-ins-del") ) args->collapse = COLLAPSE_SNP_INS_DEL;
32323255
else if ( !strcmp(optarg,"id") ) { args->collapse = COLLAPSE_NONE; args->merge_by_id = 1; }
32333256
else error("The -m type \"%s\" is not recognised.\n", optarg);
32343257
break;

0 commit comments

Comments
 (0)