Skip to content

Commit 9be8367

Browse files
committed
Add new -m exact merging mode. Resolves #2498
1 parent 89dfe49 commit 9be8367

File tree

8 files changed

+63
-10
lines changed

8 files changed

+63
-10
lines changed

NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,9 @@ Changes affecting specific commands:
2525
- Fixes in memory access with localized alleles, it could result in printing
2626
printing trailing garbage in localized FORMAT fields (#2520)
2727

28+
- Add new `-m exact` mode to merge only records with the exact same alleles
29+
(#2498)
30+
2831
* bcftools mpileup
2932

3033
- Remove unused experimental INFO/MIN_PL_SUM annotation

doc/bcftools.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2141,6 +2141,7 @@ For "vertical" merge take a look at *<<concat,bcftools concat>>* or *<<norm,bcft
21412141
'\*' is appended, the unobserved allele '<*>' or '<NON_REF>' will be removed at variant sites;
21422142
if two asterisks '**' are appended, the unobserved allele will be removed all sites.
21432143
----
2144+
-m exact .. require the exact same alleles
21442145
-m none .. no new multiallelics, output multiple records instead
21452146
-m snps .. allow multiallelic SNP records
21462147
-m indels .. allow multiallelic indel records

test/merge.exact.1.out

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
##fileformat=VCFv4.2
2+
##FILTER=<ID=PASS,Description="All filters passed">
3+
##contig=<ID=1>
4+
#CHROM POS ID REF ALT QUAL FILTER INFO
5+
1 100 . C T,G . . .
6+
1 200 . C T,A,G . . .
7+
1 300 . C T,T,G . . .

test/merge.exact.2.out

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
##fileformat=VCFv4.2
2+
##FILTER=<ID=PASS,Description="All filters passed">
3+
##contig=<ID=1>
4+
#CHROM POS ID REF ALT QUAL FILTER INFO
5+
1 100 . C T . . .
6+
1 100 . C T,G . . .
7+
1 200 . C T,A . . .
8+
1 200 . C T,G . . .
9+
1 300 . C T,T . . .
10+
1 300 . C T,G . . .

test/merge.exact.a.vcf

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
##fileformat=VCFv4.2
2+
##contig=<ID=1>
3+
#CHROM POS ID REF ALT QUAL FILTER INFO
4+
1 100 . C T . . .
5+
1 200 . C T,A . . .
6+
1 300 . C T,T . . .

test/merge.exact.b.vcf

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
##fileformat=VCFv4.2
2+
##contig=<ID=1>
3+
#CHROM POS ID REF ALT QUAL FILTER INFO
4+
1 100 . C T,G . . .
5+
1 200 . C T,G . . .
6+
1 300 . C T,G . . .

test/test.pl

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@
6464
run_test(\&test_vcf_isec,$opts,in=>['isec-miss.2.1','isec-miss.2.2','isec-miss.2.3'],out=>'isec-miss.2.1.out',args=>'-n +1 -r 20:100,20:140,12:55,20:140,20:100');
6565
run_test(\&test_vcf_isec,$opts,in=>['isec-miss.2.1','isec-miss.2.2','isec-miss.2.3'],out=>'isec-miss.2.1.out',args=>'-R {PATH}/isec-miss.1.regs.txt -n +1');
6666
#run_test(\&test_vcf_merge,$opts,in=>['merge.broken-phase.a','merge.broken-phase.b'],out=>'merge.broken-phase.1.out',args=>'');
67+
run_test(\&test_vcf_merge,$opts,in=>['merge.exact.a','merge.exact.b'],out=>'merge.exact.1.out',args=>'');
68+
run_test(\&test_vcf_merge,$opts,in=>['merge.exact.a','merge.exact.b'],out=>'merge.exact.2.out',args=>'-m exact');
6769
run_test(\&test_vcf_merge,$opts,in=>['merge.broken-gvcf.a','merge.broken-gvcf.b'],out=>'merge.broken-gvcf.1.out',args=>'');
6870
run_test(\&test_vcf_merge,$opts,in=>['merge.symbolic.1.a','merge.symbolic.1.b'],out=>'merge.symbolic.1.1.out',args=>'');
6971
run_test(\&test_vcf_merge,$opts,in=>['merge.multiallelics.1.a','merge.multiallelics.1.b'],out=>'merge.multiallelics.1.1.out',args=>'--merge none');

vcfmerge.c

Lines changed: 28 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
/* vcfmerge.c -- Merge multiple VCF/BCF files to create one multi-sample file.
22
3-
Copyright (C) 2012-2025 Genome Research Ltd.
3+
Copyright (C) 2012-2026 Genome Research Ltd.
44
55
Author: Petr Danecek <pd3@sanger.ac.uk>
66
@@ -2980,13 +2980,18 @@ static inline int types_compatible(args_t *args, selected_t *selected, buffer_t
29802980
return 1;
29812981
}
29822982

2983+
if ( args->collapse==BCF_SR_PAIR_EXACT )
2984+
{
2985+
if ( maux->nals != rec->n_allele ) return 0;
2986+
}
2987+
29832988
if ( args->collapse & COLLAPSE_ANY ) return 1; // can merge anything with anything
29842989

29852990
// REF and gVCF_REF with no other alleles present can be merged with anything
29862991
if ( (selected->types&ref_mask) && !(selected->types&(~ref_mask)) ) return 1;
29872992
if ( (rec_types&ref_mask) && !(rec_types&(~ref_mask)) ) return 1;
29882993

2989-
if ( args->collapse!=COLLAPSE_NONE )
2994+
if ( args->collapse!=COLLAPSE_NONE && args->collapse!=BCF_SR_PAIR_EXACT )
29902995
{
29912996
// If we are here, one the following modes must have been set: both,snps,indels,snp-ins-del
29922997
// Include the new record if
@@ -3013,19 +3018,31 @@ static inline int types_compatible(args_t *args, selected_t *selected, buffer_t
30133018
int x = selected->types;
30143019
int y = rec_types;
30153020
if ( !(x&y) ) return 0; // no matching type
3016-
if ( (x&y)!=x && (x&y)!=y ) return 0; // not a subset
3021+
if ( (x&y)!=x && (x&y)!=y ) return 0; // neither is a type subset of the other
3022+
if ( args->collapse==BCF_SR_PAIR_EXACT && ((x&y)!=x || (x&y)!=y) ) return 0;
30173023

30183024
if ( vcmp_set_ref(args->vcmp,maux->als[0],rec->d.allele[0]) < 0 ) return 0; // refs are not compatible
3019-
for (k=1; k<rec->n_allele; k++)
3025+
if ( args->collapse!=BCF_SR_PAIR_EXACT )
3026+
{
3027+
for (k=1; k<rec->n_allele; k++)
3028+
{
3029+
if ( bcf_has_variant_type(rec,k,VCF_REF) ) continue; // this must be gVCF_REF (<*> or <NON_REF>)
3030+
if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,rec->d.allele[k])>=0 ) break;
3031+
}
3032+
if ( k==rec->n_allele ) return 0; // none of the alleles in rec is present in maux
3033+
}
3034+
else
30203035
{
3021-
if ( bcf_has_variant_type(rec,k,VCF_REF) ) continue; // this must be gVCF_REF (<*> or <NON_REF>)
3022-
if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,rec->d.allele[k])>=0 ) break;
3036+
for (k=1; k<rec->n_allele; k++)
3037+
{
3038+
if ( vcmp_find_allele(args->vcmp,maux->als+1,maux->nals-1,rec->d.allele[k]) < 0 ) break;
3039+
}
3040+
if ( k!=rec->n_allele ) return 0; // at least one of the alleles in rec is not present in maux
30233041
}
3024-
if ( k==rec->n_allele ) return 0; // this record has a new allele rec->d.allele[k]
30253042

30263043
if ( selected->types&other_mask && rec_types&other_mask )
30273044
{
3028-
// both records have symbolic alleles and the alleles are the same
3045+
// both records have symbolic alleles and the alleles are the same - does END match as well?
30293046
if ( selected->end!=end ) return 0;
30303047
}
30313048

@@ -3267,7 +3284,7 @@ void stage_line(args_t *args)
32673284
if ( j>=buf->end )
32683285
{
32693286
// no matching allele found in this file
3270-
if ( args->collapse==COLLAPSE_NONE ) continue; // exact matching requested, skip
3287+
if ( args->collapse==COLLAPSE_NONE || args->collapse==BCF_SR_PAIR_EXACT ) continue; // exact matching requested, skip
32713288

32723289
// choose something compatible to create a multiallelic site given the -m criteria
32733290
for (j=buf->beg; j<buf->end; j++)
@@ -3546,7 +3563,7 @@ static void usage(void)
35463563
fprintf(stderr, " -i, --info-rules TAG:METHOD,.. Rules for merging INFO fields (method is one of sum,avg,min,max,join) or \"-\" to turn off the default [DP:sum,DP4:sum]\n");
35473564
fprintf(stderr, " -l, --file-list FILE Read file names from the file\n");
35483565
fprintf(stderr, " -L, --local-alleles INT If more than INT alt alleles are encountered, drop FMT/PL and output LAA+LPL instead; 0=unlimited [0]\n");
3549-
fprintf(stderr, " -m, --merge STRING[*|**] Allow multiallelic records for snps,indels,both,snp-ins-del,all,none,id,*,**; see man page for details [both]\n");
3566+
fprintf(stderr, " -m, --merge STRING[*|**] Allow multiallelic records for snps,indels,both,snp-ins-del,all,exact,none,id,*,**; see man page for details [both]\n");
35503567
fprintf(stderr, " -M, --missing-rules TAG:METHOD Rules for replacing missing values in numeric vectors (.,0,max) when unknown allele <*> is not present [.]\n");
35513568
fprintf(stderr, " --no-index Merge unindexed files, the same chromosomal order is required and -r/-R are not allowed\n");
35523569
fprintf(stderr, " --no-version Do not append version and command line to the header\n");
@@ -3669,6 +3686,7 @@ int main_vcfmerge(int argc, char *argv[])
36693686
else if ( !strncmp(optarg,"all",len) ) args->collapse |= COLLAPSE_ANY;
36703687
else if ( !strncmp(optarg,"both",len) ) args->collapse |= COLLAPSE_BOTH;
36713688
else if ( !strncmp(optarg,"none",len) ) args->collapse = COLLAPSE_NONE;
3689+
else if ( !strncmp(optarg,"exact",len) ) args->collapse = BCF_SR_PAIR_EXACT;
36723690
else error("The -m type \"%s\" is not recognised.\n", optarg);
36733691
break;
36743692
}

0 commit comments

Comments
 (0)