Skip to content

Commit 7b0c9ae

Browse files
committed
Add new -g, --gff-annot option
When present, the program will right-align indels in forward transcripts to follow HGVS recommendations (3'rule) https://varnomen.hgvs.org/recommendations/DNA/variant/deletion/ Resolves #1929
1 parent 62b4e50 commit 7b0c9ae

File tree

11 files changed

+728
-70
lines changed

11 files changed

+728
-70
lines changed

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,7 @@ vcfgtcheck.o: vcfgtcheck.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htsli
246246
vcfindex.o: vcfindex.c $(htslib_vcf_h) $(htslib_tbx_h) $(htslib_kstring_h) $(htslib_bgzf_h) $(bcftools_h)
247247
vcfisec.o: vcfisec.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_hts_os_h) $(bcftools_h) $(filter_h)
248248
vcfmerge.o: vcfmerge.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_vcfutils_h) $(htslib_faidx_h) regidx.h $(bcftools_h) vcmp.h $(htslib_khash_h)
249-
vcfnorm.o: vcfnorm.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_faidx_h) $(htslib_khash_str2int_h) $(bcftools_h) rbuf.h abuf.h
249+
vcfnorm.o: vcfnorm.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_faidx_h) $(htslib_khash_str2int_h) $(bcftools_h) rbuf.h abuf.h gff.h
250250
vcfquery.o: vcfquery.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_khash_str2int_h) $(htslib_vcfutils_h) $(bcftools_h) $(filter_h) $(convert_h) $(smpl_ilist_h)
251251
vcfroh.o: vcfroh.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstring_h) $(htslib_kseq_h) $(htslib_bgzf_h) $(bcftools_h) HMM.h $(smpl_ilist_h) $(filter_h)
252252
vcfcnv.o: vcfcnv.c $(htslib_vcf_h) $(htslib_synced_bcf_reader_h) $(htslib_kstring_h) $(htslib_kfunc_h) $(htslib_khash_str2int_h) $(bcftools_h) HMM.h rbuf.h

NEWS

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,9 @@ Changes affecting specific commands:
8080

8181
- Symbolic <DEL.*> alleles are now normalized too (#1919)
8282

83+
- New `-g, --gff-annot` option to right-align indels in forward transcripts to follow
84+
HGVS 3'rule (#1929)
85+
8386
* bcftools query
8487

8588
- Force newline character in formatting expression when not given explicitly

doc/bcftools.txt

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2451,6 +2451,11 @@ the *<<fasta_ref,--fasta-ref>>* option is supplied.
24512451
try to proceed with *-m-* even if malformed tags with incorrect number of fields
24522452
are encountered, discarding such tags. (Experimental, use at your own risk.)
24532453

2454+
*-g, --gff-annot* 'FILE'::
2455+
when a GFF file is provided, follow HGVS 3'rule and right-align variants in transcripts on the forward
2456+
strand. In case of overlapping transcripts, the default mode is to left-align the variant. For a
2457+
description of the supported GFF3 file format see *<<csq,bcftools csq>>*.
2458+
24542459
*--keep-sum* 'TAG'[,...]::
24552460
keep vector sum constant when splitting multiallelic sites. Only AD tag
24562461
is currently supported. See also https://github.com/samtools/bcftools/issues/360

gff.c

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1074,15 +1074,18 @@ gff_t *gff_init(const char *fname)
10741074
}
10751075
void gff_destroy(gff_t *gff)
10761076
{
1077-
khint_t k; //,i,j;
1078-
for (k=0; k<kh_end(gff->init.gid2gene); k++)
1077+
khint_t k;
1078+
if ( gff->init.gid2gene )
10791079
{
1080-
if ( !kh_exist(gff->init.gid2gene, k) ) continue;
1081-
gf_gene_t *gene = (gf_gene_t*) kh_val(gff->init.gid2gene, k);
1082-
free(gene->name);
1083-
free(gene);
1080+
for (k=0; k<kh_end(gff->init.gid2gene); k++)
1081+
{
1082+
if ( !kh_exist(gff->init.gid2gene, k) ) continue;
1083+
gf_gene_t *gene = (gf_gene_t*) kh_val(gff->init.gid2gene, k);
1084+
free(gene->name);
1085+
free(gene);
1086+
}
1087+
kh_destroy(int2gene,gff->init.gid2gene);
10841088
}
1085-
kh_destroy(int2gene,gff->init.gid2gene);
10861089

10871090
regidx_destroy(gff->idx_cds);
10881091
regidx_destroy(gff->idx_utr);

test/norm.right-align.1.out

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
##fileformat=VCFv4.2
2+
##FILTER=<ID=PASS,Description="All filters passed">
3+
##contig=<ID=7,length=249250621>
4+
##INFO=<ID=type,Number=.,Type=String,Description="">
5+
##INFO=<ID=EXP,Number=1,Type=String,Description="Expected consequence">
6+
##INFO=<ID=EXPL,Number=1,Type=String,Description="Expected consequence with bt/csq -l">
7+
##INFO=<ID=ORI,Number=1,Type=String,Description="Original variant. Format: CHR|POS|REF|ALT|USED_ALT_IDX">
8+
#CHROM POS ID REF ALT QUAL FILTER INFO
9+
7 897 . GGAATTAAGA G . . .
10+
7 910 . G C . . .

test/norm.right-align.2.out

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
##fileformat=VCFv4.2
2+
##FILTER=<ID=PASS,Description="All filters passed">
3+
##contig=<ID=7,length=249250621>
4+
##INFO=<ID=type,Number=.,Type=String,Description="">
5+
##INFO=<ID=EXP,Number=1,Type=String,Description="Expected consequence">
6+
##INFO=<ID=EXPL,Number=1,Type=String,Description="Expected consequence with bt/csq -l">
7+
##INFO=<ID=ORI,Number=1,Type=String,Description="Original variant. Format: CHR|POS|REF|ALT|USED_ALT_IDX">
8+
#CHROM POS ID REF ALT QUAL FILTER INFO
9+
7 900 . ATTAAGAGAA A . . ORI=7|897|GGAATTAAGA|G
10+
7 910 . G C . . .

test/norm.right-align.fa

Lines changed: 488 additions & 0 deletions
Large diffs are not rendered by default.

test/norm.right-align.gff

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
7 ensembl_havana gene 100 29201 . + . ID=gene:ENSG00000146648;Name=EGFR;biotype=protein_coding;description=epidermal growth factor receptor [Source:HGNC Symbol%3BAcc:3236];gene_id=ENSG00000146648;logic_name=ensembl_havana_gene;version=11
2+
7 ensembl_havana mRNA 100 29201 . + . ID=transcript:ENST00000455089;Parent=gene:ENSG00000146648;Name=EGFR-004;biotype=protein_coding;havana_transcript=OTTHUMT00000343056;havana_version=1;tag=basic;transcript_id=ENST00000455089;version=1
3+
7 havana gene 5875 15059 . - . ID=gene:ENSG00000224057;Name=EGFR-AS1;biotype=antisense;description=EGFR antisense RNA 1 [Source:HGNC Symbol%3BAcc:40207];gene_id=ENSG00000224057;logic_name=havana;version=1
4+
7 havana transcript 5875 15059 . - . ID=transcript:ENST00000442411;Parent=gene:ENSG00000224057;Name=EGFR-AS1-001;biotype=antisense;havana_transcript=OTTHUMT00000343091;havana_version=1;tag=basic;transcript_id=ENST00000442411;version=1

test/norm.right-align.vcf

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
##fileformat=VCFv4.2
2+
##contig=<ID=7,length=249250621>
3+
##INFO=<ID=type,Number=.,Type=String,Description="">
4+
##INFO=<ID=EXP,Number=1,Type=String,Description="Expected consequence">
5+
##INFO=<ID=EXPL,Number=1,Type=String,Description="Expected consequence with bt/csq -l">
6+
#CHROM POS ID REF ALT QUAL FILTER INFO
7+
7 897 . GGAATTAAGA G . . .
8+
7 910 . G C . . .

test/test.pl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,8 @@
286286
run_test(\&test_vcf_norm,$opts,in=>'norm.phased-split',out=>'norm.phased-split.1.out',args=>'-m -any');
287287
run_test(\&test_vcf_norm,$opts,in=>'norm.phased-join',out=>'norm.phased-join.1.out',args=>'-m +any');
288288
run_test(\&test_vcf_norm,$opts,in=>'norm.symbolic',fai=>'norm.symbolic',out=>'norm.symbolic.1.out',args=>'--old-rec-tag ORI');
289+
run_test(\&test_vcf_norm,$opts,in=>'norm.right-align',fai=>'norm.right-align',out=>'norm.right-align.1.out',args=>'--old-rec-tag ORI');
290+
run_test(\&test_vcf_norm,$opts,in=>'norm.right-align',fai=>'norm.right-align',out=>'norm.right-align.2.out',args=>'--old-rec-tag ORI -g {PATH}/norm.right-align.gff');
289291
run_test(\&test_vcf_view,$opts,in=>'view',out=>'view.1.out',args=>'-aUc1 -C1 -s NA00002 -v snps',reg=>'');
290292
run_test(\&test_vcf_view,$opts,in=>'view',out=>'view.2.out',args=>'-f PASS -Xks NA00003',reg=>'-r20,Y');
291293
run_test(\&test_vcf_view,$opts,in=>'view',out=>'view.3.out',args=>'-xs NA00003',reg=>'');
@@ -1409,6 +1411,7 @@ sub test_vcf_norm
14091411
my ($opts,%args) = @_;
14101412
bgzip_tabix_vcf($opts,$args{in});
14111413
my $params = '';
1414+
$args{args} =~ s/{PATH}/$$opts{path}/g;
14121415
if ( exists($args{args}) ) { $params .= " $args{args}"; }
14131416
if ( exists($args{fai} ) ) { $params .= " -f $$opts{path}/$args{fai}.fa"; }
14141417
test_cmd($opts,%args,cmd=>"$$opts{bin}/bcftools norm --no-version $params $$opts{tmp}/$args{in}.vcf.gz",exp_fix=>1);

0 commit comments

Comments
 (0)