Skip to content

Commit f3c11d8

Browse files
Fix bug in Tabix handling
1 parent a2aefac commit f3c11d8

File tree

7 files changed

+65
-5
lines changed

7 files changed

+65
-5
lines changed

bio2zarr/vcf_utils.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def __post_init__(self):
6363
assert self.start > 0
6464
if self.end is not None:
6565
self.end = int(self.end)
66-
assert self.end > 0
66+
assert self.end > self.start
6767

6868
def __str__(self):
6969
s = f"{self.contig}"
@@ -641,15 +641,17 @@ def partition_into_regions(
641641
next_contig = self.sequence_names[region_contigs[i + 1]]
642642
next_start = region_starts[i + 1]
643643
end = next_start - 1 # subtract one since positions are inclusive
644+
# print("next_start", next_contig, next_start)
644645
if next_contig == contig: # contig doesn't change
645646
regions.append(Region(contig, start, end))
646647
else:
647648
# contig changes, so need two regions (or possibly more if any
648649
# sequences were skipped)
649650
regions.append(Region(contig, start))
650651
for ri in range(region_contigs[i] + 1, region_contigs[i + 1]):
651-
regions.append(self.sequence_names[ri])
652-
regions.append(Region(next_contig, 1, end))
652+
regions.append(Region(self.sequence_names[ri]))
653+
if end >= 1:
654+
regions.append(Region(next_contig, 1, end))
653655

654656
# Add any sequences at the end that were not skipped
655657
for ri in range(region_contigs[-1] + 1, len(self.sequence_names)):

tests/data/vcf/multi_contig.vcf.gz

10 KB
Binary file not shown.
176 Bytes
Binary file not shown.

tests/test_vcf_utils.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
("1kg_2020_chrM.bcf.csi", {"chrM": 23}),
3333
("1kg_2020_chr20_annotations.bcf.csi", {"chr20": 21}),
3434
("NA12878.prod.chr20snippet.g.vcf.gz.tbi", {"20": 301778}),
35+
("multi_contig.vcf.gz.tbi", {str(j): 933 for j in range(5)}),
3536
],
3637
)
3738
def test_index_record_count(index_file, expected):
@@ -53,6 +54,7 @@ def test_index_record_count(index_file, expected):
5354
("1kg_2020_chrM.bcf.csi", ["chrM:1-"]),
5455
("1kg_2020_chr20_annotations.bcf.csi", ["chr20:49153-"]),
5556
("NA12878.prod.chr20snippet.g.vcf.gz.tbi", ["20:1-"]),
57+
("multi_contig.vcf.gz.tbi", ["0:1-"] + [str(j) for j in range(1, 5)]),
5658
],
5759
)
5860
def test_partition_into_one_part(index_file, expected):
@@ -63,6 +65,18 @@ def test_partition_into_one_part(index_file, expected):
6365
assert [str(r) for r in regions] == expected
6466

6567

68+
def test_tabix_multi_chrom_bug():
69+
index_file = "multi_contig.vcf.gz.tbi"
70+
vcf_path = data_path / (".".join(list(index_file.split("."))[:-1]))
71+
indexed_vcf = vcf_utils.IndexedVcf(vcf_path, data_path / index_file)
72+
regions = indexed_vcf.partition_into_regions(num_parts=10)
73+
# An earlier version of the code returned this, i.e. with a duplicate
74+
# for 4 with end coord of 0
75+
# ["0:1-", "1", "2", "3", "4:1-0", "4:1-"]
76+
expected = ["0:1-", "1", "2", "3", "4:1-"]
77+
assert [str(r) for r in regions] == expected
78+
79+
6680
class TestCsiIndex:
6781
@pytest.mark.parametrize(
6882
"filename",

validation-data/Makefile

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,21 @@
11
all: 1kg_2020_chr20.bcf.csi \
22
1kg_2020_chr20.vcf.gz.tbi \
3+
1kg_2020_chr20.vcf.gz.csi \
34
1kg_2020_chr20_annotations.vcf.gz.tbi \
5+
1kg_2020_chr20_annotations.vcf.gz.csi \
46
1kg_2020_chr20_annotations.bcf.csi\
57
1kg_2020_others.bcf.csi \
68
1kg_2020_others.vcf.gz.tbi \
9+
1kg_2020_others.vcf.gz.csi \
10+
1kg_2020_freebayes.vcf.gz.tbi \
11+
1kg_2020_freebayes.vcf.gz.csi \
12+
1kg_2020_freebayes.bcf.csi \
713
1kg_p3_all_chr1.bcf.csi \
814
1kg_p3_all_chr1.vcf.gz.tbi\
15+
1kg_p3_all_chr1.vcf.gz.csi\
916
1kg_p1_all_chr6.bcf.csi\
10-
1kg_p1_all_chr6.vcf.gz.tbi
17+
1kg_p1_all_chr6.vcf.gz.tbi \
18+
1kg_p1_all_chr6.vcf.gz.csi
1119

1220
.PRECIOUS: %.bcf
1321
.PRECIOUS: %.vcf.gz
@@ -17,6 +25,7 @@ all: 1kg_2020_chr20.bcf.csi \
1725
1KG_2020_GENOTYPES_URL="${1KG_2020_BASE}chr20.recalibrated_variants.vcf.gz"
1826
1KG_2020_ANNOTATIONS_URL="${1KG_2020_BASE}chr20.recalibrated_variants.annotated.vcf.gz"
1927
1KG_2020_OTHERS_URL="${1KG_2020_BASE}others.recalibrated_variants.vcf.gz"
28+
1KG_2020_FREEBAYES="http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20200515_EBI_Freebayescalls/ALL.chrX.freebayes.20200518.snps_indels.NYhc.GRCh38.vcf.gz"
2029

2130
# 1000 genomes phase 3
2231
1KG_P3_ALL_URL=http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr1.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz
@@ -29,7 +38,7 @@ all: 1kg_2020_chr20.bcf.csi \
2938
%.vcf.gz.tbi: %.vcf.gz
3039
tabix $<
3140

32-
%.bcf.csi: %.bcf
41+
%.csi: %
3342
bcftools index $<
3443

3544
%.bcf: %.vcf.gz
@@ -41,6 +50,9 @@ all: 1kg_2020_chr20.bcf.csi \
4150
1kg_2020_others.vcf.gz:
4251
bcftools view ${1KG_2020_OTHERS_URL} | head -n 10000 | bcftools view -O z -o $@
4352

53+
1kg_2020_freebayes.vcf.gz:
54+
bcftools view ${1KG_2020_FREEBAYES} | head -n 10000 | bcftools view -O z -o $@
55+
4456
1kg_2020_chr20_annotations.vcf.gz:
4557
bcftools view ${1KG_2020_ANNOTATIONS_URL} | head -n 100000 | bcftools view -O z -o $@
4658

validation.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@
1212
@click.argument("vcfs", nargs=-1)
1313
@click.option("-p", "--worker-processes", type=int, default=1)
1414
@click.option("-f", "--force", is_flag=True, default=False)
15+
# TODO add options for verbose and to force the use of a given
16+
# index file
1517
def cli(vcfs, worker_processes, force):
1618
data_path = pathlib.Path("validation-data")
1719
if len(vcfs) == 0:

vcf_generator.py

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
# Development script to automate generating multi-contig
2+
# VCFs of various lenghts for testing the indexing code.
3+
# TODO should move to a "scripts" directory or something
4+
import sys
5+
import click
6+
7+
8+
def write_header(num_contigs):
9+
click.echo("##fileformat=VCFv4.2")
10+
click.echo(f"##source={' '.join(sys.argv)}")
11+
click.echo('##FILTER=<ID=PASS,Description="All filters passed">')
12+
for contig in range(num_contigs):
13+
click.echo(f"##contig=<ID={contig}>")
14+
header = "\t".join(["#CHROM", "POS", "ID", "REF", "ALT", "QUAL", "FILTER", "INFO"])
15+
click.echo(header)
16+
17+
18+
@click.command
19+
@click.argument("contigs", type=int)
20+
@click.argument("rows-per-contig", type=int)
21+
def cli(contigs, rows_per_contig):
22+
write_header(contigs)
23+
for j in range(contigs):
24+
for k in range(rows_per_contig):
25+
pos = str(k + 1)
26+
click.echo("\t".join([str(j), pos, "A", "T", ".", ".", ".", "."]))
27+
28+
29+
if __name__ == "__main__":
30+
cli()

0 commit comments

Comments
 (0)