Commit c7f8831

Merge pull request #2 from jeromekelleher/validation-init
Validation
2 parents 2bd4c3c + e473d2a commit c7f8831

10 files changed: +196 -31 lines changed

bio2zarr/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+
+from .vcf import explode as explode_vcf
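
This re-export makes the explode step callable from the package namespace, which is how the new validation.py script drives it. A minimal usage sketch, assuming hypothetical input and output paths; the keyword arguments mirror the call made in validation.py:

    # Sketch only: paths are hypothetical.
    import bio2zarr as bz

    bz.explode_vcf(
        ["tests/data/vcf/sample.vcf.gz"],  # input VCF/BCF files
        "sample.exploded",                 # output columnarised directory
        worker_processes=1,
        show_progress=False,
    )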

bio2zarr/vcf.py

Lines changed: 51 additions & 14 deletions
@@ -1009,7 +1009,11 @@ def fixed_field_spec(
                 shape.append(n)
                 chunks.append(chunk_width),
                 dimensions.append("samples")
-            if field.summary.max_number > 1:
+            if field.category == "FORMAT" and field.vcf_type == "String":
+                # FIXME not handling format string values very well right now
+                # as the max_number value is just the number of samples
+                pass
+            elif field.summary.max_number > 1:
                 shape.append(field.summary.max_number)
                 dimensions.append(field.name)
             variable_name = prefix + field.name
@@ -1414,6 +1418,31 @@ def flush_chunk(start, stop):
     return futures


+def generate_spec(columnarised, out):
+    pcvcf = PickleChunkedVcf.load(columnarised)
+    spec = ZarrConversionSpec.generate(pcvcf)
+    json.dump(spec.asdict(), out, indent=4)
+
+
+def to_zarr(
+    columnarised, zarr_path, conversion_spec, worker_processes=1, show_progress=False
+):
+    pcvcf = PickleChunkedVcf.load(columnarised)
+    if conversion_spec is None:
+        spec = ZarrConversionSpec.generate(pcvcf)
+    else:
+        with open(conversion_spec, "r") as f:
+            d = json.load(f)
+        spec = ZarrConversionSpec.fromdict(d)
+    SgvcfZarr.convert(
+        pcvcf,
+        zarr_path,
+        conversion_spec=spec,
+        worker_processes=worker_processes,
+        show_progress=True,
+    )
+
+
 def convert_vcf(
     vcfs,
     out_path,
@@ -1488,15 +1517,17 @@ def encode_bed_partition_genotypes(bed_path, zarr_path, start_variant, end_varia
     flush_futures(futures)


-def validate(vcf_path, zarr_path, show_progress):
+def validate(vcf_path, zarr_path, show_progress=False):
     store = zarr.DirectoryStore(zarr_path)

     root = zarr.group(store=store)
     pos = root["variant_position"][:]
     allele = root["variant_allele"][:]
     chrom = root["contig_id"][:][root["variant_contig"][:]]
     vid = root["variant_id"][:]
-    call_genotype = iter(root["call_genotype"])
+    call_genotype = None
+    if "call_genotype" in root:
+        call_genotype = iter(root["call_genotype"])

     vcf = cyvcf2.VCF(vcf_path)
     format_headers = {}
@@ -1526,7 +1557,10 @@ def validate(vcf_path, zarr_path, show_progress):
     start_index = np.searchsorted(pos, first_pos)
     assert pos[start_index] == first_pos
     vcf = cyvcf2.VCF(vcf_path)
-    iterator = tqdm.tqdm(vcf, total=vcf.num_records)
+    if show_progress:
+        iterator = tqdm.tqdm(vcf, total=vcf.num_records)
+    else:
+        iterator = vcf
     for j, row in enumerate(iterator, start_index):
         assert chrom[j] == row.CHROM
         assert pos[j] == row.POS
@@ -1537,16 +1571,19 @@ def validate(vcf_path, zarr_path, show_progress):
             assert np.all(allele[j, k + 1 :] == "")
         # TODO FILTERS

-        gt = row.genotype.array()
-        gt_zarr = next(call_genotype)
-        gt_vcf = gt[:, :-1]
-        # NOTE weirdly cyvcf2 seems to remap genotypes automatically
-        # into the same missing/pad encoding that sgkit uses.
-        # if np.any(gt_zarr < 0):
-        #     print("MISSING")
-        #     print(gt_zarr)
-        #     print(gt_vcf)
-        nt.assert_array_equal(gt_zarr, gt_vcf)
+        if call_genotype is None:
+            assert row.genotype is None
+        else:
+            gt = row.genotype.array()
+            gt_zarr = next(call_genotype)
+            gt_vcf = gt[:, :-1]
+            # NOTE weirdly cyvcf2 seems to remap genotypes automatically
+            # into the same missing/pad encoding that sgkit uses.
+            # if np.any(gt_zarr < 0):
+            #     print("MISSING")
+            #     print(gt_zarr)
+            #     print(gt_vcf)
+            nt.assert_array_equal(gt_zarr, gt_vcf)

         # TODO this is basically right, but the details about float padding
         # need to be worked out in particular. Need to find examples of
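
Taken together, the additions above move the spec-generation and Zarr-conversion logic into library functions (generate_spec, to_zarr) and let validate() handle VCFs without genotypes, with optional progress reporting. A rough sketch of how these pieces chain, with hypothetical file names and the signatures as defined in this diff:

    # Sketch only: file names are hypothetical.
    from bio2zarr import vcf

    # Write the Zarr conversion spec for an exploded (columnarised) VCF as JSON.
    with open("sample.spec", "w") as specfile:
        vcf.generate_spec("sample.exploded", specfile)

    # Convert to Zarr; passing conversion_spec=None regenerates the spec instead.
    vcf.to_zarr(
        "sample.exploded",
        "sample.zarr",
        conversion_spec="sample.spec",
        worker_processes=1,
    )

    # Cross-check the Zarr output against the original VCF.
    vcf.validate("sample.vcf.gz", "sample.zarr", show_progress=False)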

tests/data/vcf/sample.vcf.gz

954 Bytes
Binary file not shown.

tests/data/vcf/sample.vcf.gz.tbi

185 Bytes
Binary file not shown.

tests/data/vcf/sample_no_genotypes.vcf.gz

868 Bytes
Binary file not shown.

tests/data/vcf/sample_no_genotypes.vcf.gz.tbi

187 Bytes
Binary file not shown.

tests/test_vcf.py

Lines changed: 26 additions & 0 deletions
@@ -1,5 +1,6 @@
 import numpy as np
 import numpy.testing as nt
+import xarray.testing as xt
 import pytest
 import sgkit as sg

@@ -218,3 +219,28 @@ def test_call_HQ(self, ds):
             [[-1, -1], [-1, -1], [-1, -1]],
         ]
         nt.assert_array_equal(ds["call_HQ"], call_HQ)
+
+    def test_no_genotypes(self, ds, tmp_path):
+        path = "tests/data/vcf/sample_no_genotypes.vcf.gz"
+        out = tmp_path / "example.vcf.zarr"
+        vcf.convert_vcf([path], out)
+        ds2 = sg.load_dataset(out)
+        assert len(ds2["sample_id"]) == 0
+        for col in ds:
+            if col != "sample_id" and not col.startswith("call_"):
+                xt.assert_equal(ds[col], ds2[col])
+
+
+@pytest.mark.parametrize(
+    "name",
+    [
+        "sample.vcf.gz",
+        "sample_no_genotypes.vcf.gz",
+        # "info_field_type_combos.vcf.gz",
+    ],
+)
+def test_by_validating(name, tmp_path):
+    path = f"tests/data/vcf/{name}"
+    out = tmp_path / "test.zarr"
+    vcf.convert_vcf([path], out)
+    vcf.validate(path, out)

validation-data/Makefile

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
+all: 1kg_2020_chr20.bcf.csi \
+	1kg_2020_chr20.vcf.gz.tbi \
+	1kg_2020_chr20_annotations.vcf.gz.tbi \
+	1kg_2020_chr20_annotations.bcf.csi\
+	1kg_2020_others.bcf.csi \
+	1kg_2020_others.vcf.gz.tbi \
+	1kg_p3_all_chr1.bcf.csi \
+	1kg_p3_all_chr1.vcf.gz.tbi\
+	1kg_p1_all_chr6.vcf.bcf.csi\
+	1kg_p1_all_chr6.vcf.gz.tbi
+
+.PRECIOUS: %.bcf
+.PRECIOUS: %.vcf.gz
+
+# Modern 1000 genomes data
+1KG_2020_BASE=http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000G_2504_high_coverage/working/20201028_3202_raw_GT_with_annot/20201028_CCDG_14151_B01_GRM_WGS_2020-08-05_
+1KG_2020_GENOTYPES_URL="${1KG_2020_BASE}chr20.recalibrated_variants.vcf.gz"
+1KG_2020_ANNOTATIONS_URL="${1KG_2020_BASE}chr20.recalibrated_variants.annotated.vcf.gz"
+1KG_2020_OTHERS_URL="${1KG_2020_BASE}others.recalibrated_variants.vcf.gz"
+
+# 1000 genomes phase 3
+1KG_P3_ALL_URL=http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20190312_biallelic_SNV_and_INDEL/ALL.chr1.shapeit2_integrated_snvindels_v2a_27022019.GRCh38.phased.vcf.gz
+1KG_P3_SNP_URL=http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/1000_genomes_project/release/20181203_biallelic_SNV/ALL.chr1.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz
+
+# 1000 genomes phase 1
+1KG_P1_ALL_URL=http://ftp.1000genomes.ebi.ac.uk/vol1/ftp/phase1/analysis_results/integrated_call_sets/ALL.chr6.integrated_phase1_v3.20101123.snps_indels_svs.genotypes.vcf.gz
+
+
+%.vcf.gz.tbi: %.vcf.gz
+	tabix $<
+
+%.bcf.csi: %.bcf
+	bcftools index $<
+
+%.bcf: %.vcf.gz
+	bcftools view -O b $< > $@
+
+1kg_2020_chr20.vcf.gz:
+	bcftools view ${1KG_2020_GENOTYPES_URL} | head -n 10000 | bcftools view -O z -o $@
+
+1kg_2020_others.vcf.gz:
+	bcftools view ${1KG_2020_OTHERS_URL} | head -n 10000 | bcftools view -O z -o $@
+
+1kg_2020_chr20_annotations.vcf.gz:
+	bcftools view ${1KG_2020_ANNOTATIONS_URL} | head -n 100000 | bcftools view -O z -o $@
+
+1kg_p3_all_chr1.vcf.gz:
+	bcftools view ${1KG_P3_ALL_URL} | head -n 500000 | bcftools view -O z -o $@
+
+1kg_p3_snp_chr1.vcf.gz:
+	bcftools view ${1KG_P3_SNP_URL} | head -n 500000 | bcftools view -O z -o $@
+
+1kg_p1_all_chr6.vcf.gz:
+	bcftools view ${1KG_P1_ALL_URL} | head -n 10000 | bcftools view -O z -o $@
+
+clean:
+	rm -f *.vcf.gz *.tbi *.bcf *.csi

validation.py

Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,55 @@
+# Development script to automate running the validation tests.
+# These are large-scale tests that are not possible to run
+# under unit-test conditions.
+import pathlib
+import click
+
+import bio2zarr as bz
+
+# tmp, come up with better names and flatten hierarchy
+from bio2zarr import vcf
+
+
+@click.command
+@click.argument("vcfs", nargs=-1)
+@click.option("-p", "--worker-processes", type=int, default=1)
+@click.option("-f", "--force", is_flag=True, default=False)
+def cli(vcfs, worker_processes, force):
+
+    data_path = pathlib.Path("validation-data")
+    if len(vcfs) == 0:
+        vcfs = list(data_path.glob("*.vcf.gz")) + list(data_path.glob("*.bcf"))
+    else:
+        vcfs = [pathlib.Path(f) for f in vcfs]
+    tmp_path = pathlib.Path("validation-tmp")
+    tmp_path.mkdir(exist_ok=True)
+    for f in vcfs:
+        print(f)
+        exploded = tmp_path / (f.name + ".exploded")
+        if force or not exploded.exists():
+            bz.explode_vcf(
+                [f],
+                exploded,
+                worker_processes=worker_processes,
+                show_progress=True,
+            )
+        spec = tmp_path / (f.name + ".spec")
+        if force or not spec.exists():
+            with open(spec, "w") as specfile:
+                vcf.generate_spec(exploded, specfile)
+
+        zarr = tmp_path / (f.name + ".zarr")
+        if force or not zarr.exists():
+            vcf.to_zarr(
+                exploded,
+                zarr,
+                conversion_spec=spec,
+                worker_processes=worker_processes,
+                show_progress=True,
+            )
+
+        vcf.validate(f, zarr, show_progress=True)
+
+
+if __name__ == "__main__":
cli()

vcf2zarr.py

Lines changed: 5 additions & 17 deletions
@@ -34,11 +34,8 @@ def summarise(columnarised):
 @click.argument("columnarised", type=click.Path())
 # @click.argument("specfile", type=click.Path())
 def genspec(columnarised):
-    pcvcf = cnv.PickleChunkedVcf.load(columnarised)
-    spec = cnv.ZarrConversionSpec.generate(pcvcf)
-    # with open(specfile, "w") as f:
     stream = click.get_text_stream("stdout")
-    json.dump(spec.asdict(), stream, indent=4)
+    cnv.generate_spec(columnarised, stream)


 @click.command
@@ -47,21 +44,12 @@ def genspec(columnarised):
 @click.option("-s", "--conversion-spec", default=None)
 @click.option("-p", "--worker-processes", type=int, default=1)
 def to_zarr(columnarised, zarr_path, conversion_spec, worker_processes):
-    pcvcf = cnv.PickleChunkedVcf.load(columnarised)
-    if conversion_spec is None:
-        spec = cnv.ZarrConversionSpec.generate(pcvcf)
-    else:
-        with open(conversion_spec, "r") as f:
-            d = json.load(f)
-        spec = cnv.ZarrConversionSpec.fromdict(d)
-
-    cnv.SgvcfZarr.convert(
-        pcvcf,
+    cnv.to_zarr(
+        columnarised,
         zarr_path,
-        conversion_spec=spec,
+        conversion_spec,
         worker_processes=worker_processes,
-        show_progress=True,
-    )
+        show_progress=True)


 @click.command
