Skip to content

Commit 1c7faff

Browse files
Merge pull request #85 from jeromekelleher/check-json-format-versions
Add checking for metadata format versions
2 parents bae2f43 + 81d6733 commit 1c7faff

File tree

3 files changed

+74
-4
lines changed

3 files changed

+74
-4
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,3 +2,4 @@
22

33
- Merge 1D and 2D encode steps into one, and change rate reporting to bytes
44
- Add --max-memory for encode
5+
- Change `chunk_width` to `samples_chunk_size` and `chunk_length` to `variants_chunk_size`

bio2zarr/vcf.py

Lines changed: 18 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -142,6 +142,9 @@ class VcfPartition:
142142
num_records: int = -1
143143

144144

145+
VCF_METADATA_FORMAT_VERSION = "0.1"
146+
147+
145148
@dataclasses.dataclass
146149
class VcfMetadata:
147150
format_version: str
@@ -175,6 +178,11 @@ def num_records(self):
175178

176179
@staticmethod
177180
def fromdict(d):
181+
if d["format_version"] != VCF_METADATA_FORMAT_VERSION:
182+
raise ValueError(
183+
"Exploded metadata format version mismatch: "
184+
f"{d['format_version']} != {VCF_METADATA_FORMAT_VERSION}"
185+
)
178186
fields = [VcfField.fromdict(fd) for fd in d["fields"]]
179187
partitions = [VcfPartition(**pd) for pd in d["partitions"]]
180188
d = d.copy()
@@ -239,8 +247,7 @@ def scan_vcf(path, target_num_partitions):
239247
# TODO use the mapping dictionary
240248
fields=fields,
241249
partitions=[],
242-
# FIXME do something systematic with this
243-
format_version="0.1",
250+
format_version=VCF_METADATA_FORMAT_VERSION,
244251
)
245252
try:
246253
metadata.contig_lengths = vcf.seqlens
@@ -1060,6 +1067,9 @@ def __post_init__(self):
10601067
self.dimensions = tuple(self.dimensions)
10611068

10621069

1070+
ZARR_SCHEMA_FORMAT_VERSION = "0.2"
1071+
1072+
10631073
@dataclasses.dataclass
10641074
class ZarrConversionSpec:
10651075
format_version: str
@@ -1080,6 +1090,11 @@ def asjson(self):
10801090

10811091
@staticmethod
10821092
def fromdict(d):
1093+
if d["format_version"] != ZARR_SCHEMA_FORMAT_VERSION:
1094+
raise ValueError(
1095+
"Zarr schema format version mismatch: "
1096+
f"{d['format_version']} != {ZARR_SCHEMA_FORMAT_VERSION}"
1097+
)
10831098
ret = ZarrConversionSpec(**d)
10841099
ret.columns = {
10851100
key: ZarrColumnSpec(**value) for key, value in d["columns"].items()
@@ -1241,8 +1256,7 @@ def fixed_field_spec(
12411256
)
12421257

12431258
return ZarrConversionSpec(
1244-
# TODO do something systematic
1245-
format_version="0.1",
1259+
format_version=ZARR_SCHEMA_FORMAT_VERSION,
12461260
samples_chunk_size=samples_chunk_size,
12471261
variants_chunk_size=variants_chunk_size,
12481262
columns={col.name: col for col in colspecs},

tests/test_vcf.py

Lines changed: 55 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,55 @@
1+
import json
2+
3+
import pytest
4+
5+
6+
from bio2zarr import vcf
7+
8+
9+
@pytest.fixture(scope="module")
10+
def vcf_file():
11+
return "tests/data/vcf/sample.vcf.gz"
12+
13+
14+
@pytest.fixture(scope="module")
15+
def exploded_path(vcf_file, tmp_path_factory):
16+
out = tmp_path_factory.mktemp("data") / "example.exploded"
17+
vcf.explode([vcf_file], out)
18+
return out
19+
20+
21+
@pytest.fixture(scope="module")
22+
def schema_path(exploded_path, tmp_path_factory):
23+
out = tmp_path_factory.mktemp("data") / "example.schema.json"
24+
with open(out, "w") as f:
25+
vcf.mkschema(exploded_path, f)
26+
return out
27+
28+
29+
@pytest.fixture(scope="module")
30+
def zarr_path(exploded_path, tmp_path_factory):
31+
out = tmp_path_factory.mktemp("data") / "example.zarr"
32+
vcf.encode(exploded_path, out)
33+
return out
34+
35+
36+
class TestJsonVersions:
37+
@pytest.mark.parametrize("version", ["0.1", "1.0", "xxxxx", 0.2])
38+
def test_zarr_schema_mismatch(self, schema_path, version):
39+
with open(schema_path) as f:
40+
d = json.load(f)
41+
42+
d["format_version"] = version
43+
with pytest.raises(ValueError, match="Zarr schema format version mismatch"):
44+
vcf.ZarrConversionSpec.fromdict(d)
45+
46+
@pytest.mark.parametrize("version", ["0.0", "1.0", "xxxxx", 0.1])
47+
def test_exploded_metadata_mismatch(self, tmpdir, exploded_path, version):
48+
with open(exploded_path / "metadata.json", "r") as f:
49+
d = json.load(f)
50+
51+
d["format_version"] = version
52+
with pytest.raises(
53+
ValueError, match="Exploded metadata format version mismatch"
54+
):
55+
vcf.VcfMetadata.fromdict(d)

0 commit comments

Comments
 (0)