Skip to content

Commit 535687e

Browse files
Add VCF header and format version
Closes #14
1 parent 97977c4 commit 535687e

File tree

2 files changed

+22
-6
lines changed

2 files changed

+22
-6
lines changed

bio2zarr/vcf.py

Lines changed: 17 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -178,6 +178,7 @@ def make_field_def(name, vcf_type, vcf_number):
178178
def scan_vcfs(paths, show_progress):
179179
partitions = []
180180
vcf_metadata = None
181+
header = None
181182
logger.info(f"Scanning {len(paths)} VCFs")
182183
for path in tqdm.tqdm(paths, desc="Scan ", disable=not show_progress):
183184
vcf = cyvcf2.VCF(path)
@@ -215,6 +216,9 @@ def scan_vcfs(paths, show_progress):
215216

216217
if vcf_metadata is None:
217218
vcf_metadata = metadata
219+
# We just take the first header, assuming the others
220+
# are compatible.
221+
header = vcf.raw_header
218222
else:
219223
if metadata != vcf_metadata:
220224
raise ValueError("Incompatible VCF chunks")
@@ -230,7 +234,7 @@ def scan_vcfs(paths, show_progress):
230234
)
231235
partitions.sort(key=lambda x: x.first_position)
232236
vcf_metadata.partitions = partitions
233-
return vcf_metadata
237+
return vcf_metadata, header
234238

235239

236240
def sanitise_value_bool(buff, j, value):
@@ -668,9 +672,10 @@ def __exit__(self, exc_type, exc_val, exc_tb):
668672

669673

670674
class PickleChunkedVcf(collections.abc.Mapping):
671-
def __init__(self, path, metadata):
675+
def __init__(self, path, metadata, vcf_header):
672676
self.path = path
673677
self.metadata = metadata
678+
self.vcf_header = vcf_header
674679

675680
self.columns = {}
676681
for field in self.metadata.fields:
@@ -753,7 +758,9 @@ def load(path):
753758
path = pathlib.Path(path)
754759
with open(path / "metadata.json") as f:
755760
metadata = VcfMetadata.fromdict(json.load(f))
756-
return PickleChunkedVcf(path, metadata)
761+
with open(path / "header.txt") as f:
762+
header = f.read()
763+
return PickleChunkedVcf(path, metadata, header)
757764

758765
@staticmethod
759766
def convert_partition(
@@ -820,8 +827,8 @@ def convert(
820827
):
821828
out_path = pathlib.Path(out_path)
822829
# TODO make scan work in parallel using general progress code too
823-
vcf_metadata = scan_vcfs(vcfs, show_progress=show_progress)
824-
pcvcf = PickleChunkedVcf(out_path, vcf_metadata)
830+
vcf_metadata, header = scan_vcfs(vcfs, show_progress=show_progress)
831+
pcvcf = PickleChunkedVcf(out_path, vcf_metadata, header)
825832
pcvcf.mkdirs()
826833

827834
total_variants = sum(
@@ -855,6 +862,8 @@ def convert(
855862

856863
with open(out_path / "metadata.json", "w") as f:
857864
json.dump(vcf_metadata.asdict(), f, indent=4)
865+
with open(out_path / "header.txt", "w") as f:
866+
f.write(header)
858867
return pcvcf
859868

860869

@@ -1214,7 +1223,6 @@ def encode_contig(self, pcvcf, contig_names, contig_lengths):
12141223
logger.debug("Contig done")
12151224

12161225
def encode_filters(self, pcvcf, filter_names):
1217-
self.root.attrs["filters"] = filter_names
12181226
array = self.root.array(
12191227
"filter_id",
12201228
filter_names,
@@ -1277,6 +1285,9 @@ def convert(
12771285
for column in conversion_spec.columns.values():
12781286
sgvcf.create_array(column)
12791287

1288+
sgvcf.root.attrs["vcf_zarr_version"] = "0.2"
1289+
sgvcf.root.attrs["vcf_header"] = pcvcf.vcf_header
1290+
12801291
progress_config = core.ProgressConfig(
12811292
total=pcvcf.total_uncompressed_bytes,
12821293
title="Encode",

tests/test_vcf_examples.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import xarray.testing as xt
44
import pytest
55
import sgkit as sg
6+
import cyvcf2
67

78
from bio2zarr import vcf
89

@@ -33,6 +34,10 @@ def test_filters(self, ds):
3334
],
3435
)
3536

37+
def test_header(self, ds):
38+
vcf = cyvcf2.VCF(self.data_path)
39+
assert ds.attrs["vcf_header"] == vcf.raw_header
40+
3641
def test_contigs(self, ds):
3742
nt.assert_array_equal(ds["contig_id"], ["19", "20", "X"])
3843
assert "contig_length" not in ds

0 commit comments

Comments
 (0)