Skip to content

Commit c5e5a97

Browse files
tomwhite authored and jeromekelleher committed
Don't store full VCF header in vcz, only non-redundant meta information lines
1 parent 5d2bd3b commit c5e5a97

File tree

2 files changed

+29
-5
lines changed

2 files changed

+29
-5
lines changed

bio2zarr/vcf.py

Lines changed: 13 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,7 @@
66
import math
77
import pathlib
88
import pickle
9+
import re
910
import shutil
1011
import sys
1112
import tempfile
@@ -973,8 +974,19 @@ def num_fields(self):
973974

974975
@property
def root_attrs(self):
    """Return the root-level attributes derived from the VCF header.

    Only the ``##key=value`` meta-information lines of ``self.vcf_header``
    are kept. They are returned, in header order, as a list of
    ``(key, value)`` tuples under the ``"vcf_meta_information"`` key.
    Lines that do not have the ``##key=value`` shape (for example the
    ``#CHROM`` column line or blank lines) are ignored, and ``contig``,
    ``FILTER``, ``INFO`` and ``FORMAT`` lines are skipped.

    Returns:
        dict: ``{"vcf_meta_information": [(key, value), ...]}``.
    """
    # these fields are stored in Zarr arrays
    stored_in_arrays = {"contig", "FILTER", "INFO", "FORMAT"}
    meta_information_pattern = re.compile(r"##([^=]+)=(.*)")
    vcf_meta_information = []
    for line in self.vcf_header.split("\n"):
        # Tolerate CRLF line endings: "." matches "\r", so without the
        # strip a carriage return would leak into the stored value.
        match = meta_information_pattern.fullmatch(line.rstrip("\r"))
        if match is None:
            continue
        key, value = match.groups()
        if key in stored_in_arrays:
            continue
        vcf_meta_information.append((key, value))
    return {
        "vcf_meta_information": vcf_meta_information,
    }
979991

980992
def iter_id(self, start, stop):

tests/test_vcf_examples.py

Lines changed: 16 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,6 @@
33
import re
44
from unittest import mock
55

6-
import cyvcf2
76
import numpy as np
87
import numpy.testing as nt
98
import pytest
@@ -55,9 +54,22 @@ def test_filters(self, ds):
5554
],
5655
)
5756

58-
def test_header(self, ds):
59-
vcf = cyvcf2.VCF(self.data_path)
60-
assert ds.attrs["vcf_header"] == vcf.raw_header
57+
def test_vcf_meta_information(self, ds):
    """Check the ``vcf_meta_information`` attribute of the example dataset.

    The expected value lists the header's key/value pairs in order; note
    that it contains no contig, FILTER, INFO or FORMAT entries.
    """
    expected = [
        ["fileformat", "VCFv4.0"],
        ["fileDate", "20090805"],
        ["source", "myImputationProgramV3.1"],
        ["reference", "1000GenomesPilot-NCBI36"],
        ["phasing", "partial"],
        ["ALT", '<ID=DEL:ME:ALU,Description="Deletion of ALU element">'],
        ["ALT", '<ID=CNV,Description="Copy number variable region">'],
        ["bcftools_viewVersion", "1.11+htslib-1.11-4"],
        [
            "bcftools_viewCommand",
            "view -O b sample.vcf.gz; Date=Tue Feb 27 14:41:07 2024",
        ],
        ["bcftools_viewCommand", "view sample.bcf; Date=Wed Mar 27 11:42:16 2024"],
    ]
    assert ds.attrs["vcf_meta_information"] == expected
6173

6274
def test_source(self, ds):
6375
assert ds.attrs["source"] == f"bio2zarr-{provenance.__version__}"

0 commit comments

Comments
 (0)