Skip to content

Commit 8b974de

Browse files
committed
Don't store the full VCF header in vcz; store only the non-redundant meta-information lines
1 parent 6680178 commit 8b974de

File tree

2 files changed

+29
-5
lines changed

2 files changed

+29
-5
lines changed

bio2zarr/vcf.py

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import math
77
import pathlib
88
import pickle
9+
import re
910
import shutil
1011
import sys
1112
import tempfile
@@ -971,8 +972,19 @@ def num_fields(self):
971972

972973
@property
973974
def root_attrs(self):
975+
meta_information_pattern = re.compile("##([^=]+)=(.*)")
976+
vcf_meta_information = []
977+
for line in self.vcf_header.split("\n"):
978+
match = re.fullmatch(meta_information_pattern, line)
979+
if match:
980+
key = match.group(1)
981+
if key in ("contig", "FILTER", "INFO", "FORMAT"):
982+
# these fields are stored in Zarr arrays
983+
continue
984+
value = match.group(2)
985+
vcf_meta_information.append((key, value))
974986
return {
975-
"vcf_header": self.vcf_header,
987+
"vcf_meta_information": vcf_meta_information,
976988
}
977989

978990
def iter_id(self, start, stop):

tests/test_vcf_examples.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@
22
import pathlib
33
import re
44

5-
import cyvcf2
65
import numpy as np
76
import numpy.testing as nt
87
import pytest
@@ -54,9 +53,22 @@ def test_filters(self, ds):
5453
],
5554
)
5655

57-
def test_header(self, ds):
58-
vcf = cyvcf2.VCF(self.data_path)
59-
assert ds.attrs["vcf_header"] == vcf.raw_header
56+
def test_vcf_meta_information(self, ds):
57+
assert ds.attrs["vcf_meta_information"] == [
58+
["fileformat", "VCFv4.0"],
59+
["fileDate", "20090805"],
60+
["source", "myImputationProgramV3.1"],
61+
["reference", "1000GenomesPilot-NCBI36"],
62+
["phasing", "partial"],
63+
["ALT", '<ID=DEL:ME:ALU,Description="Deletion of ALU element">'],
64+
["ALT", '<ID=CNV,Description="Copy number variable region">'],
65+
["bcftools_viewVersion", "1.11+htslib-1.11-4"],
66+
[
67+
"bcftools_viewCommand",
68+
"view -O b sample.vcf.gz; Date=Tue Feb 27 14:41:07 2024",
69+
],
70+
["bcftools_viewCommand", "view sample.bcf; Date=Wed Mar 27 11:42:16 2024"],
71+
]
6072

6173
def test_source(self, ds):
6274
assert ds.attrs["source"] == f"bio2zarr-{provenance.__version__}"

0 commit comments

Comments
 (0)