NOTE(review): this patch was recovered from a copy whose line breaks had been
collapsed. Leading whitespace on context lines is reconstructed from the Python
structure -- confirm against the target files before applying.
NOTE(review): the two expected "ALT" values in test_vcf_meta_information are
empty strings in the recovered copy; presumably angle-bracketed text was
stripped in transit -- confirm against the sample VCF header.

diff --git a/bio2zarr/vcf.py b/bio2zarr/vcf.py
index 0726dcdc..c64023af 100644
--- a/bio2zarr/vcf.py
+++ b/bio2zarr/vcf.py
@@ -6,6 +6,7 @@
 import math
 import pathlib
 import pickle
+import re
 import shutil
 import sys
 import tempfile
@@ -973,8 +974,25 @@ def num_fields(self):
 
     @property
     def root_attrs(self):
+        """Return the root attributes derived from the VCF header.
+
+        The header text is scanned for ``##key=value`` meta-information
+        lines; each match is kept, in header order, as a ``(key, value)``
+        pair under ``"vcf_meta_information"``.
+        """
+        # contig, FILTER, INFO and FORMAT lines are stored in Zarr arrays
+        skipped_keys = {"contig", "FILTER", "INFO", "FORMAT"}
+        meta_line = re.compile(r"##([^=]+)=(.*)")
+        vcf_meta_information = []
+        for line in self.vcf_header.split("\n"):
+            parsed = meta_line.fullmatch(line)
+            if parsed is None:
+                continue
+            key, value = parsed.groups()
+            if key not in skipped_keys:
+                vcf_meta_information.append((key, value))
         return {
-            "vcf_header": self.vcf_header,
+            "vcf_meta_information": vcf_meta_information,
         }
 
     def iter_id(self, start, stop):
diff --git a/bio2zarr/vcz.py b/bio2zarr/vcz.py
index 14d3ba19..28a60741 100644
--- a/bio2zarr/vcz.py
+++ b/bio2zarr/vcz.py
@@ -604,7 +604,7 @@ def init(
         root = zarr.open(store=self.path, mode="a", **zarr_utils.ZARR_FORMAT_KWARGS)
         root.attrs.update(
             {
-                "vcf_zarr_version": "0.2",
+                "vcf_zarr_version": "0.4",
                 "source": f"bio2zarr-{provenance.__version__}",
             }
         )
diff --git a/tests/test_vcf_examples.py b/tests/test_vcf_examples.py
index 7aaed7a3..3698f9ef 100644
--- a/tests/test_vcf_examples.py
+++ b/tests/test_vcf_examples.py
@@ -3,7 +3,6 @@
 import re
 from unittest import mock
 
-import cyvcf2
 import numpy as np
 import numpy.testing as nt
 import pytest
@@ -55,9 +54,22 @@ def test_filters(self, ds):
             ],
         )
 
-    def test_header(self, ds):
-        vcf = cyvcf2.VCF(self.data_path)
-        assert ds.attrs["vcf_header"] == vcf.raw_header
+    def test_vcf_meta_information(self, ds):
+        assert ds.attrs["vcf_meta_information"] == [
+            ["fileformat", "VCFv4.0"],
+            ["fileDate", "20090805"],
+            ["source", "myImputationProgramV3.1"],
+            ["reference", "1000GenomesPilot-NCBI36"],
+            ["phasing", "partial"],
+            ["ALT", ''],
+            ["ALT", ''],
+            ["bcftools_viewVersion", "1.11+htslib-1.11-4"],
+            [
+                "bcftools_viewCommand",
+                "view -O b sample.vcf.gz; Date=Tue Feb 27 14:41:07 2024",
+            ],
+            ["bcftools_viewCommand", "view sample.bcf; Date=Wed Mar 27 11:42:16 2024"],
+        ]
 
     def test_source(self, ds):
         assert ds.attrs["source"] == f"bio2zarr-{provenance.__version__}"